]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
softfloat: Factor out RoundAndPackFloat16 and NormalizeFloat16Subnormal
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
158142c2
FB
6
7/*============================================================================
8
9This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10Package, Release 2b.
11
12Written by John R. Hauser. This work was made possible in part by the
13International Computer Science Institute, located at Suite 600, 1947 Center
14Street, Berkeley, California 94704. Funding was partially provided by the
15National Science Foundation under grant MIP-9311980. The original version
16of this code was written as part of a project to build a fixed-point vector
17processor in collaboration with the University of California at Berkeley,
18overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20arithmetic/SoftFloat.html'.
21
22THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31Derivative works are acceptable, even for commercial purposes, so long as
32(1) the source code for the derivative work includes prominent notice that
33the work is derivative, and (2) the source code includes prominent notice with
34these four paragraphs for those parts of this code that are retained.
35
36=============================================================================*/
37
2ac8bd03
PM
38/* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
40 */
41#include "config.h"
42
6b4c305c 43#include "fpu/softfloat.h"
158142c2
FB
44
45/*----------------------------------------------------------------------------
46| Primitive arithmetic functions, including multi-word arithmetic, and
47| division and square root approximations. (Can be specialized to target if
48| desired.)
49*----------------------------------------------------------------------------*/
50#include "softfloat-macros.h"
51
52/*----------------------------------------------------------------------------
53| Functions and definitions to determine: (1) whether tininess for underflow
54| is detected before or after rounding by default, (2) what (if anything)
55| happens when exceptions are raised, (3) how signaling NaNs are distinguished
56| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57| are propagated from function inputs to output. These details are target-
58| specific.
59*----------------------------------------------------------------------------*/
60#include "softfloat-specialize.h"
61
bb4d4bb3
PM
62/*----------------------------------------------------------------------------
63| Returns the fraction bits of the half-precision floating-point value `a'.
64*----------------------------------------------------------------------------*/
65
66INLINE uint32_t extractFloat16Frac(float16 a)
67{
68 return float16_val(a) & 0x3ff;
69}
70
71/*----------------------------------------------------------------------------
72| Returns the exponent bits of the half-precision floating-point value `a'.
73*----------------------------------------------------------------------------*/
74
94a49d86 75INLINE int_fast16_t extractFloat16Exp(float16 a)
bb4d4bb3
PM
76{
77 return (float16_val(a) >> 10) & 0x1f;
78}
79
80/*----------------------------------------------------------------------------
81| Returns the sign bit of the single-precision floating-point value `a'.
82*----------------------------------------------------------------------------*/
83
84INLINE flag extractFloat16Sign(float16 a)
85{
86 return float16_val(a)>>15;
87}
88
158142c2
FB
89/*----------------------------------------------------------------------------
90| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
91| and 7, and returns the properly rounded 32-bit integer corresponding to the
92| input. If `zSign' is 1, the input is negated before being converted to an
93| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
94| is simply rounded to an integer, with the inexact exception raised if the
95| input cannot be represented exactly as an integer. However, if the fixed-
96| point input is too large, the invalid exception is raised and the largest
97| positive or negative integer is returned.
98*----------------------------------------------------------------------------*/
99
bb98fe42 100static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
158142c2
FB
101{
102 int8 roundingMode;
103 flag roundNearestEven;
104 int8 roundIncrement, roundBits;
760e1416 105 int32_t z;
158142c2
FB
106
107 roundingMode = STATUS(float_rounding_mode);
108 roundNearestEven = ( roundingMode == float_round_nearest_even );
109 roundIncrement = 0x40;
110 if ( ! roundNearestEven ) {
111 if ( roundingMode == float_round_to_zero ) {
112 roundIncrement = 0;
113 }
114 else {
115 roundIncrement = 0x7F;
116 if ( zSign ) {
117 if ( roundingMode == float_round_up ) roundIncrement = 0;
118 }
119 else {
120 if ( roundingMode == float_round_down ) roundIncrement = 0;
121 }
122 }
123 }
124 roundBits = absZ & 0x7F;
125 absZ = ( absZ + roundIncrement )>>7;
126 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
127 z = absZ;
128 if ( zSign ) z = - z;
129 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
130 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 131 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
132 }
133 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
134 return z;
135
136}
137
138/*----------------------------------------------------------------------------
139| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
140| `absZ1', with binary point between bits 63 and 64 (between the input words),
141| and returns the properly rounded 64-bit integer corresponding to the input.
142| If `zSign' is 1, the input is negated before being converted to an integer.
143| Ordinarily, the fixed-point input is simply rounded to an integer, with
144| the inexact exception raised if the input cannot be represented exactly as
145| an integer. However, if the fixed-point input is too large, the invalid
146| exception is raised and the largest positive or negative integer is
147| returned.
148*----------------------------------------------------------------------------*/
149
bb98fe42 150static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
158142c2
FB
151{
152 int8 roundingMode;
153 flag roundNearestEven, increment;
760e1416 154 int64_t z;
158142c2
FB
155
156 roundingMode = STATUS(float_rounding_mode);
157 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 158 increment = ( (int64_t) absZ1 < 0 );
158142c2
FB
159 if ( ! roundNearestEven ) {
160 if ( roundingMode == float_round_to_zero ) {
161 increment = 0;
162 }
163 else {
164 if ( zSign ) {
165 increment = ( roundingMode == float_round_down ) && absZ1;
166 }
167 else {
168 increment = ( roundingMode == float_round_up ) && absZ1;
169 }
170 }
171 }
172 if ( increment ) {
173 ++absZ0;
174 if ( absZ0 == 0 ) goto overflow;
bb98fe42 175 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
176 }
177 z = absZ0;
178 if ( zSign ) z = - z;
179 if ( z && ( ( z < 0 ) ^ zSign ) ) {
180 overflow:
181 float_raise( float_flag_invalid STATUS_VAR);
182 return
bb98fe42 183 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
184 : LIT64( 0x7FFFFFFFFFFFFFFF );
185 }
186 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
187 return z;
188
189}
190
fb3ea83a
TM
191/*----------------------------------------------------------------------------
192| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
193| `absZ1', with binary point between bits 63 and 64 (between the input words),
194| and returns the properly rounded 64-bit unsigned integer corresponding to the
195| input. Ordinarily, the fixed-point input is simply rounded to an integer,
196| with the inexact exception raised if the input cannot be represented exactly
197| as an integer. However, if the fixed-point input is too large, the invalid
198| exception is raised and the largest unsigned integer is returned.
199*----------------------------------------------------------------------------*/
200
201static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
202 uint64_t absZ1 STATUS_PARAM)
203{
204 int8 roundingMode;
205 flag roundNearestEven, increment;
206
207 roundingMode = STATUS(float_rounding_mode);
208 roundNearestEven = (roundingMode == float_round_nearest_even);
209 increment = ((int64_t)absZ1 < 0);
210 if (!roundNearestEven) {
211 if (roundingMode == float_round_to_zero) {
212 increment = 0;
213 } else if (absZ1) {
214 if (zSign) {
215 increment = (roundingMode == float_round_down) && absZ1;
216 } else {
217 increment = (roundingMode == float_round_up) && absZ1;
218 }
219 }
220 }
221 if (increment) {
222 ++absZ0;
223 if (absZ0 == 0) {
224 float_raise(float_flag_invalid STATUS_VAR);
225 return LIT64(0xFFFFFFFFFFFFFFFF);
226 }
227 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
228 }
229
230 if (zSign && absZ0) {
231 float_raise(float_flag_invalid STATUS_VAR);
232 return 0;
233 }
234
235 if (absZ1) {
236 STATUS(float_exception_flags) |= float_flag_inexact;
237 }
238 return absZ0;
239}
240
158142c2
FB
241/*----------------------------------------------------------------------------
242| Returns the fraction bits of the single-precision floating-point value `a'.
243*----------------------------------------------------------------------------*/
244
bb98fe42 245INLINE uint32_t extractFloat32Frac( float32 a )
158142c2
FB
246{
247
f090c9d4 248 return float32_val(a) & 0x007FFFFF;
158142c2
FB
249
250}
251
252/*----------------------------------------------------------------------------
253| Returns the exponent bits of the single-precision floating-point value `a'.
254*----------------------------------------------------------------------------*/
255
94a49d86 256INLINE int_fast16_t extractFloat32Exp(float32 a)
158142c2
FB
257{
258
f090c9d4 259 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
260
261}
262
263/*----------------------------------------------------------------------------
264| Returns the sign bit of the single-precision floating-point value `a'.
265*----------------------------------------------------------------------------*/
266
267INLINE flag extractFloat32Sign( float32 a )
268{
269
f090c9d4 270 return float32_val(a)>>31;
158142c2
FB
271
272}
273
37d18660
PM
274/*----------------------------------------------------------------------------
275| If `a' is denormal and we are in flush-to-zero mode then set the
276| input-denormal exception and return zero. Otherwise just return the value.
277*----------------------------------------------------------------------------*/
278static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
279{
280 if (STATUS(flush_inputs_to_zero)) {
281 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
282 float_raise(float_flag_input_denormal STATUS_VAR);
283 return make_float32(float32_val(a) & 0x80000000);
284 }
285 }
286 return a;
287}
288
158142c2
FB
289/*----------------------------------------------------------------------------
290| Normalizes the subnormal single-precision floating-point value represented
291| by the denormalized significand `aSig'. The normalized exponent and
292| significand are stored at the locations pointed to by `zExpPtr' and
293| `zSigPtr', respectively.
294*----------------------------------------------------------------------------*/
295
296static void
94a49d86 297 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
158142c2
FB
298{
299 int8 shiftCount;
300
301 shiftCount = countLeadingZeros32( aSig ) - 8;
302 *zSigPtr = aSig<<shiftCount;
303 *zExpPtr = 1 - shiftCount;
304
305}
306
307/*----------------------------------------------------------------------------
308| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
309| single-precision floating-point value, returning the result. After being
310| shifted into the proper positions, the three fields are simply added
311| together to form the result. This means that any integer portion of `zSig'
312| will be added into the exponent. Since a properly normalized significand
313| will have an integer portion equal to 1, the `zExp' input should be 1 less
314| than the desired result exponent whenever `zSig' is a complete, normalized
315| significand.
316*----------------------------------------------------------------------------*/
317
94a49d86 318INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
158142c2
FB
319{
320
f090c9d4 321 return make_float32(
bb98fe42 322 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
323
324}
325
326/*----------------------------------------------------------------------------
327| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
328| and significand `zSig', and returns the proper single-precision floating-
329| point value corresponding to the abstract input. Ordinarily, the abstract
330| value is simply rounded and packed into the single-precision format, with
331| the inexact exception raised if the abstract input cannot be represented
332| exactly. However, if the abstract value is too large, the overflow and
333| inexact exceptions are raised and an infinity or maximal finite value is
334| returned. If the abstract value is too small, the input value is rounded to
335| a subnormal number, and the underflow and inexact exceptions are raised if
336| the abstract input cannot be represented exactly as a subnormal single-
337| precision floating-point number.
338| The input significand `zSig' has its binary point between bits 30
339| and 29, which is 7 bits to the left of the usual location. This shifted
340| significand must be normalized or smaller. If `zSig' is not normalized,
341| `zExp' must be 0; in that case, the result returned is a subnormal number,
342| and it must not require rounding. In the usual case that `zSig' is
343| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
344| The handling of underflow and overflow follows the IEC/IEEE Standard for
345| Binary Floating-Point Arithmetic.
346*----------------------------------------------------------------------------*/
347
94a49d86 348static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
349{
350 int8 roundingMode;
351 flag roundNearestEven;
352 int8 roundIncrement, roundBits;
353 flag isTiny;
354
355 roundingMode = STATUS(float_rounding_mode);
356 roundNearestEven = ( roundingMode == float_round_nearest_even );
357 roundIncrement = 0x40;
358 if ( ! roundNearestEven ) {
359 if ( roundingMode == float_round_to_zero ) {
360 roundIncrement = 0;
361 }
362 else {
363 roundIncrement = 0x7F;
364 if ( zSign ) {
365 if ( roundingMode == float_round_up ) roundIncrement = 0;
366 }
367 else {
368 if ( roundingMode == float_round_down ) roundIncrement = 0;
369 }
370 }
371 }
372 roundBits = zSig & 0x7F;
bb98fe42 373 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
374 if ( ( 0xFD < zExp )
375 || ( ( zExp == 0xFD )
bb98fe42 376 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
377 ) {
378 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 379 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
380 }
381 if ( zExp < 0 ) {
e6afc87f
PM
382 if (STATUS(flush_to_zero)) {
383 float_raise(float_flag_output_denormal STATUS_VAR);
384 return packFloat32(zSign, 0, 0);
385 }
158142c2
FB
386 isTiny =
387 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
388 || ( zExp < -1 )
389 || ( zSig + roundIncrement < 0x80000000 );
390 shift32RightJamming( zSig, - zExp, &zSig );
391 zExp = 0;
392 roundBits = zSig & 0x7F;
393 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
394 }
395 }
396 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
397 zSig = ( zSig + roundIncrement )>>7;
398 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
399 if ( zSig == 0 ) zExp = 0;
400 return packFloat32( zSign, zExp, zSig );
401
402}
403
404/*----------------------------------------------------------------------------
405| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
406| and significand `zSig', and returns the proper single-precision floating-
407| point value corresponding to the abstract input. This routine is just like
408| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
409| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
410| floating-point exponent.
411*----------------------------------------------------------------------------*/
412
413static float32
94a49d86 414 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
415{
416 int8 shiftCount;
417
418 shiftCount = countLeadingZeros32( zSig ) - 1;
419 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
420
421}
422
423/*----------------------------------------------------------------------------
424| Returns the fraction bits of the double-precision floating-point value `a'.
425*----------------------------------------------------------------------------*/
426
bb98fe42 427INLINE uint64_t extractFloat64Frac( float64 a )
158142c2
FB
428{
429
f090c9d4 430 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
431
432}
433
434/*----------------------------------------------------------------------------
435| Returns the exponent bits of the double-precision floating-point value `a'.
436*----------------------------------------------------------------------------*/
437
94a49d86 438INLINE int_fast16_t extractFloat64Exp(float64 a)
158142c2
FB
439{
440
f090c9d4 441 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
442
443}
444
445/*----------------------------------------------------------------------------
446| Returns the sign bit of the double-precision floating-point value `a'.
447*----------------------------------------------------------------------------*/
448
449INLINE flag extractFloat64Sign( float64 a )
450{
451
f090c9d4 452 return float64_val(a)>>63;
158142c2
FB
453
454}
455
37d18660
PM
456/*----------------------------------------------------------------------------
457| If `a' is denormal and we are in flush-to-zero mode then set the
458| input-denormal exception and return zero. Otherwise just return the value.
459*----------------------------------------------------------------------------*/
460static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
461{
462 if (STATUS(flush_inputs_to_zero)) {
463 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
464 float_raise(float_flag_input_denormal STATUS_VAR);
465 return make_float64(float64_val(a) & (1ULL << 63));
466 }
467 }
468 return a;
469}
470
158142c2
FB
471/*----------------------------------------------------------------------------
472| Normalizes the subnormal double-precision floating-point value represented
473| by the denormalized significand `aSig'. The normalized exponent and
474| significand are stored at the locations pointed to by `zExpPtr' and
475| `zSigPtr', respectively.
476*----------------------------------------------------------------------------*/
477
478static void
94a49d86 479 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
158142c2
FB
480{
481 int8 shiftCount;
482
483 shiftCount = countLeadingZeros64( aSig ) - 11;
484 *zSigPtr = aSig<<shiftCount;
485 *zExpPtr = 1 - shiftCount;
486
487}
488
489/*----------------------------------------------------------------------------
490| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
491| double-precision floating-point value, returning the result. After being
492| shifted into the proper positions, the three fields are simply added
493| together to form the result. This means that any integer portion of `zSig'
494| will be added into the exponent. Since a properly normalized significand
495| will have an integer portion equal to 1, the `zExp' input should be 1 less
496| than the desired result exponent whenever `zSig' is a complete, normalized
497| significand.
498*----------------------------------------------------------------------------*/
499
94a49d86 500INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
158142c2
FB
501{
502
f090c9d4 503 return make_float64(
bb98fe42 504 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
505
506}
507
508/*----------------------------------------------------------------------------
509| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
510| and significand `zSig', and returns the proper double-precision floating-
511| point value corresponding to the abstract input. Ordinarily, the abstract
512| value is simply rounded and packed into the double-precision format, with
513| the inexact exception raised if the abstract input cannot be represented
514| exactly. However, if the abstract value is too large, the overflow and
515| inexact exceptions are raised and an infinity or maximal finite value is
516| returned. If the abstract value is too small, the input value is rounded
517| to a subnormal number, and the underflow and inexact exceptions are raised
518| if the abstract input cannot be represented exactly as a subnormal double-
519| precision floating-point number.
520| The input significand `zSig' has its binary point between bits 62
521| and 61, which is 10 bits to the left of the usual location. This shifted
522| significand must be normalized or smaller. If `zSig' is not normalized,
523| `zExp' must be 0; in that case, the result returned is a subnormal number,
524| and it must not require rounding. In the usual case that `zSig' is
525| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
526| The handling of underflow and overflow follows the IEC/IEEE Standard for
527| Binary Floating-Point Arithmetic.
528*----------------------------------------------------------------------------*/
529
94a49d86 530static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
531{
532 int8 roundingMode;
533 flag roundNearestEven;
94a49d86 534 int_fast16_t roundIncrement, roundBits;
158142c2
FB
535 flag isTiny;
536
537 roundingMode = STATUS(float_rounding_mode);
538 roundNearestEven = ( roundingMode == float_round_nearest_even );
539 roundIncrement = 0x200;
540 if ( ! roundNearestEven ) {
541 if ( roundingMode == float_round_to_zero ) {
542 roundIncrement = 0;
543 }
544 else {
545 roundIncrement = 0x3FF;
546 if ( zSign ) {
547 if ( roundingMode == float_round_up ) roundIncrement = 0;
548 }
549 else {
550 if ( roundingMode == float_round_down ) roundIncrement = 0;
551 }
552 }
553 }
554 roundBits = zSig & 0x3FF;
bb98fe42 555 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
556 if ( ( 0x7FD < zExp )
557 || ( ( zExp == 0x7FD )
bb98fe42 558 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
559 ) {
560 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 561 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
562 }
563 if ( zExp < 0 ) {
e6afc87f
PM
564 if (STATUS(flush_to_zero)) {
565 float_raise(float_flag_output_denormal STATUS_VAR);
566 return packFloat64(zSign, 0, 0);
567 }
158142c2
FB
568 isTiny =
569 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
570 || ( zExp < -1 )
571 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
572 shift64RightJamming( zSig, - zExp, &zSig );
573 zExp = 0;
574 roundBits = zSig & 0x3FF;
575 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
576 }
577 }
578 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
579 zSig = ( zSig + roundIncrement )>>10;
580 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
581 if ( zSig == 0 ) zExp = 0;
582 return packFloat64( zSign, zExp, zSig );
583
584}
585
586/*----------------------------------------------------------------------------
587| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
588| and significand `zSig', and returns the proper double-precision floating-
589| point value corresponding to the abstract input. This routine is just like
590| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
591| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
592| floating-point exponent.
593*----------------------------------------------------------------------------*/
594
595static float64
94a49d86 596 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
597{
598 int8 shiftCount;
599
600 shiftCount = countLeadingZeros64( zSig ) - 1;
601 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
602
603}
604
158142c2
FB
605/*----------------------------------------------------------------------------
606| Returns the fraction bits of the extended double-precision floating-point
607| value `a'.
608*----------------------------------------------------------------------------*/
609
bb98fe42 610INLINE uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
611{
612
613 return a.low;
614
615}
616
617/*----------------------------------------------------------------------------
618| Returns the exponent bits of the extended double-precision floating-point
619| value `a'.
620*----------------------------------------------------------------------------*/
621
622INLINE int32 extractFloatx80Exp( floatx80 a )
623{
624
625 return a.high & 0x7FFF;
626
627}
628
629/*----------------------------------------------------------------------------
630| Returns the sign bit of the extended double-precision floating-point value
631| `a'.
632*----------------------------------------------------------------------------*/
633
634INLINE flag extractFloatx80Sign( floatx80 a )
635{
636
637 return a.high>>15;
638
639}
640
641/*----------------------------------------------------------------------------
642| Normalizes the subnormal extended double-precision floating-point value
643| represented by the denormalized significand `aSig'. The normalized exponent
644| and significand are stored at the locations pointed to by `zExpPtr' and
645| `zSigPtr', respectively.
646*----------------------------------------------------------------------------*/
647
648static void
bb98fe42 649 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
650{
651 int8 shiftCount;
652
653 shiftCount = countLeadingZeros64( aSig );
654 *zSigPtr = aSig<<shiftCount;
655 *zExpPtr = 1 - shiftCount;
656
657}
658
659/*----------------------------------------------------------------------------
660| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
661| extended double-precision floating-point value, returning the result.
662*----------------------------------------------------------------------------*/
663
bb98fe42 664INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
158142c2
FB
665{
666 floatx80 z;
667
668 z.low = zSig;
bb98fe42 669 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
670 return z;
671
672}
673
674/*----------------------------------------------------------------------------
675| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
676| and extended significand formed by the concatenation of `zSig0' and `zSig1',
677| and returns the proper extended double-precision floating-point value
678| corresponding to the abstract input. Ordinarily, the abstract value is
679| rounded and packed into the extended double-precision format, with the
680| inexact exception raised if the abstract input cannot be represented
681| exactly. However, if the abstract value is too large, the overflow and
682| inexact exceptions are raised and an infinity or maximal finite value is
683| returned. If the abstract value is too small, the input value is rounded to
684| a subnormal number, and the underflow and inexact exceptions are raised if
685| the abstract input cannot be represented exactly as a subnormal extended
686| double-precision floating-point number.
687| If `roundingPrecision' is 32 or 64, the result is rounded to the same
688| number of bits as single or double precision, respectively. Otherwise, the
689| result is rounded to the full precision of the extended double-precision
690| format.
691| The input significand must be normalized or smaller. If the input
692| significand is not normalized, `zExp' must be 0; in that case, the result
693| returned is a subnormal number, and it must not require rounding. The
694| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
695| Floating-Point Arithmetic.
696*----------------------------------------------------------------------------*/
697
698static floatx80
699 roundAndPackFloatx80(
bb98fe42 700 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
701 STATUS_PARAM)
702{
703 int8 roundingMode;
704 flag roundNearestEven, increment, isTiny;
705 int64 roundIncrement, roundMask, roundBits;
706
707 roundingMode = STATUS(float_rounding_mode);
708 roundNearestEven = ( roundingMode == float_round_nearest_even );
709 if ( roundingPrecision == 80 ) goto precision80;
710 if ( roundingPrecision == 64 ) {
711 roundIncrement = LIT64( 0x0000000000000400 );
712 roundMask = LIT64( 0x00000000000007FF );
713 }
714 else if ( roundingPrecision == 32 ) {
715 roundIncrement = LIT64( 0x0000008000000000 );
716 roundMask = LIT64( 0x000000FFFFFFFFFF );
717 }
718 else {
719 goto precision80;
720 }
721 zSig0 |= ( zSig1 != 0 );
722 if ( ! roundNearestEven ) {
723 if ( roundingMode == float_round_to_zero ) {
724 roundIncrement = 0;
725 }
726 else {
727 roundIncrement = roundMask;
728 if ( zSign ) {
729 if ( roundingMode == float_round_up ) roundIncrement = 0;
730 }
731 else {
732 if ( roundingMode == float_round_down ) roundIncrement = 0;
733 }
734 }
735 }
736 roundBits = zSig0 & roundMask;
bb98fe42 737 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
738 if ( ( 0x7FFE < zExp )
739 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
740 ) {
741 goto overflow;
742 }
743 if ( zExp <= 0 ) {
e6afc87f
PM
744 if (STATUS(flush_to_zero)) {
745 float_raise(float_flag_output_denormal STATUS_VAR);
746 return packFloatx80(zSign, 0, 0);
747 }
158142c2
FB
748 isTiny =
749 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
750 || ( zExp < 0 )
751 || ( zSig0 <= zSig0 + roundIncrement );
752 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
753 zExp = 0;
754 roundBits = zSig0 & roundMask;
755 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
756 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
757 zSig0 += roundIncrement;
bb98fe42 758 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
759 roundIncrement = roundMask + 1;
760 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
761 roundMask |= roundIncrement;
762 }
763 zSig0 &= ~ roundMask;
764 return packFloatx80( zSign, zExp, zSig0 );
765 }
766 }
767 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
768 zSig0 += roundIncrement;
769 if ( zSig0 < roundIncrement ) {
770 ++zExp;
771 zSig0 = LIT64( 0x8000000000000000 );
772 }
773 roundIncrement = roundMask + 1;
774 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
775 roundMask |= roundIncrement;
776 }
777 zSig0 &= ~ roundMask;
778 if ( zSig0 == 0 ) zExp = 0;
779 return packFloatx80( zSign, zExp, zSig0 );
780 precision80:
bb98fe42 781 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
782 if ( ! roundNearestEven ) {
783 if ( roundingMode == float_round_to_zero ) {
784 increment = 0;
785 }
786 else {
787 if ( zSign ) {
788 increment = ( roundingMode == float_round_down ) && zSig1;
789 }
790 else {
791 increment = ( roundingMode == float_round_up ) && zSig1;
792 }
793 }
794 }
bb98fe42 795 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
796 if ( ( 0x7FFE < zExp )
797 || ( ( zExp == 0x7FFE )
798 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
799 && increment
800 )
801 ) {
802 roundMask = 0;
803 overflow:
804 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
805 if ( ( roundingMode == float_round_to_zero )
806 || ( zSign && ( roundingMode == float_round_up ) )
807 || ( ! zSign && ( roundingMode == float_round_down ) )
808 ) {
809 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
810 }
811 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
812 }
813 if ( zExp <= 0 ) {
814 isTiny =
815 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
816 || ( zExp < 0 )
817 || ! increment
818 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
819 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
820 zExp = 0;
821 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
822 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
823 if ( roundNearestEven ) {
bb98fe42 824 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
825 }
826 else {
827 if ( zSign ) {
828 increment = ( roundingMode == float_round_down ) && zSig1;
829 }
830 else {
831 increment = ( roundingMode == float_round_up ) && zSig1;
832 }
833 }
834 if ( increment ) {
835 ++zSig0;
836 zSig0 &=
bb98fe42
AF
837 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
838 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
839 }
840 return packFloatx80( zSign, zExp, zSig0 );
841 }
842 }
843 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
844 if ( increment ) {
845 ++zSig0;
846 if ( zSig0 == 0 ) {
847 ++zExp;
848 zSig0 = LIT64( 0x8000000000000000 );
849 }
850 else {
bb98fe42 851 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
852 }
853 }
854 else {
855 if ( zSig0 == 0 ) zExp = 0;
856 }
857 return packFloatx80( zSign, zExp, zSig0 );
858
859}
860
861/*----------------------------------------------------------------------------
862| Takes an abstract floating-point value having sign `zSign', exponent
863| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
864| and returns the proper extended double-precision floating-point value
865| corresponding to the abstract input. This routine is just like
866| `roundAndPackFloatx80' except that the input significand does not have to be
867| normalized.
868*----------------------------------------------------------------------------*/
869
870static floatx80
871 normalizeRoundAndPackFloatx80(
bb98fe42 872 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
873 STATUS_PARAM)
874{
875 int8 shiftCount;
876
877 if ( zSig0 == 0 ) {
878 zSig0 = zSig1;
879 zSig1 = 0;
880 zExp -= 64;
881 }
882 shiftCount = countLeadingZeros64( zSig0 );
883 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
884 zExp -= shiftCount;
885 return
886 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
887
888}
889
158142c2
FB
890/*----------------------------------------------------------------------------
891| Returns the least-significant 64 fraction bits of the quadruple-precision
892| floating-point value `a'.
893*----------------------------------------------------------------------------*/
894
bb98fe42 895INLINE uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
896{
897
898 return a.low;
899
900}
901
902/*----------------------------------------------------------------------------
903| Returns the most-significant 48 fraction bits of the quadruple-precision
904| floating-point value `a'.
905*----------------------------------------------------------------------------*/
906
bb98fe42 907INLINE uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
908{
909
910 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
911
912}
913
914/*----------------------------------------------------------------------------
915| Returns the exponent bits of the quadruple-precision floating-point value
916| `a'.
917*----------------------------------------------------------------------------*/
918
919INLINE int32 extractFloat128Exp( float128 a )
920{
921
922 return ( a.high>>48 ) & 0x7FFF;
923
924}
925
926/*----------------------------------------------------------------------------
927| Returns the sign bit of the quadruple-precision floating-point value `a'.
928*----------------------------------------------------------------------------*/
929
930INLINE flag extractFloat128Sign( float128 a )
931{
932
933 return a.high>>63;
934
935}
936
937/*----------------------------------------------------------------------------
938| Normalizes the subnormal quadruple-precision floating-point value
939| represented by the denormalized significand formed by the concatenation of
940| `aSig0' and `aSig1'. The normalized exponent is stored at the location
941| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
942| significand are stored at the location pointed to by `zSig0Ptr', and the
943| least significant 64 bits of the normalized significand are stored at the
944| location pointed to by `zSig1Ptr'.
945*----------------------------------------------------------------------------*/
946
947static void
948 normalizeFloat128Subnormal(
bb98fe42
AF
949 uint64_t aSig0,
950 uint64_t aSig1,
158142c2 951 int32 *zExpPtr,
bb98fe42
AF
952 uint64_t *zSig0Ptr,
953 uint64_t *zSig1Ptr
158142c2
FB
954 )
955{
956 int8 shiftCount;
957
958 if ( aSig0 == 0 ) {
959 shiftCount = countLeadingZeros64( aSig1 ) - 15;
960 if ( shiftCount < 0 ) {
961 *zSig0Ptr = aSig1>>( - shiftCount );
962 *zSig1Ptr = aSig1<<( shiftCount & 63 );
963 }
964 else {
965 *zSig0Ptr = aSig1<<shiftCount;
966 *zSig1Ptr = 0;
967 }
968 *zExpPtr = - shiftCount - 63;
969 }
970 else {
971 shiftCount = countLeadingZeros64( aSig0 ) - 15;
972 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
973 *zExpPtr = 1 - shiftCount;
974 }
975
976}
977
978/*----------------------------------------------------------------------------
979| Packs the sign `zSign', the exponent `zExp', and the significand formed
980| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
981| floating-point value, returning the result. After being shifted into the
982| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
983| added together to form the most significant 32 bits of the result. This
984| means that any integer portion of `zSig0' will be added into the exponent.
985| Since a properly normalized significand will have an integer portion equal
986| to 1, the `zExp' input should be 1 less than the desired result exponent
987| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
988| significand.
989*----------------------------------------------------------------------------*/
990
991INLINE float128
bb98fe42 992 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
993{
994 float128 z;
995
996 z.low = zSig1;
bb98fe42 997 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
998 return z;
999
1000}
1001
1002/*----------------------------------------------------------------------------
1003| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1004| and extended significand formed by the concatenation of `zSig0', `zSig1',
1005| and `zSig2', and returns the proper quadruple-precision floating-point value
1006| corresponding to the abstract input. Ordinarily, the abstract value is
1007| simply rounded and packed into the quadruple-precision format, with the
1008| inexact exception raised if the abstract input cannot be represented
1009| exactly. However, if the abstract value is too large, the overflow and
1010| inexact exceptions are raised and an infinity or maximal finite value is
1011| returned. If the abstract value is too small, the input value is rounded to
1012| a subnormal number, and the underflow and inexact exceptions are raised if
1013| the abstract input cannot be represented exactly as a subnormal quadruple-
1014| precision floating-point number.
1015| The input significand must be normalized or smaller. If the input
1016| significand is not normalized, `zExp' must be 0; in that case, the result
1017| returned is a subnormal number, and it must not require rounding. In the
1018| usual case that the input significand is normalized, `zExp' must be 1 less
1019| than the ``true'' floating-point exponent. The handling of underflow and
1020| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1021*----------------------------------------------------------------------------*/
1022
1023static float128
1024 roundAndPackFloat128(
bb98fe42 1025 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
158142c2
FB
1026{
1027 int8 roundingMode;
1028 flag roundNearestEven, increment, isTiny;
1029
1030 roundingMode = STATUS(float_rounding_mode);
1031 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 1032 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
1033 if ( ! roundNearestEven ) {
1034 if ( roundingMode == float_round_to_zero ) {
1035 increment = 0;
1036 }
1037 else {
1038 if ( zSign ) {
1039 increment = ( roundingMode == float_round_down ) && zSig2;
1040 }
1041 else {
1042 increment = ( roundingMode == float_round_up ) && zSig2;
1043 }
1044 }
1045 }
bb98fe42 1046 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1047 if ( ( 0x7FFD < zExp )
1048 || ( ( zExp == 0x7FFD )
1049 && eq128(
1050 LIT64( 0x0001FFFFFFFFFFFF ),
1051 LIT64( 0xFFFFFFFFFFFFFFFF ),
1052 zSig0,
1053 zSig1
1054 )
1055 && increment
1056 )
1057 ) {
1058 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1059 if ( ( roundingMode == float_round_to_zero )
1060 || ( zSign && ( roundingMode == float_round_up ) )
1061 || ( ! zSign && ( roundingMode == float_round_down ) )
1062 ) {
1063 return
1064 packFloat128(
1065 zSign,
1066 0x7FFE,
1067 LIT64( 0x0000FFFFFFFFFFFF ),
1068 LIT64( 0xFFFFFFFFFFFFFFFF )
1069 );
1070 }
1071 return packFloat128( zSign, 0x7FFF, 0, 0 );
1072 }
1073 if ( zExp < 0 ) {
e6afc87f
PM
1074 if (STATUS(flush_to_zero)) {
1075 float_raise(float_flag_output_denormal STATUS_VAR);
1076 return packFloat128(zSign, 0, 0, 0);
1077 }
158142c2
FB
1078 isTiny =
1079 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1080 || ( zExp < -1 )
1081 || ! increment
1082 || lt128(
1083 zSig0,
1084 zSig1,
1085 LIT64( 0x0001FFFFFFFFFFFF ),
1086 LIT64( 0xFFFFFFFFFFFFFFFF )
1087 );
1088 shift128ExtraRightJamming(
1089 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1090 zExp = 0;
1091 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1092 if ( roundNearestEven ) {
bb98fe42 1093 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
1094 }
1095 else {
1096 if ( zSign ) {
1097 increment = ( roundingMode == float_round_down ) && zSig2;
1098 }
1099 else {
1100 increment = ( roundingMode == float_round_up ) && zSig2;
1101 }
1102 }
1103 }
1104 }
1105 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1106 if ( increment ) {
1107 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1108 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1109 }
1110 else {
1111 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1112 }
1113 return packFloat128( zSign, zExp, zSig0, zSig1 );
1114
1115}
1116
1117/*----------------------------------------------------------------------------
1118| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1119| and significand formed by the concatenation of `zSig0' and `zSig1', and
1120| returns the proper quadruple-precision floating-point value corresponding
1121| to the abstract input. This routine is just like `roundAndPackFloat128'
1122| except that the input significand has fewer bits and does not have to be
1123| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1124| point exponent.
1125*----------------------------------------------------------------------------*/
1126
1127static float128
1128 normalizeRoundAndPackFloat128(
bb98fe42 1129 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
158142c2
FB
1130{
1131 int8 shiftCount;
bb98fe42 1132 uint64_t zSig2;
158142c2
FB
1133
1134 if ( zSig0 == 0 ) {
1135 zSig0 = zSig1;
1136 zSig1 = 0;
1137 zExp -= 64;
1138 }
1139 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1140 if ( 0 <= shiftCount ) {
1141 zSig2 = 0;
1142 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1143 }
1144 else {
1145 shift128ExtraRightJamming(
1146 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1147 }
1148 zExp -= shiftCount;
1149 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1150
1151}
1152
158142c2
FB
1153/*----------------------------------------------------------------------------
1154| Returns the result of converting the 32-bit two's complement integer `a'
1155| to the single-precision floating-point format. The conversion is performed
1156| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1157*----------------------------------------------------------------------------*/
1158
c4850f9e 1159float32 int32_to_float32(int32_t a STATUS_PARAM)
158142c2
FB
1160{
1161 flag zSign;
1162
f090c9d4 1163 if ( a == 0 ) return float32_zero;
bb98fe42 1164 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2
FB
1165 zSign = ( a < 0 );
1166 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1167
1168}
1169
1170/*----------------------------------------------------------------------------
1171| Returns the result of converting the 32-bit two's complement integer `a'
1172| to the double-precision floating-point format. The conversion is performed
1173| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1174*----------------------------------------------------------------------------*/
1175
c4850f9e 1176float64 int32_to_float64(int32_t a STATUS_PARAM)
158142c2
FB
1177{
1178 flag zSign;
1179 uint32 absA;
1180 int8 shiftCount;
bb98fe42 1181 uint64_t zSig;
158142c2 1182
f090c9d4 1183 if ( a == 0 ) return float64_zero;
158142c2
FB
1184 zSign = ( a < 0 );
1185 absA = zSign ? - a : a;
1186 shiftCount = countLeadingZeros32( absA ) + 21;
1187 zSig = absA;
1188 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1189
1190}
1191
158142c2
FB
1192/*----------------------------------------------------------------------------
1193| Returns the result of converting the 32-bit two's complement integer `a'
1194| to the extended double-precision floating-point format. The conversion
1195| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1196| Arithmetic.
1197*----------------------------------------------------------------------------*/
1198
c4850f9e 1199floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
158142c2
FB
1200{
1201 flag zSign;
1202 uint32 absA;
1203 int8 shiftCount;
bb98fe42 1204 uint64_t zSig;
158142c2
FB
1205
1206 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1207 zSign = ( a < 0 );
1208 absA = zSign ? - a : a;
1209 shiftCount = countLeadingZeros32( absA ) + 32;
1210 zSig = absA;
1211 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1212
1213}
1214
158142c2
FB
1215/*----------------------------------------------------------------------------
1216| Returns the result of converting the 32-bit two's complement integer `a' to
1217| the quadruple-precision floating-point format. The conversion is performed
1218| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219*----------------------------------------------------------------------------*/
1220
c4850f9e 1221float128 int32_to_float128(int32_t a STATUS_PARAM)
158142c2
FB
1222{
1223 flag zSign;
1224 uint32 absA;
1225 int8 shiftCount;
bb98fe42 1226 uint64_t zSig0;
158142c2
FB
1227
1228 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1229 zSign = ( a < 0 );
1230 absA = zSign ? - a : a;
1231 shiftCount = countLeadingZeros32( absA ) + 17;
1232 zSig0 = absA;
1233 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1234
1235}
1236
158142c2
FB
1237/*----------------------------------------------------------------------------
1238| Returns the result of converting the 64-bit two's complement integer `a'
1239| to the single-precision floating-point format. The conversion is performed
1240| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1241*----------------------------------------------------------------------------*/
1242
c4850f9e 1243float32 int64_to_float32(int64_t a STATUS_PARAM)
158142c2
FB
1244{
1245 flag zSign;
1246 uint64 absA;
1247 int8 shiftCount;
1248
f090c9d4 1249 if ( a == 0 ) return float32_zero;
158142c2
FB
1250 zSign = ( a < 0 );
1251 absA = zSign ? - a : a;
1252 shiftCount = countLeadingZeros64( absA ) - 40;
1253 if ( 0 <= shiftCount ) {
1254 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1255 }
1256 else {
1257 shiftCount += 7;
1258 if ( shiftCount < 0 ) {
1259 shift64RightJamming( absA, - shiftCount, &absA );
1260 }
1261 else {
1262 absA <<= shiftCount;
1263 }
1264 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1265 }
1266
1267}
1268
c4850f9e 1269float32 uint64_to_float32(uint64_t a STATUS_PARAM)
75d62a58
JM
1270{
1271 int8 shiftCount;
1272
f090c9d4 1273 if ( a == 0 ) return float32_zero;
75d62a58
JM
1274 shiftCount = countLeadingZeros64( a ) - 40;
1275 if ( 0 <= shiftCount ) {
e744c06f 1276 return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
75d62a58
JM
1277 }
1278 else {
1279 shiftCount += 7;
1280 if ( shiftCount < 0 ) {
1281 shift64RightJamming( a, - shiftCount, &a );
1282 }
1283 else {
1284 a <<= shiftCount;
1285 }
e744c06f 1286 return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
75d62a58
JM
1287 }
1288}
1289
158142c2
FB
1290/*----------------------------------------------------------------------------
1291| Returns the result of converting the 64-bit two's complement integer `a'
1292| to the double-precision floating-point format. The conversion is performed
1293| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1294*----------------------------------------------------------------------------*/
1295
c4850f9e 1296float64 int64_to_float64(int64_t a STATUS_PARAM)
158142c2
FB
1297{
1298 flag zSign;
1299
f090c9d4 1300 if ( a == 0 ) return float64_zero;
bb98fe42 1301 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1302 return packFloat64( 1, 0x43E, 0 );
1303 }
1304 zSign = ( a < 0 );
1305 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1306
1307}
1308
c4850f9e 1309float64 uint64_to_float64(uint64_t a STATUS_PARAM)
75d62a58 1310{
17ed2293 1311 int exp = 0x43C;
75d62a58 1312
17ed2293
RH
1313 if (a == 0) {
1314 return float64_zero;
1315 }
1316 if ((int64_t)a < 0) {
1317 shift64RightJamming(a, 1, &a);
1318 exp += 1;
1319 }
1320 return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
75d62a58
JM
1321}
1322
158142c2
FB
1323/*----------------------------------------------------------------------------
1324| Returns the result of converting the 64-bit two's complement integer `a'
1325| to the extended double-precision floating-point format. The conversion
1326| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1327| Arithmetic.
1328*----------------------------------------------------------------------------*/
1329
c4850f9e 1330floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
158142c2
FB
1331{
1332 flag zSign;
1333 uint64 absA;
1334 int8 shiftCount;
1335
1336 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1337 zSign = ( a < 0 );
1338 absA = zSign ? - a : a;
1339 shiftCount = countLeadingZeros64( absA );
1340 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1341
1342}
1343
158142c2
FB
1344/*----------------------------------------------------------------------------
1345| Returns the result of converting the 64-bit two's complement integer `a' to
1346| the quadruple-precision floating-point format. The conversion is performed
1347| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1348*----------------------------------------------------------------------------*/
1349
c4850f9e 1350float128 int64_to_float128(int64_t a STATUS_PARAM)
158142c2
FB
1351{
1352 flag zSign;
1353 uint64 absA;
1354 int8 shiftCount;
1355 int32 zExp;
bb98fe42 1356 uint64_t zSig0, zSig1;
158142c2
FB
1357
1358 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1359 zSign = ( a < 0 );
1360 absA = zSign ? - a : a;
1361 shiftCount = countLeadingZeros64( absA ) + 49;
1362 zExp = 0x406E - shiftCount;
1363 if ( 64 <= shiftCount ) {
1364 zSig1 = 0;
1365 zSig0 = absA;
1366 shiftCount -= 64;
1367 }
1368 else {
1369 zSig1 = absA;
1370 zSig0 = 0;
1371 }
1372 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1373 return packFloat128( zSign, zExp, zSig0, zSig1 );
1374
1375}
1376
c4850f9e 1377float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1e397ead
RH
1378{
1379 if (a == 0) {
1380 return float128_zero;
1381 }
1382 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1383}
1384
158142c2
FB
1385/*----------------------------------------------------------------------------
1386| Returns the result of converting the single-precision floating-point value
1387| `a' to the 32-bit two's complement integer format. The conversion is
1388| performed according to the IEC/IEEE Standard for Binary Floating-Point
1389| Arithmetic---which means in particular that the conversion is rounded
1390| according to the current rounding mode. If `a' is a NaN, the largest
1391| positive integer is returned. Otherwise, if the conversion overflows, the
1392| largest integer with the same sign as `a' is returned.
1393*----------------------------------------------------------------------------*/
1394
1395int32 float32_to_int32( float32 a STATUS_PARAM )
1396{
1397 flag aSign;
94a49d86 1398 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1399 uint32_t aSig;
1400 uint64_t aSig64;
158142c2 1401
37d18660 1402 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1403 aSig = extractFloat32Frac( a );
1404 aExp = extractFloat32Exp( a );
1405 aSign = extractFloat32Sign( a );
1406 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1407 if ( aExp ) aSig |= 0x00800000;
1408 shiftCount = 0xAF - aExp;
1409 aSig64 = aSig;
1410 aSig64 <<= 32;
1411 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1412 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1413
1414}
1415
1416/*----------------------------------------------------------------------------
1417| Returns the result of converting the single-precision floating-point value
1418| `a' to the 32-bit two's complement integer format. The conversion is
1419| performed according to the IEC/IEEE Standard for Binary Floating-Point
1420| Arithmetic, except that the conversion is always rounded toward zero.
1421| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1422| the conversion overflows, the largest integer with the same sign as `a' is
1423| returned.
1424*----------------------------------------------------------------------------*/
1425
1426int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1427{
1428 flag aSign;
94a49d86 1429 int_fast16_t aExp, shiftCount;
bb98fe42 1430 uint32_t aSig;
b3a6a2e0 1431 int32_t z;
37d18660 1432 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1433
1434 aSig = extractFloat32Frac( a );
1435 aExp = extractFloat32Exp( a );
1436 aSign = extractFloat32Sign( a );
1437 shiftCount = aExp - 0x9E;
1438 if ( 0 <= shiftCount ) {
f090c9d4 1439 if ( float32_val(a) != 0xCF000000 ) {
158142c2
FB
1440 float_raise( float_flag_invalid STATUS_VAR);
1441 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1442 }
bb98fe42 1443 return (int32_t) 0x80000000;
158142c2
FB
1444 }
1445 else if ( aExp <= 0x7E ) {
1446 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1447 return 0;
1448 }
1449 aSig = ( aSig | 0x00800000 )<<8;
1450 z = aSig>>( - shiftCount );
bb98fe42 1451 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
158142c2
FB
1452 STATUS(float_exception_flags) |= float_flag_inexact;
1453 }
1454 if ( aSign ) z = - z;
1455 return z;
1456
1457}
1458
cbcef455
PM
1459/*----------------------------------------------------------------------------
1460| Returns the result of converting the single-precision floating-point value
1461| `a' to the 16-bit two's complement integer format. The conversion is
1462| performed according to the IEC/IEEE Standard for Binary Floating-Point
1463| Arithmetic, except that the conversion is always rounded toward zero.
1464| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1465| the conversion overflows, the largest integer with the same sign as `a' is
1466| returned.
1467*----------------------------------------------------------------------------*/
1468
94a49d86 1469int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
1470{
1471 flag aSign;
94a49d86 1472 int_fast16_t aExp, shiftCount;
bb98fe42 1473 uint32_t aSig;
cbcef455
PM
1474 int32 z;
1475
1476 aSig = extractFloat32Frac( a );
1477 aExp = extractFloat32Exp( a );
1478 aSign = extractFloat32Sign( a );
1479 shiftCount = aExp - 0x8E;
1480 if ( 0 <= shiftCount ) {
1481 if ( float32_val(a) != 0xC7000000 ) {
1482 float_raise( float_flag_invalid STATUS_VAR);
1483 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1484 return 0x7FFF;
1485 }
1486 }
bb98fe42 1487 return (int32_t) 0xffff8000;
cbcef455
PM
1488 }
1489 else if ( aExp <= 0x7E ) {
1490 if ( aExp | aSig ) {
1491 STATUS(float_exception_flags) |= float_flag_inexact;
1492 }
1493 return 0;
1494 }
1495 shiftCount -= 0x10;
1496 aSig = ( aSig | 0x00800000 )<<8;
1497 z = aSig>>( - shiftCount );
bb98fe42 1498 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
cbcef455
PM
1499 STATUS(float_exception_flags) |= float_flag_inexact;
1500 }
1501 if ( aSign ) {
1502 z = - z;
1503 }
1504 return z;
1505
1506}
1507
158142c2
FB
1508/*----------------------------------------------------------------------------
1509| Returns the result of converting the single-precision floating-point value
1510| `a' to the 64-bit two's complement integer format. The conversion is
1511| performed according to the IEC/IEEE Standard for Binary Floating-Point
1512| Arithmetic---which means in particular that the conversion is rounded
1513| according to the current rounding mode. If `a' is a NaN, the largest
1514| positive integer is returned. Otherwise, if the conversion overflows, the
1515| largest integer with the same sign as `a' is returned.
1516*----------------------------------------------------------------------------*/
1517
1518int64 float32_to_int64( float32 a STATUS_PARAM )
1519{
1520 flag aSign;
94a49d86 1521 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1522 uint32_t aSig;
1523 uint64_t aSig64, aSigExtra;
37d18660 1524 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1525
1526 aSig = extractFloat32Frac( a );
1527 aExp = extractFloat32Exp( a );
1528 aSign = extractFloat32Sign( a );
1529 shiftCount = 0xBE - aExp;
1530 if ( shiftCount < 0 ) {
1531 float_raise( float_flag_invalid STATUS_VAR);
1532 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1533 return LIT64( 0x7FFFFFFFFFFFFFFF );
1534 }
bb98fe42 1535 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1536 }
1537 if ( aExp ) aSig |= 0x00800000;
1538 aSig64 = aSig;
1539 aSig64 <<= 40;
1540 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1541 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1542
1543}
1544
2f18bbf9
TM
1545/*----------------------------------------------------------------------------
1546| Returns the result of converting the single-precision floating-point value
1547| `a' to the 64-bit unsigned integer format. The conversion is
1548| performed according to the IEC/IEEE Standard for Binary Floating-Point
1549| Arithmetic---which means in particular that the conversion is rounded
1550| according to the current rounding mode. If `a' is a NaN, the largest
1551| unsigned integer is returned. Otherwise, if the conversion overflows, the
1552| largest unsigned integer is returned. If the 'a' is negative, the result
1553| is rounded and zero is returned; values that do not round to zero will
1554| raise the inexact exception flag.
1555*----------------------------------------------------------------------------*/
1556
1557uint64 float32_to_uint64(float32 a STATUS_PARAM)
1558{
1559 flag aSign;
1560 int_fast16_t aExp, shiftCount;
1561 uint32_t aSig;
1562 uint64_t aSig64, aSigExtra;
1563 a = float32_squash_input_denormal(a STATUS_VAR);
1564
1565 aSig = extractFloat32Frac(a);
1566 aExp = extractFloat32Exp(a);
1567 aSign = extractFloat32Sign(a);
1568 if ((aSign) && (aExp > 126)) {
1569 float_raise(float_flag_invalid STATUS_VAR);
1570 if (float32_is_any_nan(a)) {
1571 return LIT64(0xFFFFFFFFFFFFFFFF);
1572 } else {
1573 return 0;
1574 }
1575 }
1576 shiftCount = 0xBE - aExp;
1577 if (aExp) {
1578 aSig |= 0x00800000;
1579 }
1580 if (shiftCount < 0) {
1581 float_raise(float_flag_invalid STATUS_VAR);
1582 return LIT64(0xFFFFFFFFFFFFFFFF);
1583 }
1584
1585 aSig64 = aSig;
1586 aSig64 <<= 40;
1587 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1588 return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1589}
1590
158142c2
FB
1591/*----------------------------------------------------------------------------
1592| Returns the result of converting the single-precision floating-point value
1593| `a' to the 64-bit two's complement integer format. The conversion is
1594| performed according to the IEC/IEEE Standard for Binary Floating-Point
1595| Arithmetic, except that the conversion is always rounded toward zero. If
1596| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1597| conversion overflows, the largest integer with the same sign as `a' is
1598| returned.
1599*----------------------------------------------------------------------------*/
1600
1601int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1602{
1603 flag aSign;
94a49d86 1604 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1605 uint32_t aSig;
1606 uint64_t aSig64;
158142c2 1607 int64 z;
37d18660 1608 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1609
1610 aSig = extractFloat32Frac( a );
1611 aExp = extractFloat32Exp( a );
1612 aSign = extractFloat32Sign( a );
1613 shiftCount = aExp - 0xBE;
1614 if ( 0 <= shiftCount ) {
f090c9d4 1615 if ( float32_val(a) != 0xDF000000 ) {
158142c2
FB
1616 float_raise( float_flag_invalid STATUS_VAR);
1617 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1618 return LIT64( 0x7FFFFFFFFFFFFFFF );
1619 }
1620 }
bb98fe42 1621 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1622 }
1623 else if ( aExp <= 0x7E ) {
1624 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1625 return 0;
1626 }
1627 aSig64 = aSig | 0x00800000;
1628 aSig64 <<= 40;
1629 z = aSig64>>( - shiftCount );
bb98fe42 1630 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
158142c2
FB
1631 STATUS(float_exception_flags) |= float_flag_inexact;
1632 }
1633 if ( aSign ) z = - z;
1634 return z;
1635
1636}
1637
1638/*----------------------------------------------------------------------------
1639| Returns the result of converting the single-precision floating-point value
1640| `a' to the double-precision floating-point format. The conversion is
1641| performed according to the IEC/IEEE Standard for Binary Floating-Point
1642| Arithmetic.
1643*----------------------------------------------------------------------------*/
1644
1645float64 float32_to_float64( float32 a STATUS_PARAM )
1646{
1647 flag aSign;
94a49d86 1648 int_fast16_t aExp;
bb98fe42 1649 uint32_t aSig;
37d18660 1650 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1651
1652 aSig = extractFloat32Frac( a );
1653 aExp = extractFloat32Exp( a );
1654 aSign = extractFloat32Sign( a );
1655 if ( aExp == 0xFF ) {
bcd4d9af 1656 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1657 return packFloat64( aSign, 0x7FF, 0 );
1658 }
1659 if ( aExp == 0 ) {
1660 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1661 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1662 --aExp;
1663 }
bb98fe42 1664 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1665
1666}
1667
158142c2
FB
1668/*----------------------------------------------------------------------------
1669| Returns the result of converting the single-precision floating-point value
1670| `a' to the extended double-precision floating-point format. The conversion
1671| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1672| Arithmetic.
1673*----------------------------------------------------------------------------*/
1674
1675floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1676{
1677 flag aSign;
94a49d86 1678 int_fast16_t aExp;
bb98fe42 1679 uint32_t aSig;
158142c2 1680
37d18660 1681 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1682 aSig = extractFloat32Frac( a );
1683 aExp = extractFloat32Exp( a );
1684 aSign = extractFloat32Sign( a );
1685 if ( aExp == 0xFF ) {
bcd4d9af 1686 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1687 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1688 }
1689 if ( aExp == 0 ) {
1690 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1691 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1692 }
1693 aSig |= 0x00800000;
bb98fe42 1694 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1695
1696}
1697
158142c2
FB
1698/*----------------------------------------------------------------------------
1699| Returns the result of converting the single-precision floating-point value
1700| `a' to the double-precision floating-point format. The conversion is
1701| performed according to the IEC/IEEE Standard for Binary Floating-Point
1702| Arithmetic.
1703*----------------------------------------------------------------------------*/
1704
1705float128 float32_to_float128( float32 a STATUS_PARAM )
1706{
1707 flag aSign;
94a49d86 1708 int_fast16_t aExp;
bb98fe42 1709 uint32_t aSig;
158142c2 1710
37d18660 1711 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1712 aSig = extractFloat32Frac( a );
1713 aExp = extractFloat32Exp( a );
1714 aSign = extractFloat32Sign( a );
1715 if ( aExp == 0xFF ) {
bcd4d9af 1716 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1717 return packFloat128( aSign, 0x7FFF, 0, 0 );
1718 }
1719 if ( aExp == 0 ) {
1720 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1721 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1722 --aExp;
1723 }
bb98fe42 1724 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1725
1726}
1727
158142c2
FB
1728/*----------------------------------------------------------------------------
1729| Rounds the single-precision floating-point value `a' to an integer, and
1730| returns the result as a single-precision floating-point value. The
1731| operation is performed according to the IEC/IEEE Standard for Binary
1732| Floating-Point Arithmetic.
1733*----------------------------------------------------------------------------*/
1734
1735float32 float32_round_to_int( float32 a STATUS_PARAM)
1736{
1737 flag aSign;
94a49d86 1738 int_fast16_t aExp;
bb98fe42 1739 uint32_t lastBitMask, roundBitsMask;
158142c2 1740 int8 roundingMode;
bb98fe42 1741 uint32_t z;
37d18660 1742 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1743
1744 aExp = extractFloat32Exp( a );
1745 if ( 0x96 <= aExp ) {
1746 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1747 return propagateFloat32NaN( a, a STATUS_VAR );
1748 }
1749 return a;
1750 }
1751 if ( aExp <= 0x7E ) {
bb98fe42 1752 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
158142c2
FB
1753 STATUS(float_exception_flags) |= float_flag_inexact;
1754 aSign = extractFloat32Sign( a );
1755 switch ( STATUS(float_rounding_mode) ) {
1756 case float_round_nearest_even:
1757 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1758 return packFloat32( aSign, 0x7F, 0 );
1759 }
1760 break;
1761 case float_round_down:
f090c9d4 1762 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1763 case float_round_up:
f090c9d4 1764 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1765 }
1766 return packFloat32( aSign, 0, 0 );
1767 }
1768 lastBitMask = 1;
1769 lastBitMask <<= 0x96 - aExp;
1770 roundBitsMask = lastBitMask - 1;
f090c9d4 1771 z = float32_val(a);
158142c2
FB
1772 roundingMode = STATUS(float_rounding_mode);
1773 if ( roundingMode == float_round_nearest_even ) {
1774 z += lastBitMask>>1;
1775 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1776 }
1777 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 1778 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
1779 z += roundBitsMask;
1780 }
1781 }
1782 z &= ~ roundBitsMask;
f090c9d4
PB
1783 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1784 return make_float32(z);
158142c2
FB
1785
1786}
1787
1788/*----------------------------------------------------------------------------
1789| Returns the result of adding the absolute values of the single-precision
1790| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1791| before being returned. `zSign' is ignored if the result is a NaN.
1792| The addition is performed according to the IEC/IEEE Standard for Binary
1793| Floating-Point Arithmetic.
1794*----------------------------------------------------------------------------*/
1795
1796static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1797{
94a49d86 1798 int_fast16_t aExp, bExp, zExp;
bb98fe42 1799 uint32_t aSig, bSig, zSig;
94a49d86 1800 int_fast16_t expDiff;
158142c2
FB
1801
1802 aSig = extractFloat32Frac( a );
1803 aExp = extractFloat32Exp( a );
1804 bSig = extractFloat32Frac( b );
1805 bExp = extractFloat32Exp( b );
1806 expDiff = aExp - bExp;
1807 aSig <<= 6;
1808 bSig <<= 6;
1809 if ( 0 < expDiff ) {
1810 if ( aExp == 0xFF ) {
1811 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1812 return a;
1813 }
1814 if ( bExp == 0 ) {
1815 --expDiff;
1816 }
1817 else {
1818 bSig |= 0x20000000;
1819 }
1820 shift32RightJamming( bSig, expDiff, &bSig );
1821 zExp = aExp;
1822 }
1823 else if ( expDiff < 0 ) {
1824 if ( bExp == 0xFF ) {
1825 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1826 return packFloat32( zSign, 0xFF, 0 );
1827 }
1828 if ( aExp == 0 ) {
1829 ++expDiff;
1830 }
1831 else {
1832 aSig |= 0x20000000;
1833 }
1834 shift32RightJamming( aSig, - expDiff, &aSig );
1835 zExp = bExp;
1836 }
1837 else {
1838 if ( aExp == 0xFF ) {
1839 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1840 return a;
1841 }
fe76d976 1842 if ( aExp == 0 ) {
e6afc87f
PM
1843 if (STATUS(flush_to_zero)) {
1844 if (aSig | bSig) {
1845 float_raise(float_flag_output_denormal STATUS_VAR);
1846 }
1847 return packFloat32(zSign, 0, 0);
1848 }
fe76d976
PB
1849 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1850 }
158142c2
FB
1851 zSig = 0x40000000 + aSig + bSig;
1852 zExp = aExp;
1853 goto roundAndPack;
1854 }
1855 aSig |= 0x20000000;
1856 zSig = ( aSig + bSig )<<1;
1857 --zExp;
bb98fe42 1858 if ( (int32_t) zSig < 0 ) {
158142c2
FB
1859 zSig = aSig + bSig;
1860 ++zExp;
1861 }
1862 roundAndPack:
1863 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1864
1865}
1866
1867/*----------------------------------------------------------------------------
1868| Returns the result of subtracting the absolute values of the single-
1869| precision floating-point values `a' and `b'. If `zSign' is 1, the
1870| difference is negated before being returned. `zSign' is ignored if the
1871| result is a NaN. The subtraction is performed according to the IEC/IEEE
1872| Standard for Binary Floating-Point Arithmetic.
1873*----------------------------------------------------------------------------*/
1874
1875static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1876{
94a49d86 1877 int_fast16_t aExp, bExp, zExp;
bb98fe42 1878 uint32_t aSig, bSig, zSig;
94a49d86 1879 int_fast16_t expDiff;
158142c2
FB
1880
1881 aSig = extractFloat32Frac( a );
1882 aExp = extractFloat32Exp( a );
1883 bSig = extractFloat32Frac( b );
1884 bExp = extractFloat32Exp( b );
1885 expDiff = aExp - bExp;
1886 aSig <<= 7;
1887 bSig <<= 7;
1888 if ( 0 < expDiff ) goto aExpBigger;
1889 if ( expDiff < 0 ) goto bExpBigger;
1890 if ( aExp == 0xFF ) {
1891 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1892 float_raise( float_flag_invalid STATUS_VAR);
1893 return float32_default_nan;
1894 }
1895 if ( aExp == 0 ) {
1896 aExp = 1;
1897 bExp = 1;
1898 }
1899 if ( bSig < aSig ) goto aBigger;
1900 if ( aSig < bSig ) goto bBigger;
1901 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1902 bExpBigger:
1903 if ( bExp == 0xFF ) {
1904 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1905 return packFloat32( zSign ^ 1, 0xFF, 0 );
1906 }
1907 if ( aExp == 0 ) {
1908 ++expDiff;
1909 }
1910 else {
1911 aSig |= 0x40000000;
1912 }
1913 shift32RightJamming( aSig, - expDiff, &aSig );
1914 bSig |= 0x40000000;
1915 bBigger:
1916 zSig = bSig - aSig;
1917 zExp = bExp;
1918 zSign ^= 1;
1919 goto normalizeRoundAndPack;
1920 aExpBigger:
1921 if ( aExp == 0xFF ) {
1922 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1923 return a;
1924 }
1925 if ( bExp == 0 ) {
1926 --expDiff;
1927 }
1928 else {
1929 bSig |= 0x40000000;
1930 }
1931 shift32RightJamming( bSig, expDiff, &bSig );
1932 aSig |= 0x40000000;
1933 aBigger:
1934 zSig = aSig - bSig;
1935 zExp = aExp;
1936 normalizeRoundAndPack:
1937 --zExp;
1938 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1939
1940}
1941
1942/*----------------------------------------------------------------------------
1943| Returns the result of adding the single-precision floating-point values `a'
1944| and `b'. The operation is performed according to the IEC/IEEE Standard for
1945| Binary Floating-Point Arithmetic.
1946*----------------------------------------------------------------------------*/
1947
1948float32 float32_add( float32 a, float32 b STATUS_PARAM )
1949{
1950 flag aSign, bSign;
37d18660
PM
1951 a = float32_squash_input_denormal(a STATUS_VAR);
1952 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1953
1954 aSign = extractFloat32Sign( a );
1955 bSign = extractFloat32Sign( b );
1956 if ( aSign == bSign ) {
1957 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1958 }
1959 else {
1960 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1961 }
1962
1963}
1964
1965/*----------------------------------------------------------------------------
1966| Returns the result of subtracting the single-precision floating-point values
1967| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1968| for Binary Floating-Point Arithmetic.
1969*----------------------------------------------------------------------------*/
1970
1971float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1972{
1973 flag aSign, bSign;
37d18660
PM
1974 a = float32_squash_input_denormal(a STATUS_VAR);
1975 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1976
1977 aSign = extractFloat32Sign( a );
1978 bSign = extractFloat32Sign( b );
1979 if ( aSign == bSign ) {
1980 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1981 }
1982 else {
1983 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1984 }
1985
1986}
1987
1988/*----------------------------------------------------------------------------
1989| Returns the result of multiplying the single-precision floating-point values
1990| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1991| for Binary Floating-Point Arithmetic.
1992*----------------------------------------------------------------------------*/
1993
1994float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1995{
1996 flag aSign, bSign, zSign;
94a49d86 1997 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
1998 uint32_t aSig, bSig;
1999 uint64_t zSig64;
2000 uint32_t zSig;
158142c2 2001
37d18660
PM
2002 a = float32_squash_input_denormal(a STATUS_VAR);
2003 b = float32_squash_input_denormal(b STATUS_VAR);
2004
158142c2
FB
2005 aSig = extractFloat32Frac( a );
2006 aExp = extractFloat32Exp( a );
2007 aSign = extractFloat32Sign( a );
2008 bSig = extractFloat32Frac( b );
2009 bExp = extractFloat32Exp( b );
2010 bSign = extractFloat32Sign( b );
2011 zSign = aSign ^ bSign;
2012 if ( aExp == 0xFF ) {
2013 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2014 return propagateFloat32NaN( a, b STATUS_VAR );
2015 }
2016 if ( ( bExp | bSig ) == 0 ) {
2017 float_raise( float_flag_invalid STATUS_VAR);
2018 return float32_default_nan;
2019 }
2020 return packFloat32( zSign, 0xFF, 0 );
2021 }
2022 if ( bExp == 0xFF ) {
2023 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2024 if ( ( aExp | aSig ) == 0 ) {
2025 float_raise( float_flag_invalid STATUS_VAR);
2026 return float32_default_nan;
2027 }
2028 return packFloat32( zSign, 0xFF, 0 );
2029 }
2030 if ( aExp == 0 ) {
2031 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2032 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2033 }
2034 if ( bExp == 0 ) {
2035 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2036 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2037 }
2038 zExp = aExp + bExp - 0x7F;
2039 aSig = ( aSig | 0x00800000 )<<7;
2040 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 2041 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 2042 zSig = zSig64;
bb98fe42 2043 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
2044 zSig <<= 1;
2045 --zExp;
2046 }
2047 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2048
2049}
2050
2051/*----------------------------------------------------------------------------
2052| Returns the result of dividing the single-precision floating-point value `a'
2053| by the corresponding value `b'. The operation is performed according to the
2054| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2055*----------------------------------------------------------------------------*/
2056
2057float32 float32_div( float32 a, float32 b STATUS_PARAM )
2058{
2059 flag aSign, bSign, zSign;
94a49d86 2060 int_fast16_t aExp, bExp, zExp;
bb98fe42 2061 uint32_t aSig, bSig, zSig;
37d18660
PM
2062 a = float32_squash_input_denormal(a STATUS_VAR);
2063 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2064
2065 aSig = extractFloat32Frac( a );
2066 aExp = extractFloat32Exp( a );
2067 aSign = extractFloat32Sign( a );
2068 bSig = extractFloat32Frac( b );
2069 bExp = extractFloat32Exp( b );
2070 bSign = extractFloat32Sign( b );
2071 zSign = aSign ^ bSign;
2072 if ( aExp == 0xFF ) {
2073 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2074 if ( bExp == 0xFF ) {
2075 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2076 float_raise( float_flag_invalid STATUS_VAR);
2077 return float32_default_nan;
2078 }
2079 return packFloat32( zSign, 0xFF, 0 );
2080 }
2081 if ( bExp == 0xFF ) {
2082 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2083 return packFloat32( zSign, 0, 0 );
2084 }
2085 if ( bExp == 0 ) {
2086 if ( bSig == 0 ) {
2087 if ( ( aExp | aSig ) == 0 ) {
2088 float_raise( float_flag_invalid STATUS_VAR);
2089 return float32_default_nan;
2090 }
2091 float_raise( float_flag_divbyzero STATUS_VAR);
2092 return packFloat32( zSign, 0xFF, 0 );
2093 }
2094 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2095 }
2096 if ( aExp == 0 ) {
2097 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2098 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2099 }
2100 zExp = aExp - bExp + 0x7D;
2101 aSig = ( aSig | 0x00800000 )<<7;
2102 bSig = ( bSig | 0x00800000 )<<8;
2103 if ( bSig <= ( aSig + aSig ) ) {
2104 aSig >>= 1;
2105 ++zExp;
2106 }
bb98fe42 2107 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2108 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2109 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2
FB
2110 }
2111 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2112
2113}
2114
2115/*----------------------------------------------------------------------------
2116| Returns the remainder of the single-precision floating-point value `a'
2117| with respect to the corresponding value `b'. The operation is performed
2118| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2119*----------------------------------------------------------------------------*/
2120
2121float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2122{
ed086f3d 2123 flag aSign, zSign;
94a49d86 2124 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
2125 uint32_t aSig, bSig;
2126 uint32_t q;
2127 uint64_t aSig64, bSig64, q64;
2128 uint32_t alternateASig;
2129 int32_t sigMean;
37d18660
PM
2130 a = float32_squash_input_denormal(a STATUS_VAR);
2131 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2132
2133 aSig = extractFloat32Frac( a );
2134 aExp = extractFloat32Exp( a );
2135 aSign = extractFloat32Sign( a );
2136 bSig = extractFloat32Frac( b );
2137 bExp = extractFloat32Exp( b );
158142c2
FB
2138 if ( aExp == 0xFF ) {
2139 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2140 return propagateFloat32NaN( a, b STATUS_VAR );
2141 }
2142 float_raise( float_flag_invalid STATUS_VAR);
2143 return float32_default_nan;
2144 }
2145 if ( bExp == 0xFF ) {
2146 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2147 return a;
2148 }
2149 if ( bExp == 0 ) {
2150 if ( bSig == 0 ) {
2151 float_raise( float_flag_invalid STATUS_VAR);
2152 return float32_default_nan;
2153 }
2154 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2155 }
2156 if ( aExp == 0 ) {
2157 if ( aSig == 0 ) return a;
2158 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2159 }
2160 expDiff = aExp - bExp;
2161 aSig |= 0x00800000;
2162 bSig |= 0x00800000;
2163 if ( expDiff < 32 ) {
2164 aSig <<= 8;
2165 bSig <<= 8;
2166 if ( expDiff < 0 ) {
2167 if ( expDiff < -1 ) return a;
2168 aSig >>= 1;
2169 }
2170 q = ( bSig <= aSig );
2171 if ( q ) aSig -= bSig;
2172 if ( 0 < expDiff ) {
bb98fe42 2173 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2174 q >>= 32 - expDiff;
2175 bSig >>= 2;
2176 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2177 }
2178 else {
2179 aSig >>= 2;
2180 bSig >>= 2;
2181 }
2182 }
2183 else {
2184 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2185 aSig64 = ( (uint64_t) aSig )<<40;
2186 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2187 expDiff -= 64;
2188 while ( 0 < expDiff ) {
2189 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2190 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2191 aSig64 = - ( ( bSig * q64 )<<38 );
2192 expDiff -= 62;
2193 }
2194 expDiff += 64;
2195 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2196 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2197 q = q64>>( 64 - expDiff );
2198 bSig <<= 6;
2199 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2200 }
2201 do {
2202 alternateASig = aSig;
2203 ++q;
2204 aSig -= bSig;
bb98fe42 2205 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2206 sigMean = aSig + alternateASig;
2207 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2208 aSig = alternateASig;
2209 }
bb98fe42 2210 zSign = ( (int32_t) aSig < 0 );
158142c2
FB
2211 if ( zSign ) aSig = - aSig;
2212 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2213
2214}
2215
369be8f6
PM
2216/*----------------------------------------------------------------------------
2217| Returns the result of multiplying the single-precision floating-point values
2218| `a' and `b' then adding 'c', with no intermediate rounding step after the
2219| multiplication. The operation is performed according to the IEC/IEEE
2220| Standard for Binary Floating-Point Arithmetic 754-2008.
2221| The flags argument allows the caller to select negation of the
2222| addend, the intermediate product, or the final result. (The difference
2223| between this and having the caller do a separate negation is that negating
2224| externally will flip the sign bit on NaNs.)
2225*----------------------------------------------------------------------------*/
2226
2227float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2228{
2229 flag aSign, bSign, cSign, zSign;
94a49d86 2230 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2231 uint32_t aSig, bSig, cSig;
2232 flag pInf, pZero, pSign;
2233 uint64_t pSig64, cSig64, zSig64;
2234 uint32_t pSig;
2235 int shiftcount;
2236 flag signflip, infzero;
2237
2238 a = float32_squash_input_denormal(a STATUS_VAR);
2239 b = float32_squash_input_denormal(b STATUS_VAR);
2240 c = float32_squash_input_denormal(c STATUS_VAR);
2241 aSig = extractFloat32Frac(a);
2242 aExp = extractFloat32Exp(a);
2243 aSign = extractFloat32Sign(a);
2244 bSig = extractFloat32Frac(b);
2245 bExp = extractFloat32Exp(b);
2246 bSign = extractFloat32Sign(b);
2247 cSig = extractFloat32Frac(c);
2248 cExp = extractFloat32Exp(c);
2249 cSign = extractFloat32Sign(c);
2250
2251 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2252 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2253
2254 /* It is implementation-defined whether the cases of (0,inf,qnan)
2255 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2256 * they return if they do), so we have to hand this information
2257 * off to the target-specific pick-a-NaN routine.
2258 */
2259 if (((aExp == 0xff) && aSig) ||
2260 ((bExp == 0xff) && bSig) ||
2261 ((cExp == 0xff) && cSig)) {
2262 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2263 }
2264
2265 if (infzero) {
2266 float_raise(float_flag_invalid STATUS_VAR);
2267 return float32_default_nan;
2268 }
2269
2270 if (flags & float_muladd_negate_c) {
2271 cSign ^= 1;
2272 }
2273
2274 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2275
2276 /* Work out the sign and type of the product */
2277 pSign = aSign ^ bSign;
2278 if (flags & float_muladd_negate_product) {
2279 pSign ^= 1;
2280 }
2281 pInf = (aExp == 0xff) || (bExp == 0xff);
2282 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2283
2284 if (cExp == 0xff) {
2285 if (pInf && (pSign ^ cSign)) {
2286 /* addition of opposite-signed infinities => InvalidOperation */
2287 float_raise(float_flag_invalid STATUS_VAR);
2288 return float32_default_nan;
2289 }
2290 /* Otherwise generate an infinity of the same sign */
2291 return packFloat32(cSign ^ signflip, 0xff, 0);
2292 }
2293
2294 if (pInf) {
2295 return packFloat32(pSign ^ signflip, 0xff, 0);
2296 }
2297
2298 if (pZero) {
2299 if (cExp == 0) {
2300 if (cSig == 0) {
2301 /* Adding two exact zeroes */
2302 if (pSign == cSign) {
2303 zSign = pSign;
2304 } else if (STATUS(float_rounding_mode) == float_round_down) {
2305 zSign = 1;
2306 } else {
2307 zSign = 0;
2308 }
2309 return packFloat32(zSign ^ signflip, 0, 0);
2310 }
2311 /* Exact zero plus a denorm */
2312 if (STATUS(flush_to_zero)) {
2313 float_raise(float_flag_output_denormal STATUS_VAR);
2314 return packFloat32(cSign ^ signflip, 0, 0);
2315 }
2316 }
2317 /* Zero plus something non-zero : just return the something */
a6e7c184 2318 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2319 }
2320
2321 if (aExp == 0) {
2322 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2323 }
2324 if (bExp == 0) {
2325 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2326 }
2327
2328 /* Calculate the actual result a * b + c */
2329
2330 /* Multiply first; this is easy. */
2331 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2332 * because we want the true exponent, not the "one-less-than"
2333 * flavour that roundAndPackFloat32() takes.
2334 */
2335 pExp = aExp + bExp - 0x7e;
2336 aSig = (aSig | 0x00800000) << 7;
2337 bSig = (bSig | 0x00800000) << 8;
2338 pSig64 = (uint64_t)aSig * bSig;
2339 if ((int64_t)(pSig64 << 1) >= 0) {
2340 pSig64 <<= 1;
2341 pExp--;
2342 }
2343
2344 zSign = pSign ^ signflip;
2345
2346 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2347 * position 62.
2348 */
2349 if (cExp == 0) {
2350 if (!cSig) {
2351 /* Throw out the special case of c being an exact zero now */
2352 shift64RightJamming(pSig64, 32, &pSig64);
2353 pSig = pSig64;
2354 return roundAndPackFloat32(zSign, pExp - 1,
2355 pSig STATUS_VAR);
2356 }
2357 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2358 }
2359
2360 cSig64 = (uint64_t)cSig << (62 - 23);
2361 cSig64 |= LIT64(0x4000000000000000);
2362 expDiff = pExp - cExp;
2363
2364 if (pSign == cSign) {
2365 /* Addition */
2366 if (expDiff > 0) {
2367 /* scale c to match p */
2368 shift64RightJamming(cSig64, expDiff, &cSig64);
2369 zExp = pExp;
2370 } else if (expDiff < 0) {
2371 /* scale p to match c */
2372 shift64RightJamming(pSig64, -expDiff, &pSig64);
2373 zExp = cExp;
2374 } else {
2375 /* no scaling needed */
2376 zExp = cExp;
2377 }
2378 /* Add significands and make sure explicit bit ends up in posn 62 */
2379 zSig64 = pSig64 + cSig64;
2380 if ((int64_t)zSig64 < 0) {
2381 shift64RightJamming(zSig64, 1, &zSig64);
2382 } else {
2383 zExp--;
2384 }
2385 } else {
2386 /* Subtraction */
2387 if (expDiff > 0) {
2388 shift64RightJamming(cSig64, expDiff, &cSig64);
2389 zSig64 = pSig64 - cSig64;
2390 zExp = pExp;
2391 } else if (expDiff < 0) {
2392 shift64RightJamming(pSig64, -expDiff, &pSig64);
2393 zSig64 = cSig64 - pSig64;
2394 zExp = cExp;
2395 zSign ^= 1;
2396 } else {
2397 zExp = pExp;
2398 if (cSig64 < pSig64) {
2399 zSig64 = pSig64 - cSig64;
2400 } else if (pSig64 < cSig64) {
2401 zSig64 = cSig64 - pSig64;
2402 zSign ^= 1;
2403 } else {
2404 /* Exact zero */
2405 zSign = signflip;
2406 if (STATUS(float_rounding_mode) == float_round_down) {
2407 zSign ^= 1;
2408 }
2409 return packFloat32(zSign, 0, 0);
2410 }
2411 }
2412 --zExp;
2413 /* Normalize to put the explicit bit back into bit 62. */
2414 shiftcount = countLeadingZeros64(zSig64) - 1;
2415 zSig64 <<= shiftcount;
2416 zExp -= shiftcount;
2417 }
2418 shift64RightJamming(zSig64, 32, &zSig64);
2419 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2420}
2421
2422
158142c2
FB
2423/*----------------------------------------------------------------------------
2424| Returns the square root of the single-precision floating-point value `a'.
2425| The operation is performed according to the IEC/IEEE Standard for Binary
2426| Floating-Point Arithmetic.
2427*----------------------------------------------------------------------------*/
2428
2429float32 float32_sqrt( float32 a STATUS_PARAM )
2430{
2431 flag aSign;
94a49d86 2432 int_fast16_t aExp, zExp;
bb98fe42
AF
2433 uint32_t aSig, zSig;
2434 uint64_t rem, term;
37d18660 2435 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2436
2437 aSig = extractFloat32Frac( a );
2438 aExp = extractFloat32Exp( a );
2439 aSign = extractFloat32Sign( a );
2440 if ( aExp == 0xFF ) {
f090c9d4 2441 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
158142c2
FB
2442 if ( ! aSign ) return a;
2443 float_raise( float_flag_invalid STATUS_VAR);
2444 return float32_default_nan;
2445 }
2446 if ( aSign ) {
2447 if ( ( aExp | aSig ) == 0 ) return a;
2448 float_raise( float_flag_invalid STATUS_VAR);
2449 return float32_default_nan;
2450 }
2451 if ( aExp == 0 ) {
f090c9d4 2452 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2453 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2454 }
2455 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2456 aSig = ( aSig | 0x00800000 )<<8;
2457 zSig = estimateSqrt32( aExp, aSig ) + 2;
2458 if ( ( zSig & 0x7F ) <= 5 ) {
2459 if ( zSig < 2 ) {
2460 zSig = 0x7FFFFFFF;
2461 goto roundAndPack;
2462 }
2463 aSig >>= aExp & 1;
bb98fe42
AF
2464 term = ( (uint64_t) zSig ) * zSig;
2465 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2466 while ( (int64_t) rem < 0 ) {
158142c2 2467 --zSig;
bb98fe42 2468 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2469 }
2470 zSig |= ( rem != 0 );
2471 }
2472 shift32RightJamming( zSig, 1, &zSig );
2473 roundAndPack:
2474 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2475
2476}
2477
8229c991
AJ
2478/*----------------------------------------------------------------------------
2479| Returns the binary exponential of the single-precision floating-point value
2480| `a'. The operation is performed according to the IEC/IEEE Standard for
2481| Binary Floating-Point Arithmetic.
2482|
2483| Uses the following identities:
2484|
2485| 1. -------------------------------------------------------------------------
2486| x x*ln(2)
2487| 2 = e
2488|
2489| 2. -------------------------------------------------------------------------
2490| 2 3 4 5 n
2491| x x x x x x x
2492| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2493| 1! 2! 3! 4! 5! n!
2494*----------------------------------------------------------------------------*/
2495
2496static const float64 float32_exp2_coefficients[15] =
2497{
d5138cf4
PM
2498 const_float64( 0x3ff0000000000000ll ), /* 1 */
2499 const_float64( 0x3fe0000000000000ll ), /* 2 */
2500 const_float64( 0x3fc5555555555555ll ), /* 3 */
2501 const_float64( 0x3fa5555555555555ll ), /* 4 */
2502 const_float64( 0x3f81111111111111ll ), /* 5 */
2503 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2504 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2505 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2506 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2507 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2508 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2509 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2510 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2511 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2512 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2513};
2514
2515float32 float32_exp2( float32 a STATUS_PARAM )
2516{
2517 flag aSign;
94a49d86 2518 int_fast16_t aExp;
bb98fe42 2519 uint32_t aSig;
8229c991
AJ
2520 float64 r, x, xn;
2521 int i;
37d18660 2522 a = float32_squash_input_denormal(a STATUS_VAR);
8229c991
AJ
2523
2524 aSig = extractFloat32Frac( a );
2525 aExp = extractFloat32Exp( a );
2526 aSign = extractFloat32Sign( a );
2527
2528 if ( aExp == 0xFF) {
2529 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2530 return (aSign) ? float32_zero : a;
2531 }
2532 if (aExp == 0) {
2533 if (aSig == 0) return float32_one;
2534 }
2535
2536 float_raise( float_flag_inexact STATUS_VAR);
2537
2538 /* ******************************* */
2539 /* using float64 for approximation */
2540 /* ******************************* */
2541 x = float32_to_float64(a STATUS_VAR);
2542 x = float64_mul(x, float64_ln2 STATUS_VAR);
2543
2544 xn = x;
2545 r = float64_one;
2546 for (i = 0 ; i < 15 ; i++) {
2547 float64 f;
2548
2549 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2550 r = float64_add(r, f STATUS_VAR);
2551
2552 xn = float64_mul(xn, x STATUS_VAR);
2553 }
2554
2555 return float64_to_float32(r, status);
2556}
2557
374dfc33
AJ
2558/*----------------------------------------------------------------------------
2559| Returns the binary log of the single-precision floating-point value `a'.
2560| The operation is performed according to the IEC/IEEE Standard for Binary
2561| Floating-Point Arithmetic.
2562*----------------------------------------------------------------------------*/
2563float32 float32_log2( float32 a STATUS_PARAM )
2564{
2565 flag aSign, zSign;
94a49d86 2566 int_fast16_t aExp;
bb98fe42 2567 uint32_t aSig, zSig, i;
374dfc33 2568
37d18660 2569 a = float32_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
2570 aSig = extractFloat32Frac( a );
2571 aExp = extractFloat32Exp( a );
2572 aSign = extractFloat32Sign( a );
2573
2574 if ( aExp == 0 ) {
2575 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2576 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2577 }
2578 if ( aSign ) {
2579 float_raise( float_flag_invalid STATUS_VAR);
2580 return float32_default_nan;
2581 }
2582 if ( aExp == 0xFF ) {
2583 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2584 return a;
2585 }
2586
2587 aExp -= 0x7F;
2588 aSig |= 0x00800000;
2589 zSign = aExp < 0;
2590 zSig = aExp << 23;
2591
2592 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2593 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2594 if ( aSig & 0x01000000 ) {
2595 aSig >>= 1;
2596 zSig |= i;
2597 }
2598 }
2599
2600 if ( zSign )
2601 zSig = -zSig;
2602
2603 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2604}
2605
158142c2
FB
2606/*----------------------------------------------------------------------------
2607| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2608| the corresponding value `b', and 0 otherwise. The invalid exception is
2609| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2610| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2611*----------------------------------------------------------------------------*/
2612
b689362d 2613int float32_eq( float32 a, float32 b STATUS_PARAM )
158142c2 2614{
b689362d 2615 uint32_t av, bv;
37d18660
PM
2616 a = float32_squash_input_denormal(a STATUS_VAR);
2617 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2618
2619 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2620 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2621 ) {
b689362d 2622 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
2623 return 0;
2624 }
b689362d
AJ
2625 av = float32_val(a);
2626 bv = float32_val(b);
2627 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2628}
2629
2630/*----------------------------------------------------------------------------
2631| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2632| or equal to the corresponding value `b', and 0 otherwise. The invalid
2633| exception is raised if either operand is a NaN. The comparison is performed
2634| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2635*----------------------------------------------------------------------------*/
2636
750afe93 2637int float32_le( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2638{
2639 flag aSign, bSign;
bb98fe42 2640 uint32_t av, bv;
37d18660
PM
2641 a = float32_squash_input_denormal(a STATUS_VAR);
2642 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2643
2644 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2645 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2646 ) {
2647 float_raise( float_flag_invalid STATUS_VAR);
2648 return 0;
2649 }
2650 aSign = extractFloat32Sign( a );
2651 bSign = extractFloat32Sign( b );
f090c9d4
PB
2652 av = float32_val(a);
2653 bv = float32_val(b);
bb98fe42 2654 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2655 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2656
2657}
2658
2659/*----------------------------------------------------------------------------
2660| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2661| the corresponding value `b', and 0 otherwise. The invalid exception is
2662| raised if either operand is a NaN. The comparison is performed according
2663| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2664*----------------------------------------------------------------------------*/
2665
750afe93 2666int float32_lt( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2667{
2668 flag aSign, bSign;
bb98fe42 2669 uint32_t av, bv;
37d18660
PM
2670 a = float32_squash_input_denormal(a STATUS_VAR);
2671 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2672
2673 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2674 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2675 ) {
2676 float_raise( float_flag_invalid STATUS_VAR);
2677 return 0;
2678 }
2679 aSign = extractFloat32Sign( a );
2680 bSign = extractFloat32Sign( b );
f090c9d4
PB
2681 av = float32_val(a);
2682 bv = float32_val(b);
bb98fe42 2683 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2684 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2685
2686}
2687
67b7861d
AJ
2688/*----------------------------------------------------------------------------
2689| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2690| be compared, and 0 otherwise. The invalid exception is raised if either
2691| operand is a NaN. The comparison is performed according to the IEC/IEEE
2692| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2693*----------------------------------------------------------------------------*/
2694
2695int float32_unordered( float32 a, float32 b STATUS_PARAM )
2696{
2697 a = float32_squash_input_denormal(a STATUS_VAR);
2698 b = float32_squash_input_denormal(b STATUS_VAR);
2699
2700 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2701 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2702 ) {
2703 float_raise( float_flag_invalid STATUS_VAR);
2704 return 1;
2705 }
2706 return 0;
2707}
b689362d 2708
158142c2
FB
2709/*----------------------------------------------------------------------------
2710| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2711| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2712| exception. The comparison is performed according to the IEC/IEEE Standard
2713| for Binary Floating-Point Arithmetic.
158142c2
FB
2714*----------------------------------------------------------------------------*/
2715
b689362d 2716int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
158142c2 2717{
37d18660
PM
2718 a = float32_squash_input_denormal(a STATUS_VAR);
2719 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2720
2721 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2722 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2723 ) {
b689362d
AJ
2724 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2725 float_raise( float_flag_invalid STATUS_VAR);
2726 }
158142c2
FB
2727 return 0;
2728 }
b689362d
AJ
2729 return ( float32_val(a) == float32_val(b) ) ||
2730 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2731}
2732
2733/*----------------------------------------------------------------------------
2734| Returns 1 if the single-precision floating-point value `a' is less than or
2735| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2736| cause an exception. Otherwise, the comparison is performed according to the
2737| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2738*----------------------------------------------------------------------------*/
2739
750afe93 2740int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2741{
2742 flag aSign, bSign;
bb98fe42 2743 uint32_t av, bv;
37d18660
PM
2744 a = float32_squash_input_denormal(a STATUS_VAR);
2745 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2746
2747 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2748 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2749 ) {
2750 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2751 float_raise( float_flag_invalid STATUS_VAR);
2752 }
2753 return 0;
2754 }
2755 aSign = extractFloat32Sign( a );
2756 bSign = extractFloat32Sign( b );
f090c9d4
PB
2757 av = float32_val(a);
2758 bv = float32_val(b);
bb98fe42 2759 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2760 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2761
2762}
2763
2764/*----------------------------------------------------------------------------
2765| Returns 1 if the single-precision floating-point value `a' is less than
2766| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2767| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2768| Standard for Binary Floating-Point Arithmetic.
2769*----------------------------------------------------------------------------*/
2770
750afe93 2771int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2772{
2773 flag aSign, bSign;
bb98fe42 2774 uint32_t av, bv;
37d18660
PM
2775 a = float32_squash_input_denormal(a STATUS_VAR);
2776 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2777
2778 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2779 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2780 ) {
2781 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2782 float_raise( float_flag_invalid STATUS_VAR);
2783 }
2784 return 0;
2785 }
2786 aSign = extractFloat32Sign( a );
2787 bSign = extractFloat32Sign( b );
f090c9d4
PB
2788 av = float32_val(a);
2789 bv = float32_val(b);
bb98fe42 2790 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2791 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2792
2793}
2794
67b7861d
AJ
2795/*----------------------------------------------------------------------------
2796| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2797| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2798| comparison is performed according to the IEC/IEEE Standard for Binary
2799| Floating-Point Arithmetic.
2800*----------------------------------------------------------------------------*/
2801
2802int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2803{
2804 a = float32_squash_input_denormal(a STATUS_VAR);
2805 b = float32_squash_input_denormal(b STATUS_VAR);
2806
2807 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2808 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2809 ) {
2810 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2811 float_raise( float_flag_invalid STATUS_VAR);
2812 }
2813 return 1;
2814 }
2815 return 0;
2816}
2817
158142c2
FB
2818/*----------------------------------------------------------------------------
2819| Returns the result of converting the double-precision floating-point value
2820| `a' to the 32-bit two's complement integer format. The conversion is
2821| performed according to the IEC/IEEE Standard for Binary Floating-Point
2822| Arithmetic---which means in particular that the conversion is rounded
2823| according to the current rounding mode. If `a' is a NaN, the largest
2824| positive integer is returned. Otherwise, if the conversion overflows, the
2825| largest integer with the same sign as `a' is returned.
2826*----------------------------------------------------------------------------*/
2827
2828int32 float64_to_int32( float64 a STATUS_PARAM )
2829{
2830 flag aSign;
94a49d86 2831 int_fast16_t aExp, shiftCount;
bb98fe42 2832 uint64_t aSig;
37d18660 2833 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2834
2835 aSig = extractFloat64Frac( a );
2836 aExp = extractFloat64Exp( a );
2837 aSign = extractFloat64Sign( a );
2838 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2839 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2840 shiftCount = 0x42C - aExp;
2841 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2842 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2843
2844}
2845
2846/*----------------------------------------------------------------------------
2847| Returns the result of converting the double-precision floating-point value
2848| `a' to the 32-bit two's complement integer format. The conversion is
2849| performed according to the IEC/IEEE Standard for Binary Floating-Point
2850| Arithmetic, except that the conversion is always rounded toward zero.
2851| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2852| the conversion overflows, the largest integer with the same sign as `a' is
2853| returned.
2854*----------------------------------------------------------------------------*/
2855
2856int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2857{
2858 flag aSign;
94a49d86 2859 int_fast16_t aExp, shiftCount;
bb98fe42 2860 uint64_t aSig, savedASig;
b3a6a2e0 2861 int32_t z;
37d18660 2862 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2863
2864 aSig = extractFloat64Frac( a );
2865 aExp = extractFloat64Exp( a );
2866 aSign = extractFloat64Sign( a );
2867 if ( 0x41E < aExp ) {
2868 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2869 goto invalid;
2870 }
2871 else if ( aExp < 0x3FF ) {
2872 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2873 return 0;
2874 }
2875 aSig |= LIT64( 0x0010000000000000 );
2876 shiftCount = 0x433 - aExp;
2877 savedASig = aSig;
2878 aSig >>= shiftCount;
2879 z = aSig;
2880 if ( aSign ) z = - z;
2881 if ( ( z < 0 ) ^ aSign ) {
2882 invalid:
2883 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2884 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
2885 }
2886 if ( ( aSig<<shiftCount ) != savedASig ) {
2887 STATUS(float_exception_flags) |= float_flag_inexact;
2888 }
2889 return z;
2890
2891}
2892
cbcef455
PM
2893/*----------------------------------------------------------------------------
2894| Returns the result of converting the double-precision floating-point value
2895| `a' to the 16-bit two's complement integer format. The conversion is
2896| performed according to the IEC/IEEE Standard for Binary Floating-Point
2897| Arithmetic, except that the conversion is always rounded toward zero.
2898| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2899| the conversion overflows, the largest integer with the same sign as `a' is
2900| returned.
2901*----------------------------------------------------------------------------*/
2902
94a49d86 2903int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
2904{
2905 flag aSign;
94a49d86 2906 int_fast16_t aExp, shiftCount;
bb98fe42 2907 uint64_t aSig, savedASig;
cbcef455
PM
2908 int32 z;
2909
2910 aSig = extractFloat64Frac( a );
2911 aExp = extractFloat64Exp( a );
2912 aSign = extractFloat64Sign( a );
2913 if ( 0x40E < aExp ) {
2914 if ( ( aExp == 0x7FF ) && aSig ) {
2915 aSign = 0;
2916 }
2917 goto invalid;
2918 }
2919 else if ( aExp < 0x3FF ) {
2920 if ( aExp || aSig ) {
2921 STATUS(float_exception_flags) |= float_flag_inexact;
2922 }
2923 return 0;
2924 }
2925 aSig |= LIT64( 0x0010000000000000 );
2926 shiftCount = 0x433 - aExp;
2927 savedASig = aSig;
2928 aSig >>= shiftCount;
2929 z = aSig;
2930 if ( aSign ) {
2931 z = - z;
2932 }
2933 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2934 invalid:
2935 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2936 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
2937 }
2938 if ( ( aSig<<shiftCount ) != savedASig ) {
2939 STATUS(float_exception_flags) |= float_flag_inexact;
2940 }
2941 return z;
2942}
2943
158142c2
FB
2944/*----------------------------------------------------------------------------
2945| Returns the result of converting the double-precision floating-point value
2946| `a' to the 64-bit two's complement integer format. The conversion is
2947| performed according to the IEC/IEEE Standard for Binary Floating-Point
2948| Arithmetic---which means in particular that the conversion is rounded
2949| according to the current rounding mode. If `a' is a NaN, the largest
2950| positive integer is returned. Otherwise, if the conversion overflows, the
2951| largest integer with the same sign as `a' is returned.
2952*----------------------------------------------------------------------------*/
2953
2954int64 float64_to_int64( float64 a STATUS_PARAM )
2955{
2956 flag aSign;
94a49d86 2957 int_fast16_t aExp, shiftCount;
bb98fe42 2958 uint64_t aSig, aSigExtra;
37d18660 2959 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2960
2961 aSig = extractFloat64Frac( a );
2962 aExp = extractFloat64Exp( a );
2963 aSign = extractFloat64Sign( a );
2964 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2965 shiftCount = 0x433 - aExp;
2966 if ( shiftCount <= 0 ) {
2967 if ( 0x43E < aExp ) {
2968 float_raise( float_flag_invalid STATUS_VAR);
2969 if ( ! aSign
2970 || ( ( aExp == 0x7FF )
2971 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2972 ) {
2973 return LIT64( 0x7FFFFFFFFFFFFFFF );
2974 }
bb98fe42 2975 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2976 }
2977 aSigExtra = 0;
2978 aSig <<= - shiftCount;
2979 }
2980 else {
2981 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2982 }
2983 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2984
2985}
2986
2987/*----------------------------------------------------------------------------
2988| Returns the result of converting the double-precision floating-point value
2989| `a' to the 64-bit two's complement integer format. The conversion is
2990| performed according to the IEC/IEEE Standard for Binary Floating-Point
2991| Arithmetic, except that the conversion is always rounded toward zero.
2992| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2993| the conversion overflows, the largest integer with the same sign as `a' is
2994| returned.
2995*----------------------------------------------------------------------------*/
2996
2997int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2998{
2999 flag aSign;
94a49d86 3000 int_fast16_t aExp, shiftCount;
bb98fe42 3001 uint64_t aSig;
158142c2 3002 int64 z;
37d18660 3003 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3004
3005 aSig = extractFloat64Frac( a );
3006 aExp = extractFloat64Exp( a );
3007 aSign = extractFloat64Sign( a );
3008 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3009 shiftCount = aExp - 0x433;
3010 if ( 0 <= shiftCount ) {
3011 if ( 0x43E <= aExp ) {
f090c9d4 3012 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
158142c2
FB
3013 float_raise( float_flag_invalid STATUS_VAR);
3014 if ( ! aSign
3015 || ( ( aExp == 0x7FF )
3016 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3017 ) {
3018 return LIT64( 0x7FFFFFFFFFFFFFFF );
3019 }
3020 }
bb98fe42 3021 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3022 }
3023 z = aSig<<shiftCount;
3024 }
3025 else {
3026 if ( aExp < 0x3FE ) {
3027 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3028 return 0;
3029 }
3030 z = aSig>>( - shiftCount );
bb98fe42 3031 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
3032 STATUS(float_exception_flags) |= float_flag_inexact;
3033 }
3034 }
3035 if ( aSign ) z = - z;
3036 return z;
3037
3038}
3039
3040/*----------------------------------------------------------------------------
3041| Returns the result of converting the double-precision floating-point value
3042| `a' to the single-precision floating-point format. The conversion is
3043| performed according to the IEC/IEEE Standard for Binary Floating-Point
3044| Arithmetic.
3045*----------------------------------------------------------------------------*/
3046
3047float32 float64_to_float32( float64 a STATUS_PARAM )
3048{
3049 flag aSign;
94a49d86 3050 int_fast16_t aExp;
bb98fe42
AF
3051 uint64_t aSig;
3052 uint32_t zSig;
37d18660 3053 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3054
3055 aSig = extractFloat64Frac( a );
3056 aExp = extractFloat64Exp( a );
3057 aSign = extractFloat64Sign( a );
3058 if ( aExp == 0x7FF ) {
bcd4d9af 3059 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3060 return packFloat32( aSign, 0xFF, 0 );
3061 }
3062 shift64RightJamming( aSig, 22, &aSig );
3063 zSig = aSig;
3064 if ( aExp || zSig ) {
3065 zSig |= 0x40000000;
3066 aExp -= 0x381;
3067 }
3068 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3069
3070}
3071
60011498
PB
3072
3073/*----------------------------------------------------------------------------
3074| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3075| half-precision floating-point value, returning the result. After being
3076| shifted into the proper positions, the three fields are simply added
3077| together to form the result. This means that any integer portion of `zSig'
3078| will be added into the exponent. Since a properly normalized significand
3079| will have an integer portion equal to 1, the `zExp' input should be 1 less
3080| than the desired result exponent whenever `zSig' is a complete, normalized
3081| significand.
3082*----------------------------------------------------------------------------*/
94a49d86 3083static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
60011498 3084{
bb4d4bb3 3085 return make_float16(
bb98fe42 3086 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3087}
3088
c4a1c5e7
PM
3089/*----------------------------------------------------------------------------
3090| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3091| and significand `zSig', and returns the proper half-precision floating-
3092| point value corresponding to the abstract input. Ordinarily, the abstract
3093| value is simply rounded and packed into the half-precision format, with
3094| the inexact exception raised if the abstract input cannot be represented
3095| exactly. However, if the abstract value is too large, the overflow and
3096| inexact exceptions are raised and an infinity or maximal finite value is
3097| returned. If the abstract value is too small, the input value is rounded to
3098| a subnormal number, and the underflow and inexact exceptions are raised if
3099| the abstract input cannot be represented exactly as a subnormal half-
3100| precision floating-point number.
3101| The `ieee' flag indicates whether to use IEEE standard half precision, or
3102| ARM-style "alternative representation", which omits the NaN and Inf
3103| encodings in order to raise the maximum representable exponent by one.
3104| The input significand `zSig' has its binary point between bits 22
3105| and 23, which is 13 bits to the left of the usual location. This shifted
3106| significand must be normalized or smaller. If `zSig' is not normalized,
3107| `zExp' must be 0; in that case, the result returned is a subnormal number,
3108| and it must not require rounding. In the usual case that `zSig' is
3109| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3110| Note the slightly odd position of the binary point in zSig compared with the
3111| other roundAndPackFloat functions. This should probably be fixed if we
3112| need to implement more float16 routines than just conversion.
3113| The handling of underflow and overflow follows the IEC/IEEE Standard for
3114| Binary Floating-Point Arithmetic.
3115*----------------------------------------------------------------------------*/
3116
3117static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3118 uint32_t zSig, flag ieee STATUS_PARAM)
3119{
3120 int maxexp = ieee ? 29 : 30;
3121 uint32_t mask;
3122 uint32_t increment;
3123 int8 roundingMode;
3124 bool rounding_bumps_exp;
3125 bool is_tiny = false;
3126
3127 /* Calculate the mask of bits of the mantissa which are not
3128 * representable in half-precision and will be lost.
3129 */
3130 if (zExp < 1) {
3131 /* Will be denormal in halfprec */
3132 mask = 0x00ffffff;
3133 if (zExp >= -11) {
3134 mask >>= 11 + zExp;
3135 }
3136 } else {
3137 /* Normal number in halfprec */
3138 mask = 0x00001fff;
3139 }
3140
3141 roundingMode = STATUS(float_rounding_mode);
3142 switch (roundingMode) {
3143 case float_round_nearest_even:
3144 increment = (mask + 1) >> 1;
3145 if ((zSig & mask) == increment) {
3146 increment = zSig & (increment << 1);
3147 }
3148 break;
3149 case float_round_up:
3150 increment = zSign ? 0 : mask;
3151 break;
3152 case float_round_down:
3153 increment = zSign ? mask : 0;
3154 break;
3155 default: /* round_to_zero */
3156 increment = 0;
3157 break;
3158 }
3159
3160 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3161
3162 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3163 if (ieee) {
3164 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3165 return packFloat16(zSign, 0x1f, 0);
3166 } else {
3167 float_raise(float_flag_invalid STATUS_VAR);
3168 return packFloat16(zSign, 0x1f, 0x3ff);
3169 }
3170 }
3171
3172 if (zExp < 0) {
3173 /* Note that flush-to-zero does not affect half-precision results */
3174 is_tiny =
3175 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3176 || (zExp < -1)
3177 || (!rounding_bumps_exp);
3178 }
3179 if (zSig & mask) {
3180 float_raise(float_flag_inexact STATUS_VAR);
3181 if (is_tiny) {
3182 float_raise(float_flag_underflow STATUS_VAR);
3183 }
3184 }
3185
3186 zSig += increment;
3187 if (rounding_bumps_exp) {
3188 zSig >>= 1;
3189 zExp++;
3190 }
3191
3192 if (zExp < -10) {
3193 return packFloat16(zSign, 0, 0);
3194 }
3195 if (zExp < 0) {
3196 zSig >>= -zExp;
3197 zExp = 0;
3198 }
3199 return packFloat16(zSign, zExp, zSig >> 13);
3200}
3201
3202static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
3203 uint32_t *zSigPtr)
3204{
3205 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3206 *zSigPtr = aSig << shiftCount;
3207 *zExpPtr = 1 - shiftCount;
3208}
3209
60011498
PB
3210/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3211 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3
PM
3212
3213float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
60011498
PB
3214{
3215 flag aSign;
94a49d86 3216 int_fast16_t aExp;
bb98fe42 3217 uint32_t aSig;
60011498 3218
bb4d4bb3
PM
3219 aSign = extractFloat16Sign(a);
3220 aExp = extractFloat16Exp(a);
3221 aSig = extractFloat16Frac(a);
60011498
PB
3222
3223 if (aExp == 0x1f && ieee) {
3224 if (aSig) {
f591e1be 3225 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3226 }
4be8eeac 3227 return packFloat32(aSign, 0xff, 0);
60011498
PB
3228 }
3229 if (aExp == 0) {
60011498
PB
3230 if (aSig == 0) {
3231 return packFloat32(aSign, 0, 0);
3232 }
3233
c4a1c5e7
PM
3234 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3235 aExp--;
60011498
PB
3236 }
3237 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3238}
3239
bb4d4bb3 3240float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
60011498
PB
3241{
3242 flag aSign;
94a49d86 3243 int_fast16_t aExp;
bb98fe42 3244 uint32_t aSig;
38970efa 3245
37d18660 3246 a = float32_squash_input_denormal(a STATUS_VAR);
60011498
PB
3247
3248 aSig = extractFloat32Frac( a );
3249 aExp = extractFloat32Exp( a );
3250 aSign = extractFloat32Sign( a );
3251 if ( aExp == 0xFF ) {
3252 if (aSig) {
600e30d2 3253 /* Input is a NaN */
600e30d2 3254 if (!ieee) {
38970efa 3255 float_raise(float_flag_invalid STATUS_VAR);
600e30d2
PM
3256 return packFloat16(aSign, 0, 0);
3257 }
38970efa
PM
3258 return commonNaNToFloat16(
3259 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3260 }
600e30d2
PM
3261 /* Infinity */
3262 if (!ieee) {
3263 float_raise(float_flag_invalid STATUS_VAR);
3264 return packFloat16(aSign, 0x1f, 0x3ff);
3265 }
3266 return packFloat16(aSign, 0x1f, 0);
60011498 3267 }
600e30d2 3268 if (aExp == 0 && aSig == 0) {
60011498
PB
3269 return packFloat16(aSign, 0, 0);
3270 }
38970efa
PM
3271 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3272 * even if the input is denormal; however this is harmless because
3273 * the largest possible single-precision denormal is still smaller
3274 * than the smallest representable half-precision denormal, and so we
3275 * will end up ignoring aSig and returning via the "always return zero"
3276 * codepath.
3277 */
60011498 3278 aSig |= 0x00800000;
c4a1c5e7 3279 aExp -= 0x71;
60011498 3280
c4a1c5e7 3281 return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
60011498
PB
3282}
3283
158142c2
FB
3284/*----------------------------------------------------------------------------
3285| Returns the result of converting the double-precision floating-point value
3286| `a' to the extended double-precision floating-point format. The conversion
3287| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3288| Arithmetic.
3289*----------------------------------------------------------------------------*/
3290
3291floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3292{
3293 flag aSign;
94a49d86 3294 int_fast16_t aExp;
bb98fe42 3295 uint64_t aSig;
158142c2 3296
37d18660 3297 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3298 aSig = extractFloat64Frac( a );
3299 aExp = extractFloat64Exp( a );
3300 aSign = extractFloat64Sign( a );
3301 if ( aExp == 0x7FF ) {
bcd4d9af 3302 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3303 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3304 }
3305 if ( aExp == 0 ) {
3306 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3307 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3308 }
3309 return
3310 packFloatx80(
3311 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3312
3313}
3314
158142c2
FB
3315/*----------------------------------------------------------------------------
3316| Returns the result of converting the double-precision floating-point value
3317| `a' to the quadruple-precision floating-point format. The conversion is
3318| performed according to the IEC/IEEE Standard for Binary Floating-Point
3319| Arithmetic.
3320*----------------------------------------------------------------------------*/
3321
3322float128 float64_to_float128( float64 a STATUS_PARAM )
3323{
3324 flag aSign;
94a49d86 3325 int_fast16_t aExp;
bb98fe42 3326 uint64_t aSig, zSig0, zSig1;
158142c2 3327
37d18660 3328 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3329 aSig = extractFloat64Frac( a );
3330 aExp = extractFloat64Exp( a );
3331 aSign = extractFloat64Sign( a );
3332 if ( aExp == 0x7FF ) {
bcd4d9af 3333 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3334 return packFloat128( aSign, 0x7FFF, 0, 0 );
3335 }
3336 if ( aExp == 0 ) {
3337 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3338 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3339 --aExp;
3340 }
3341 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3342 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3343
3344}
3345
158142c2
FB
3346/*----------------------------------------------------------------------------
3347| Rounds the double-precision floating-point value `a' to an integer, and
3348| returns the result as a double-precision floating-point value. The
3349| operation is performed according to the IEC/IEEE Standard for Binary
3350| Floating-Point Arithmetic.
3351*----------------------------------------------------------------------------*/
3352
3353float64 float64_round_to_int( float64 a STATUS_PARAM )
3354{
3355 flag aSign;
94a49d86 3356 int_fast16_t aExp;
bb98fe42 3357 uint64_t lastBitMask, roundBitsMask;
158142c2 3358 int8 roundingMode;
bb98fe42 3359 uint64_t z;
37d18660 3360 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3361
3362 aExp = extractFloat64Exp( a );
3363 if ( 0x433 <= aExp ) {
3364 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3365 return propagateFloat64NaN( a, a STATUS_VAR );
3366 }
3367 return a;
3368 }
3369 if ( aExp < 0x3FF ) {
bb98fe42 3370 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
158142c2
FB
3371 STATUS(float_exception_flags) |= float_flag_inexact;
3372 aSign = extractFloat64Sign( a );
3373 switch ( STATUS(float_rounding_mode) ) {
3374 case float_round_nearest_even:
3375 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3376 return packFloat64( aSign, 0x3FF, 0 );
3377 }
3378 break;
3379 case float_round_down:
f090c9d4 3380 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3381 case float_round_up:
f090c9d4
PB
3382 return make_float64(
3383 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3384 }
3385 return packFloat64( aSign, 0, 0 );
3386 }
3387 lastBitMask = 1;
3388 lastBitMask <<= 0x433 - aExp;
3389 roundBitsMask = lastBitMask - 1;
f090c9d4 3390 z = float64_val(a);
158142c2
FB
3391 roundingMode = STATUS(float_rounding_mode);
3392 if ( roundingMode == float_round_nearest_even ) {
3393 z += lastBitMask>>1;
3394 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3395 }
3396 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 3397 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
3398 z += roundBitsMask;
3399 }
3400 }
3401 z &= ~ roundBitsMask;
f090c9d4
PB
3402 if ( z != float64_val(a) )
3403 STATUS(float_exception_flags) |= float_flag_inexact;
3404 return make_float64(z);
158142c2
FB
3405
3406}
3407
e6e5906b
PB
3408float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3409{
3410 int oldmode;
3411 float64 res;
3412 oldmode = STATUS(float_rounding_mode);
3413 STATUS(float_rounding_mode) = float_round_to_zero;
3414 res = float64_round_to_int(a STATUS_VAR);
3415 STATUS(float_rounding_mode) = oldmode;
3416 return res;
3417}
3418
158142c2
FB
3419/*----------------------------------------------------------------------------
3420| Returns the result of adding the absolute values of the double-precision
3421| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3422| before being returned. `zSign' is ignored if the result is a NaN.
3423| The addition is performed according to the IEC/IEEE Standard for Binary
3424| Floating-Point Arithmetic.
3425*----------------------------------------------------------------------------*/
3426
3427static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3428{
94a49d86 3429 int_fast16_t aExp, bExp, zExp;
bb98fe42 3430 uint64_t aSig, bSig, zSig;
94a49d86 3431 int_fast16_t expDiff;
158142c2
FB
3432
3433 aSig = extractFloat64Frac( a );
3434 aExp = extractFloat64Exp( a );
3435 bSig = extractFloat64Frac( b );
3436 bExp = extractFloat64Exp( b );
3437 expDiff = aExp - bExp;
3438 aSig <<= 9;
3439 bSig <<= 9;
3440 if ( 0 < expDiff ) {
3441 if ( aExp == 0x7FF ) {
3442 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3443 return a;
3444 }
3445 if ( bExp == 0 ) {
3446 --expDiff;
3447 }
3448 else {
3449 bSig |= LIT64( 0x2000000000000000 );
3450 }
3451 shift64RightJamming( bSig, expDiff, &bSig );
3452 zExp = aExp;
3453 }
3454 else if ( expDiff < 0 ) {
3455 if ( bExp == 0x7FF ) {
3456 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3457 return packFloat64( zSign, 0x7FF, 0 );
3458 }
3459 if ( aExp == 0 ) {
3460 ++expDiff;
3461 }
3462 else {
3463 aSig |= LIT64( 0x2000000000000000 );
3464 }
3465 shift64RightJamming( aSig, - expDiff, &aSig );
3466 zExp = bExp;
3467 }
3468 else {
3469 if ( aExp == 0x7FF ) {
3470 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3471 return a;
3472 }
fe76d976 3473 if ( aExp == 0 ) {
e6afc87f
PM
3474 if (STATUS(flush_to_zero)) {
3475 if (aSig | bSig) {
3476 float_raise(float_flag_output_denormal STATUS_VAR);
3477 }
3478 return packFloat64(zSign, 0, 0);
3479 }
fe76d976
PB
3480 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3481 }
158142c2
FB
3482 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3483 zExp = aExp;
3484 goto roundAndPack;
3485 }
3486 aSig |= LIT64( 0x2000000000000000 );
3487 zSig = ( aSig + bSig )<<1;
3488 --zExp;
bb98fe42 3489 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3490 zSig = aSig + bSig;
3491 ++zExp;
3492 }
3493 roundAndPack:
3494 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3495
3496}
3497
3498/*----------------------------------------------------------------------------
3499| Returns the result of subtracting the absolute values of the double-
3500| precision floating-point values `a' and `b'. If `zSign' is 1, the
3501| difference is negated before being returned. `zSign' is ignored if the
3502| result is a NaN. The subtraction is performed according to the IEC/IEEE
3503| Standard for Binary Floating-Point Arithmetic.
3504*----------------------------------------------------------------------------*/
3505
3506static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3507{
94a49d86 3508 int_fast16_t aExp, bExp, zExp;
bb98fe42 3509 uint64_t aSig, bSig, zSig;
94a49d86 3510 int_fast16_t expDiff;
158142c2
FB
3511
3512 aSig = extractFloat64Frac( a );
3513 aExp = extractFloat64Exp( a );
3514 bSig = extractFloat64Frac( b );
3515 bExp = extractFloat64Exp( b );
3516 expDiff = aExp - bExp;
3517 aSig <<= 10;
3518 bSig <<= 10;
3519 if ( 0 < expDiff ) goto aExpBigger;
3520 if ( expDiff < 0 ) goto bExpBigger;
3521 if ( aExp == 0x7FF ) {
3522 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3523 float_raise( float_flag_invalid STATUS_VAR);
3524 return float64_default_nan;
3525 }
3526 if ( aExp == 0 ) {
3527 aExp = 1;
3528 bExp = 1;
3529 }
3530 if ( bSig < aSig ) goto aBigger;
3531 if ( aSig < bSig ) goto bBigger;
3532 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3533 bExpBigger:
3534 if ( bExp == 0x7FF ) {
3535 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3536 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3537 }
3538 if ( aExp == 0 ) {
3539 ++expDiff;
3540 }
3541 else {
3542 aSig |= LIT64( 0x4000000000000000 );
3543 }
3544 shift64RightJamming( aSig, - expDiff, &aSig );
3545 bSig |= LIT64( 0x4000000000000000 );
3546 bBigger:
3547 zSig = bSig - aSig;
3548 zExp = bExp;
3549 zSign ^= 1;
3550 goto normalizeRoundAndPack;
3551 aExpBigger:
3552 if ( aExp == 0x7FF ) {
3553 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3554 return a;
3555 }
3556 if ( bExp == 0 ) {
3557 --expDiff;
3558 }
3559 else {
3560 bSig |= LIT64( 0x4000000000000000 );
3561 }
3562 shift64RightJamming( bSig, expDiff, &bSig );
3563 aSig |= LIT64( 0x4000000000000000 );
3564 aBigger:
3565 zSig = aSig - bSig;
3566 zExp = aExp;
3567 normalizeRoundAndPack:
3568 --zExp;
3569 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3570
3571}
3572
3573/*----------------------------------------------------------------------------
3574| Returns the result of adding the double-precision floating-point values `a'
3575| and `b'. The operation is performed according to the IEC/IEEE Standard for
3576| Binary Floating-Point Arithmetic.
3577*----------------------------------------------------------------------------*/
3578
3579float64 float64_add( float64 a, float64 b STATUS_PARAM )
3580{
3581 flag aSign, bSign;
37d18660
PM
3582 a = float64_squash_input_denormal(a STATUS_VAR);
3583 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3584
3585 aSign = extractFloat64Sign( a );
3586 bSign = extractFloat64Sign( b );
3587 if ( aSign == bSign ) {
3588 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3589 }
3590 else {
3591 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3592 }
3593
3594}
3595
3596/*----------------------------------------------------------------------------
3597| Returns the result of subtracting the double-precision floating-point values
3598| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3599| for Binary Floating-Point Arithmetic.
3600*----------------------------------------------------------------------------*/
3601
3602float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3603{
3604 flag aSign, bSign;
37d18660
PM
3605 a = float64_squash_input_denormal(a STATUS_VAR);
3606 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3607
3608 aSign = extractFloat64Sign( a );
3609 bSign = extractFloat64Sign( b );
3610 if ( aSign == bSign ) {
3611 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3612 }
3613 else {
3614 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3615 }
3616
3617}
3618
3619/*----------------------------------------------------------------------------
3620| Returns the result of multiplying the double-precision floating-point values
3621| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3622| for Binary Floating-Point Arithmetic.
3623*----------------------------------------------------------------------------*/
3624
3625float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3626{
3627 flag aSign, bSign, zSign;
94a49d86 3628 int_fast16_t aExp, bExp, zExp;
bb98fe42 3629 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3630
37d18660
PM
3631 a = float64_squash_input_denormal(a STATUS_VAR);
3632 b = float64_squash_input_denormal(b STATUS_VAR);
3633
158142c2
FB
3634 aSig = extractFloat64Frac( a );
3635 aExp = extractFloat64Exp( a );
3636 aSign = extractFloat64Sign( a );
3637 bSig = extractFloat64Frac( b );
3638 bExp = extractFloat64Exp( b );
3639 bSign = extractFloat64Sign( b );
3640 zSign = aSign ^ bSign;
3641 if ( aExp == 0x7FF ) {
3642 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3643 return propagateFloat64NaN( a, b STATUS_VAR );
3644 }
3645 if ( ( bExp | bSig ) == 0 ) {
3646 float_raise( float_flag_invalid STATUS_VAR);
3647 return float64_default_nan;
3648 }
3649 return packFloat64( zSign, 0x7FF, 0 );
3650 }
3651 if ( bExp == 0x7FF ) {
3652 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3653 if ( ( aExp | aSig ) == 0 ) {
3654 float_raise( float_flag_invalid STATUS_VAR);
3655 return float64_default_nan;
3656 }
3657 return packFloat64( zSign, 0x7FF, 0 );
3658 }
3659 if ( aExp == 0 ) {
3660 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3661 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3662 }
3663 if ( bExp == 0 ) {
3664 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3665 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3666 }
3667 zExp = aExp + bExp - 0x3FF;
3668 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3669 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3670 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3671 zSig0 |= ( zSig1 != 0 );
bb98fe42 3672 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
3673 zSig0 <<= 1;
3674 --zExp;
3675 }
3676 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3677
3678}
3679
3680/*----------------------------------------------------------------------------
3681| Returns the result of dividing the double-precision floating-point value `a'
3682| by the corresponding value `b'. The operation is performed according to
3683| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3684*----------------------------------------------------------------------------*/
3685
3686float64 float64_div( float64 a, float64 b STATUS_PARAM )
3687{
3688 flag aSign, bSign, zSign;
94a49d86 3689 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
3690 uint64_t aSig, bSig, zSig;
3691 uint64_t rem0, rem1;
3692 uint64_t term0, term1;
37d18660
PM
3693 a = float64_squash_input_denormal(a STATUS_VAR);
3694 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3695
3696 aSig = extractFloat64Frac( a );
3697 aExp = extractFloat64Exp( a );
3698 aSign = extractFloat64Sign( a );
3699 bSig = extractFloat64Frac( b );
3700 bExp = extractFloat64Exp( b );
3701 bSign = extractFloat64Sign( b );
3702 zSign = aSign ^ bSign;
3703 if ( aExp == 0x7FF ) {
3704 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3705 if ( bExp == 0x7FF ) {
3706 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3707 float_raise( float_flag_invalid STATUS_VAR);
3708 return float64_default_nan;
3709 }
3710 return packFloat64( zSign, 0x7FF, 0 );
3711 }
3712 if ( bExp == 0x7FF ) {
3713 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3714 return packFloat64( zSign, 0, 0 );
3715 }
3716 if ( bExp == 0 ) {
3717 if ( bSig == 0 ) {
3718 if ( ( aExp | aSig ) == 0 ) {
3719 float_raise( float_flag_invalid STATUS_VAR);
3720 return float64_default_nan;
3721 }
3722 float_raise( float_flag_divbyzero STATUS_VAR);
3723 return packFloat64( zSign, 0x7FF, 0 );
3724 }
3725 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3726 }
3727 if ( aExp == 0 ) {
3728 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3729 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3730 }
3731 zExp = aExp - bExp + 0x3FD;
3732 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3733 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3734 if ( bSig <= ( aSig + aSig ) ) {
3735 aSig >>= 1;
3736 ++zExp;
3737 }
3738 zSig = estimateDiv128To64( aSig, 0, bSig );
3739 if ( ( zSig & 0x1FF ) <= 2 ) {
3740 mul64To128( bSig, zSig, &term0, &term1 );
3741 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3742 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3743 --zSig;
3744 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3745 }
3746 zSig |= ( rem1 != 0 );
3747 }
3748 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3749
3750}
3751
3752/*----------------------------------------------------------------------------
3753| Returns the remainder of the double-precision floating-point value `a'
3754| with respect to the corresponding value `b'. The operation is performed
3755| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3756*----------------------------------------------------------------------------*/
3757
3758float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3759{
ed086f3d 3760 flag aSign, zSign;
94a49d86 3761 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
3762 uint64_t aSig, bSig;
3763 uint64_t q, alternateASig;
3764 int64_t sigMean;
158142c2 3765
37d18660
PM
3766 a = float64_squash_input_denormal(a STATUS_VAR);
3767 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3768 aSig = extractFloat64Frac( a );
3769 aExp = extractFloat64Exp( a );
3770 aSign = extractFloat64Sign( a );
3771 bSig = extractFloat64Frac( b );
3772 bExp = extractFloat64Exp( b );
158142c2
FB
3773 if ( aExp == 0x7FF ) {
3774 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3775 return propagateFloat64NaN( a, b STATUS_VAR );
3776 }
3777 float_raise( float_flag_invalid STATUS_VAR);
3778 return float64_default_nan;
3779 }
3780 if ( bExp == 0x7FF ) {
3781 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3782 return a;
3783 }
3784 if ( bExp == 0 ) {
3785 if ( bSig == 0 ) {
3786 float_raise( float_flag_invalid STATUS_VAR);
3787 return float64_default_nan;
3788 }
3789 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3790 }
3791 if ( aExp == 0 ) {
3792 if ( aSig == 0 ) return a;
3793 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3794 }
3795 expDiff = aExp - bExp;
3796 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3797 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3798 if ( expDiff < 0 ) {
3799 if ( expDiff < -1 ) return a;
3800 aSig >>= 1;
3801 }
3802 q = ( bSig <= aSig );
3803 if ( q ) aSig -= bSig;
3804 expDiff -= 64;
3805 while ( 0 < expDiff ) {
3806 q = estimateDiv128To64( aSig, 0, bSig );
3807 q = ( 2 < q ) ? q - 2 : 0;
3808 aSig = - ( ( bSig>>2 ) * q );
3809 expDiff -= 62;
3810 }
3811 expDiff += 64;
3812 if ( 0 < expDiff ) {
3813 q = estimateDiv128To64( aSig, 0, bSig );
3814 q = ( 2 < q ) ? q - 2 : 0;
3815 q >>= 64 - expDiff;
3816 bSig >>= 2;
3817 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3818 }
3819 else {
3820 aSig >>= 2;
3821 bSig >>= 2;
3822 }
3823 do {
3824 alternateASig = aSig;
3825 ++q;
3826 aSig -= bSig;
bb98fe42 3827 } while ( 0 <= (int64_t) aSig );
158142c2
FB
3828 sigMean = aSig + alternateASig;
3829 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3830 aSig = alternateASig;
3831 }
bb98fe42 3832 zSign = ( (int64_t) aSig < 0 );
158142c2
FB
3833 if ( zSign ) aSig = - aSig;
3834 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3835
3836}
3837
369be8f6
PM
3838/*----------------------------------------------------------------------------
3839| Returns the result of multiplying the double-precision floating-point values
3840| `a' and `b' then adding 'c', with no intermediate rounding step after the
3841| multiplication. The operation is performed according to the IEC/IEEE
3842| Standard for Binary Floating-Point Arithmetic 754-2008.
3843| The flags argument allows the caller to select negation of the
3844| addend, the intermediate product, or the final result. (The difference
3845| between this and having the caller do a separate negation is that negating
3846| externally will flip the sign bit on NaNs.)
3847*----------------------------------------------------------------------------*/
3848
3849float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3850{
3851 flag aSign, bSign, cSign, zSign;
94a49d86 3852 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
3853 uint64_t aSig, bSig, cSig;
3854 flag pInf, pZero, pSign;
3855 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3856 int shiftcount;
3857 flag signflip, infzero;
3858
3859 a = float64_squash_input_denormal(a STATUS_VAR);
3860 b = float64_squash_input_denormal(b STATUS_VAR);
3861 c = float64_squash_input_denormal(c STATUS_VAR);
3862 aSig = extractFloat64Frac(a);
3863 aExp = extractFloat64Exp(a);
3864 aSign = extractFloat64Sign(a);
3865 bSig = extractFloat64Frac(b);
3866 bExp = extractFloat64Exp(b);
3867 bSign = extractFloat64Sign(b);
3868 cSig = extractFloat64Frac(c);
3869 cExp = extractFloat64Exp(c);
3870 cSign = extractFloat64Sign(c);
3871
3872 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3873 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3874
3875 /* It is implementation-defined whether the cases of (0,inf,qnan)
3876 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3877 * they return if they do), so we have to hand this information
3878 * off to the target-specific pick-a-NaN routine.
3879 */
3880 if (((aExp == 0x7ff) && aSig) ||
3881 ((bExp == 0x7ff) && bSig) ||
3882 ((cExp == 0x7ff) && cSig)) {
3883 return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3884 }
3885
3886 if (infzero) {
3887 float_raise(float_flag_invalid STATUS_VAR);
3888 return float64_default_nan;
3889 }
3890
3891 if (flags & float_muladd_negate_c) {
3892 cSign ^= 1;
3893 }
3894
3895 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3896
3897 /* Work out the sign and type of the product */
3898 pSign = aSign ^ bSign;
3899 if (flags & float_muladd_negate_product) {
3900 pSign ^= 1;
3901 }
3902 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3903 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3904
3905 if (cExp == 0x7ff) {
3906 if (pInf && (pSign ^ cSign)) {
3907 /* addition of opposite-signed infinities => InvalidOperation */
3908 float_raise(float_flag_invalid STATUS_VAR);
3909 return float64_default_nan;
3910 }
3911 /* Otherwise generate an infinity of the same sign */
3912 return packFloat64(cSign ^ signflip, 0x7ff, 0);
3913 }
3914
3915 if (pInf) {
3916 return packFloat64(pSign ^ signflip, 0x7ff, 0);
3917 }
3918
3919 if (pZero) {
3920 if (cExp == 0) {
3921 if (cSig == 0) {
3922 /* Adding two exact zeroes */
3923 if (pSign == cSign) {
3924 zSign = pSign;
3925 } else if (STATUS(float_rounding_mode) == float_round_down) {
3926 zSign = 1;
3927 } else {
3928 zSign = 0;
3929 }
3930 return packFloat64(zSign ^ signflip, 0, 0);
3931 }
3932 /* Exact zero plus a denorm */
3933 if (STATUS(flush_to_zero)) {
3934 float_raise(float_flag_output_denormal STATUS_VAR);
3935 return packFloat64(cSign ^ signflip, 0, 0);
3936 }
3937 }
3938 /* Zero plus something non-zero : just return the something */
a6e7c184 3939 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
3940 }
3941
3942 if (aExp == 0) {
3943 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3944 }
3945 if (bExp == 0) {
3946 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3947 }
3948
3949 /* Calculate the actual result a * b + c */
3950
3951 /* Multiply first; this is easy. */
3952 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3953 * because we want the true exponent, not the "one-less-than"
3954 * flavour that roundAndPackFloat64() takes.
3955 */
3956 pExp = aExp + bExp - 0x3fe;
3957 aSig = (aSig | LIT64(0x0010000000000000))<<10;
3958 bSig = (bSig | LIT64(0x0010000000000000))<<11;
3959 mul64To128(aSig, bSig, &pSig0, &pSig1);
3960 if ((int64_t)(pSig0 << 1) >= 0) {
3961 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3962 pExp--;
3963 }
3964
3965 zSign = pSign ^ signflip;
3966
3967 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3968 * bit in position 126.
3969 */
3970 if (cExp == 0) {
3971 if (!cSig) {
3972 /* Throw out the special case of c being an exact zero now */
3973 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3974 return roundAndPackFloat64(zSign, pExp - 1,
3975 pSig1 STATUS_VAR);
3976 }
3977 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3978 }
3979
3980 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3981 * significand of the addend, with the explicit bit in position 126.
3982 */
3983 cSig0 = cSig << (126 - 64 - 52);
3984 cSig1 = 0;
3985 cSig0 |= LIT64(0x4000000000000000);
3986 expDiff = pExp - cExp;
3987
3988 if (pSign == cSign) {
3989 /* Addition */
3990 if (expDiff > 0) {
3991 /* scale c to match p */
3992 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3993 zExp = pExp;
3994 } else if (expDiff < 0) {
3995 /* scale p to match c */
3996 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3997 zExp = cExp;
3998 } else {
3999 /* no scaling needed */
4000 zExp = cExp;
4001 }
4002 /* Add significands and make sure explicit bit ends up in posn 126 */
4003 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4004 if ((int64_t)zSig0 < 0) {
4005 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4006 } else {
4007 zExp--;
4008 }
4009 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4010 return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4011 } else {
4012 /* Subtraction */
4013 if (expDiff > 0) {
4014 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4015 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4016 zExp = pExp;
4017 } else if (expDiff < 0) {
4018 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4019 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4020 zExp = cExp;
4021 zSign ^= 1;
4022 } else {
4023 zExp = pExp;
4024 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4025 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4026 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4027 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4028 zSign ^= 1;
4029 } else {
4030 /* Exact zero */
4031 zSign = signflip;
4032 if (STATUS(float_rounding_mode) == float_round_down) {
4033 zSign ^= 1;
4034 }
4035 return packFloat64(zSign, 0, 0);
4036 }
4037 }
4038 --zExp;
4039 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4040 * starting with the significand in a pair of uint64_t.
4041 */
4042 if (zSig0) {
4043 shiftcount = countLeadingZeros64(zSig0) - 1;
4044 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4045 if (zSig1) {
4046 zSig0 |= 1;
4047 }
4048 zExp -= shiftcount;
4049 } else {
e3d142d0
PM
4050 shiftcount = countLeadingZeros64(zSig1);
4051 if (shiftcount == 0) {
4052 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4053 zExp -= 63;
4054 } else {
4055 shiftcount--;
4056 zSig0 = zSig1 << shiftcount;
4057 zExp -= (shiftcount + 64);
4058 }
369be8f6
PM
4059 }
4060 return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4061 }
4062}
4063
158142c2
FB
4064/*----------------------------------------------------------------------------
4065| Returns the square root of the double-precision floating-point value `a'.
4066| The operation is performed according to the IEC/IEEE Standard for Binary
4067| Floating-Point Arithmetic.
4068*----------------------------------------------------------------------------*/
4069
4070float64 float64_sqrt( float64 a STATUS_PARAM )
4071{
4072 flag aSign;
94a49d86 4073 int_fast16_t aExp, zExp;
bb98fe42
AF
4074 uint64_t aSig, zSig, doubleZSig;
4075 uint64_t rem0, rem1, term0, term1;
37d18660 4076 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
4077
4078 aSig = extractFloat64Frac( a );
4079 aExp = extractFloat64Exp( a );
4080 aSign = extractFloat64Sign( a );
4081 if ( aExp == 0x7FF ) {
4082 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4083 if ( ! aSign ) return a;
4084 float_raise( float_flag_invalid STATUS_VAR);
4085 return float64_default_nan;
4086 }
4087 if ( aSign ) {
4088 if ( ( aExp | aSig ) == 0 ) return a;
4089 float_raise( float_flag_invalid STATUS_VAR);
4090 return float64_default_nan;
4091 }
4092 if ( aExp == 0 ) {
f090c9d4 4093 if ( aSig == 0 ) return float64_zero;
158142c2
FB
4094 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4095 }
4096 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4097 aSig |= LIT64( 0x0010000000000000 );
4098 zSig = estimateSqrt32( aExp, aSig>>21 );
4099 aSig <<= 9 - ( aExp & 1 );
4100 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4101 if ( ( zSig & 0x1FF ) <= 5 ) {
4102 doubleZSig = zSig<<1;
4103 mul64To128( zSig, zSig, &term0, &term1 );
4104 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4105 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4106 --zSig;
4107 doubleZSig -= 2;
4108 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4109 }
4110 zSig |= ( ( rem0 | rem1 ) != 0 );
4111 }
4112 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4113
4114}
4115
374dfc33
AJ
4116/*----------------------------------------------------------------------------
4117| Returns the binary log of the double-precision floating-point value `a'.
4118| The operation is performed according to the IEC/IEEE Standard for Binary
4119| Floating-Point Arithmetic.
4120*----------------------------------------------------------------------------*/
4121float64 float64_log2( float64 a STATUS_PARAM )
4122{
4123 flag aSign, zSign;
94a49d86 4124 int_fast16_t aExp;
bb98fe42 4125 uint64_t aSig, aSig0, aSig1, zSig, i;
37d18660 4126 a = float64_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
4127
4128 aSig = extractFloat64Frac( a );
4129 aExp = extractFloat64Exp( a );
4130 aSign = extractFloat64Sign( a );
4131
4132 if ( aExp == 0 ) {
4133 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4134 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4135 }
4136 if ( aSign ) {
4137 float_raise( float_flag_invalid STATUS_VAR);
4138 return float64_default_nan;
4139 }
4140 if ( aExp == 0x7FF ) {
4141 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4142 return a;
4143 }
4144
4145 aExp -= 0x3FF;
4146 aSig |= LIT64( 0x0010000000000000 );
4147 zSign = aExp < 0;
bb98fe42 4148 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4149 for (i = 1LL << 51; i > 0; i >>= 1) {
4150 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4151 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4152 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4153 aSig >>= 1;
4154 zSig |= i;
4155 }
4156 }
4157
4158 if ( zSign )
4159 zSig = -zSig;
4160 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4161}
4162
158142c2
FB
4163/*----------------------------------------------------------------------------
4164| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4165| corresponding value `b', and 0 otherwise. The invalid exception is raised
4166| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4167| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4168*----------------------------------------------------------------------------*/
4169
b689362d 4170int float64_eq( float64 a, float64 b STATUS_PARAM )
158142c2 4171{
bb98fe42 4172 uint64_t av, bv;
37d18660
PM
4173 a = float64_squash_input_denormal(a STATUS_VAR);
4174 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4175
4176 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4177 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4178 ) {
b689362d 4179 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
4180 return 0;
4181 }
f090c9d4 4182 av = float64_val(a);
a1b91bb4 4183 bv = float64_val(b);
bb98fe42 4184 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4185
4186}
4187
4188/*----------------------------------------------------------------------------
4189| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4190| equal to the corresponding value `b', and 0 otherwise. The invalid
4191| exception is raised if either operand is a NaN. The comparison is performed
4192| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4193*----------------------------------------------------------------------------*/
4194
750afe93 4195int float64_le( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4196{
4197 flag aSign, bSign;
bb98fe42 4198 uint64_t av, bv;
37d18660
PM
4199 a = float64_squash_input_denormal(a STATUS_VAR);
4200 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4201
4202 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4203 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4204 ) {
4205 float_raise( float_flag_invalid STATUS_VAR);
4206 return 0;
4207 }
4208 aSign = extractFloat64Sign( a );
4209 bSign = extractFloat64Sign( b );
f090c9d4 4210 av = float64_val(a);
a1b91bb4 4211 bv = float64_val(b);
bb98fe42 4212 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4213 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4214
4215}
4216
4217/*----------------------------------------------------------------------------
4218| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4219| the corresponding value `b', and 0 otherwise. The invalid exception is
4220| raised if either operand is a NaN. The comparison is performed according
4221| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4222*----------------------------------------------------------------------------*/
4223
750afe93 4224int float64_lt( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4225{
4226 flag aSign, bSign;
bb98fe42 4227 uint64_t av, bv;
158142c2 4228
37d18660
PM
4229 a = float64_squash_input_denormal(a STATUS_VAR);
4230 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4231 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4232 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4233 ) {
4234 float_raise( float_flag_invalid STATUS_VAR);
4235 return 0;
4236 }
4237 aSign = extractFloat64Sign( a );
4238 bSign = extractFloat64Sign( b );
f090c9d4 4239 av = float64_val(a);
a1b91bb4 4240 bv = float64_val(b);
bb98fe42 4241 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4242 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4243
4244}
4245
67b7861d
AJ
4246/*----------------------------------------------------------------------------
4247| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4248| be compared, and 0 otherwise. The invalid exception is raised if either
4249| operand is a NaN. The comparison is performed according to the IEC/IEEE
4250| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4251*----------------------------------------------------------------------------*/
4252
4253int float64_unordered( float64 a, float64 b STATUS_PARAM )
4254{
4255 a = float64_squash_input_denormal(a STATUS_VAR);
4256 b = float64_squash_input_denormal(b STATUS_VAR);
4257
4258 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4259 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4260 ) {
4261 float_raise( float_flag_invalid STATUS_VAR);
4262 return 1;
4263 }
4264 return 0;
4265}
4266
158142c2
FB
4267/*----------------------------------------------------------------------------
4268| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4269| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4270| exception.The comparison is performed according to the IEC/IEEE Standard
4271| for Binary Floating-Point Arithmetic.
158142c2
FB
4272*----------------------------------------------------------------------------*/
4273
b689362d 4274int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
158142c2 4275{
bb98fe42 4276 uint64_t av, bv;
37d18660
PM
4277 a = float64_squash_input_denormal(a STATUS_VAR);
4278 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4279
4280 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4281 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4282 ) {
b689362d
AJ
4283 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4284 float_raise( float_flag_invalid STATUS_VAR);
4285 }
158142c2
FB
4286 return 0;
4287 }
f090c9d4 4288 av = float64_val(a);
a1b91bb4 4289 bv = float64_val(b);
bb98fe42 4290 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4291
4292}
4293
4294/*----------------------------------------------------------------------------
4295| Returns 1 if the double-precision floating-point value `a' is less than or
4296| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4297| cause an exception. Otherwise, the comparison is performed according to the
4298| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4299*----------------------------------------------------------------------------*/
4300
750afe93 4301int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4302{
4303 flag aSign, bSign;
bb98fe42 4304 uint64_t av, bv;
37d18660
PM
4305 a = float64_squash_input_denormal(a STATUS_VAR);
4306 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4307
4308 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4309 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4310 ) {
4311 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4312 float_raise( float_flag_invalid STATUS_VAR);
4313 }
4314 return 0;
4315 }
4316 aSign = extractFloat64Sign( a );
4317 bSign = extractFloat64Sign( b );
f090c9d4 4318 av = float64_val(a);
a1b91bb4 4319 bv = float64_val(b);
bb98fe42 4320 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4321 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4322
4323}
4324
4325/*----------------------------------------------------------------------------
4326| Returns 1 if the double-precision floating-point value `a' is less than
4327| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4328| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4329| Standard for Binary Floating-Point Arithmetic.
4330*----------------------------------------------------------------------------*/
4331
750afe93 4332int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4333{
4334 flag aSign, bSign;
bb98fe42 4335 uint64_t av, bv;
37d18660
PM
4336 a = float64_squash_input_denormal(a STATUS_VAR);
4337 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4338
4339 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4340 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4341 ) {
4342 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4343 float_raise( float_flag_invalid STATUS_VAR);
4344 }
4345 return 0;
4346 }
4347 aSign = extractFloat64Sign( a );
4348 bSign = extractFloat64Sign( b );
f090c9d4 4349 av = float64_val(a);
a1b91bb4 4350 bv = float64_val(b);
bb98fe42 4351 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4352 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4353
4354}
4355
67b7861d
AJ
4356/*----------------------------------------------------------------------------
4357| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4358| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4359| comparison is performed according to the IEC/IEEE Standard for Binary
4360| Floating-Point Arithmetic.
4361*----------------------------------------------------------------------------*/
4362
4363int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4364{
4365 a = float64_squash_input_denormal(a STATUS_VAR);
4366 b = float64_squash_input_denormal(b STATUS_VAR);
4367
4368 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4369 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4370 ) {
4371 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4372 float_raise( float_flag_invalid STATUS_VAR);
4373 }
4374 return 1;
4375 }
4376 return 0;
4377}
4378
158142c2
FB
4379/*----------------------------------------------------------------------------
4380| Returns the result of converting the extended double-precision floating-
4381| point value `a' to the 32-bit two's complement integer format. The
4382| conversion is performed according to the IEC/IEEE Standard for Binary
4383| Floating-Point Arithmetic---which means in particular that the conversion
4384| is rounded according to the current rounding mode. If `a' is a NaN, the
4385| largest positive integer is returned. Otherwise, if the conversion
4386| overflows, the largest integer with the same sign as `a' is returned.
4387*----------------------------------------------------------------------------*/
4388
4389int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4390{
4391 flag aSign;
4392 int32 aExp, shiftCount;
bb98fe42 4393 uint64_t aSig;
158142c2
FB
4394
4395 aSig = extractFloatx80Frac( a );
4396 aExp = extractFloatx80Exp( a );
4397 aSign = extractFloatx80Sign( a );
bb98fe42 4398 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4399 shiftCount = 0x4037 - aExp;
4400 if ( shiftCount <= 0 ) shiftCount = 1;
4401 shift64RightJamming( aSig, shiftCount, &aSig );
4402 return roundAndPackInt32( aSign, aSig STATUS_VAR );
4403
4404}
4405
4406/*----------------------------------------------------------------------------
4407| Returns the result of converting the extended double-precision floating-
4408| point value `a' to the 32-bit two's complement integer format. The
4409| conversion is performed according to the IEC/IEEE Standard for Binary
4410| Floating-Point Arithmetic, except that the conversion is always rounded
4411| toward zero. If `a' is a NaN, the largest positive integer is returned.
4412| Otherwise, if the conversion overflows, the largest integer with the same
4413| sign as `a' is returned.
4414*----------------------------------------------------------------------------*/
4415
4416int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4417{
4418 flag aSign;
4419 int32 aExp, shiftCount;
bb98fe42 4420 uint64_t aSig, savedASig;
b3a6a2e0 4421 int32_t z;
158142c2
FB
4422
4423 aSig = extractFloatx80Frac( a );
4424 aExp = extractFloatx80Exp( a );
4425 aSign = extractFloatx80Sign( a );
4426 if ( 0x401E < aExp ) {
bb98fe42 4427 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4428 goto invalid;
4429 }
4430 else if ( aExp < 0x3FFF ) {
4431 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4432 return 0;
4433 }
4434 shiftCount = 0x403E - aExp;
4435 savedASig = aSig;
4436 aSig >>= shiftCount;
4437 z = aSig;
4438 if ( aSign ) z = - z;
4439 if ( ( z < 0 ) ^ aSign ) {
4440 invalid:
4441 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 4442 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4443 }
4444 if ( ( aSig<<shiftCount ) != savedASig ) {
4445 STATUS(float_exception_flags) |= float_flag_inexact;
4446 }
4447 return z;
4448
4449}
4450
4451/*----------------------------------------------------------------------------
4452| Returns the result of converting the extended double-precision floating-
4453| point value `a' to the 64-bit two's complement integer format. The
4454| conversion is performed according to the IEC/IEEE Standard for Binary
4455| Floating-Point Arithmetic---which means in particular that the conversion
4456| is rounded according to the current rounding mode. If `a' is a NaN,
4457| the largest positive integer is returned. Otherwise, if the conversion
4458| overflows, the largest integer with the same sign as `a' is returned.
4459*----------------------------------------------------------------------------*/
4460
4461int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4462{
4463 flag aSign;
4464 int32 aExp, shiftCount;
bb98fe42 4465 uint64_t aSig, aSigExtra;
158142c2
FB
4466
4467 aSig = extractFloatx80Frac( a );
4468 aExp = extractFloatx80Exp( a );
4469 aSign = extractFloatx80Sign( a );
4470 shiftCount = 0x403E - aExp;
4471 if ( shiftCount <= 0 ) {
4472 if ( shiftCount ) {
4473 float_raise( float_flag_invalid STATUS_VAR);
4474 if ( ! aSign
4475 || ( ( aExp == 0x7FFF )
4476 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4477 ) {
4478 return LIT64( 0x7FFFFFFFFFFFFFFF );
4479 }
bb98fe42 4480 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4481 }
4482 aSigExtra = 0;
4483 }
4484 else {
4485 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4486 }
4487 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4488
4489}
4490
4491/*----------------------------------------------------------------------------
4492| Returns the result of converting the extended double-precision floating-
4493| point value `a' to the 64-bit two's complement integer format. The
4494| conversion is performed according to the IEC/IEEE Standard for Binary
4495| Floating-Point Arithmetic, except that the conversion is always rounded
4496| toward zero. If `a' is a NaN, the largest positive integer is returned.
4497| Otherwise, if the conversion overflows, the largest integer with the same
4498| sign as `a' is returned.
4499*----------------------------------------------------------------------------*/
4500
4501int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4502{
4503 flag aSign;
4504 int32 aExp, shiftCount;
bb98fe42 4505 uint64_t aSig;
158142c2
FB
4506 int64 z;
4507
4508 aSig = extractFloatx80Frac( a );
4509 aExp = extractFloatx80Exp( a );
4510 aSign = extractFloatx80Sign( a );
4511 shiftCount = aExp - 0x403E;
4512 if ( 0 <= shiftCount ) {
4513 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4514 if ( ( a.high != 0xC03E ) || aSig ) {
4515 float_raise( float_flag_invalid STATUS_VAR);
4516 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4517 return LIT64( 0x7FFFFFFFFFFFFFFF );
4518 }
4519 }
bb98fe42 4520 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4521 }
4522 else if ( aExp < 0x3FFF ) {
4523 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4524 return 0;
4525 }
4526 z = aSig>>( - shiftCount );
bb98fe42 4527 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
4528 STATUS(float_exception_flags) |= float_flag_inexact;
4529 }
4530 if ( aSign ) z = - z;
4531 return z;
4532
4533}
4534
4535/*----------------------------------------------------------------------------
4536| Returns the result of converting the extended double-precision floating-
4537| point value `a' to the single-precision floating-point format. The
4538| conversion is performed according to the IEC/IEEE Standard for Binary
4539| Floating-Point Arithmetic.
4540*----------------------------------------------------------------------------*/
4541
4542float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4543{
4544 flag aSign;
4545 int32 aExp;
bb98fe42 4546 uint64_t aSig;
158142c2
FB
4547
4548 aSig = extractFloatx80Frac( a );
4549 aExp = extractFloatx80Exp( a );
4550 aSign = extractFloatx80Sign( a );
4551 if ( aExp == 0x7FFF ) {
bb98fe42 4552 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4553 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4554 }
4555 return packFloat32( aSign, 0xFF, 0 );
4556 }
4557 shift64RightJamming( aSig, 33, &aSig );
4558 if ( aExp || aSig ) aExp -= 0x3F81;
4559 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4560
4561}
4562
4563/*----------------------------------------------------------------------------
4564| Returns the result of converting the extended double-precision floating-
4565| point value `a' to the double-precision floating-point format. The
4566| conversion is performed according to the IEC/IEEE Standard for Binary
4567| Floating-Point Arithmetic.
4568*----------------------------------------------------------------------------*/
4569
4570float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4571{
4572 flag aSign;
4573 int32 aExp;
bb98fe42 4574 uint64_t aSig, zSig;
158142c2
FB
4575
4576 aSig = extractFloatx80Frac( a );
4577 aExp = extractFloatx80Exp( a );
4578 aSign = extractFloatx80Sign( a );
4579 if ( aExp == 0x7FFF ) {
bb98fe42 4580 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4581 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4582 }
4583 return packFloat64( aSign, 0x7FF, 0 );
4584 }
4585 shift64RightJamming( aSig, 1, &zSig );
4586 if ( aExp || aSig ) aExp -= 0x3C01;
4587 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4588
4589}
4590
158142c2
FB
4591/*----------------------------------------------------------------------------
4592| Returns the result of converting the extended double-precision floating-
4593| point value `a' to the quadruple-precision floating-point format. The
4594| conversion is performed according to the IEC/IEEE Standard for Binary
4595| Floating-Point Arithmetic.
4596*----------------------------------------------------------------------------*/
4597
4598float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4599{
4600 flag aSign;
94a49d86 4601 int_fast16_t aExp;
bb98fe42 4602 uint64_t aSig, zSig0, zSig1;
158142c2
FB
4603
4604 aSig = extractFloatx80Frac( a );
4605 aExp = extractFloatx80Exp( a );
4606 aSign = extractFloatx80Sign( a );
bb98fe42 4607 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4608 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4609 }
4610 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4611 return packFloat128( aSign, aExp, zSig0, zSig1 );
4612
4613}
4614
158142c2
FB
4615/*----------------------------------------------------------------------------
4616| Rounds the extended double-precision floating-point value `a' to an integer,
4617| and returns the result as an extended quadruple-precision floating-point
4618| value. The operation is performed according to the IEC/IEEE Standard for
4619| Binary Floating-Point Arithmetic.
4620*----------------------------------------------------------------------------*/
4621
4622floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4623{
4624 flag aSign;
4625 int32 aExp;
bb98fe42 4626 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4627 int8 roundingMode;
4628 floatx80 z;
4629
4630 aExp = extractFloatx80Exp( a );
4631 if ( 0x403E <= aExp ) {
bb98fe42 4632 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
158142c2
FB
4633 return propagateFloatx80NaN( a, a STATUS_VAR );
4634 }
4635 return a;
4636 }
4637 if ( aExp < 0x3FFF ) {
4638 if ( ( aExp == 0 )
bb98fe42 4639 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4640 return a;
4641 }
4642 STATUS(float_exception_flags) |= float_flag_inexact;
4643 aSign = extractFloatx80Sign( a );
4644 switch ( STATUS(float_rounding_mode) ) {
4645 case float_round_nearest_even:
bb98fe42 4646 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4647 ) {
4648 return
4649 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4650 }
4651 break;
4652 case float_round_down:
4653 return
4654 aSign ?
4655 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4656 : packFloatx80( 0, 0, 0 );
4657 case float_round_up:
4658 return
4659 aSign ? packFloatx80( 1, 0, 0 )
4660 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4661 }
4662 return packFloatx80( aSign, 0, 0 );
4663 }
4664 lastBitMask = 1;
4665 lastBitMask <<= 0x403E - aExp;
4666 roundBitsMask = lastBitMask - 1;
4667 z = a;
4668 roundingMode = STATUS(float_rounding_mode);
4669 if ( roundingMode == float_round_nearest_even ) {
4670 z.low += lastBitMask>>1;
4671 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4672 }
4673 else if ( roundingMode != float_round_to_zero ) {
4674 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4675 z.low += roundBitsMask;
4676 }
4677 }
4678 z.low &= ~ roundBitsMask;
4679 if ( z.low == 0 ) {
4680 ++z.high;
4681 z.low = LIT64( 0x8000000000000000 );
4682 }
4683 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4684 return z;
4685
4686}
4687
4688/*----------------------------------------------------------------------------
4689| Returns the result of adding the absolute values of the extended double-
4690| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4691| negated before being returned. `zSign' is ignored if the result is a NaN.
4692| The addition is performed according to the IEC/IEEE Standard for Binary
4693| Floating-Point Arithmetic.
4694*----------------------------------------------------------------------------*/
4695
4696static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4697{
4698 int32 aExp, bExp, zExp;
bb98fe42 4699 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4700 int32 expDiff;
4701
4702 aSig = extractFloatx80Frac( a );
4703 aExp = extractFloatx80Exp( a );
4704 bSig = extractFloatx80Frac( b );
4705 bExp = extractFloatx80Exp( b );
4706 expDiff = aExp - bExp;
4707 if ( 0 < expDiff ) {
4708 if ( aExp == 0x7FFF ) {
bb98fe42 4709 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4710 return a;
4711 }
4712 if ( bExp == 0 ) --expDiff;
4713 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4714 zExp = aExp;
4715 }
4716 else if ( expDiff < 0 ) {
4717 if ( bExp == 0x7FFF ) {
bb98fe42 4718 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4719 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4720 }
4721 if ( aExp == 0 ) ++expDiff;
4722 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4723 zExp = bExp;
4724 }
4725 else {
4726 if ( aExp == 0x7FFF ) {
bb98fe42 4727 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4728 return propagateFloatx80NaN( a, b STATUS_VAR );
4729 }
4730 return a;
4731 }
4732 zSig1 = 0;
4733 zSig0 = aSig + bSig;
4734 if ( aExp == 0 ) {
4735 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4736 goto roundAndPack;
4737 }
4738 zExp = aExp;
4739 goto shiftRight1;
4740 }
4741 zSig0 = aSig + bSig;
bb98fe42 4742 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
4743 shiftRight1:
4744 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4745 zSig0 |= LIT64( 0x8000000000000000 );
4746 ++zExp;
4747 roundAndPack:
4748 return
4749 roundAndPackFloatx80(
4750 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4751
4752}
4753
4754/*----------------------------------------------------------------------------
4755| Returns the result of subtracting the absolute values of the extended
4756| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4757| difference is negated before being returned. `zSign' is ignored if the
4758| result is a NaN. The subtraction is performed according to the IEC/IEEE
4759| Standard for Binary Floating-Point Arithmetic.
4760*----------------------------------------------------------------------------*/
4761
4762static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4763{
4764 int32 aExp, bExp, zExp;
bb98fe42 4765 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4766 int32 expDiff;
4767 floatx80 z;
4768
4769 aSig = extractFloatx80Frac( a );
4770 aExp = extractFloatx80Exp( a );
4771 bSig = extractFloatx80Frac( b );
4772 bExp = extractFloatx80Exp( b );
4773 expDiff = aExp - bExp;
4774 if ( 0 < expDiff ) goto aExpBigger;
4775 if ( expDiff < 0 ) goto bExpBigger;
4776 if ( aExp == 0x7FFF ) {
bb98fe42 4777 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4778 return propagateFloatx80NaN( a, b STATUS_VAR );
4779 }
4780 float_raise( float_flag_invalid STATUS_VAR);
4781 z.low = floatx80_default_nan_low;
4782 z.high = floatx80_default_nan_high;
4783 return z;
4784 }
4785 if ( aExp == 0 ) {
4786 aExp = 1;
4787 bExp = 1;
4788 }
4789 zSig1 = 0;
4790 if ( bSig < aSig ) goto aBigger;
4791 if ( aSig < bSig ) goto bBigger;
4792 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4793 bExpBigger:
4794 if ( bExp == 0x7FFF ) {
bb98fe42 4795 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4796 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4797 }
4798 if ( aExp == 0 ) ++expDiff;
4799 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4800 bBigger:
4801 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4802 zExp = bExp;
4803 zSign ^= 1;
4804 goto normalizeRoundAndPack;
4805 aExpBigger:
4806 if ( aExp == 0x7FFF ) {
bb98fe42 4807 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4808 return a;
4809 }
4810 if ( bExp == 0 ) --expDiff;
4811 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4812 aBigger:
4813 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4814 zExp = aExp;
4815 normalizeRoundAndPack:
4816 return
4817 normalizeRoundAndPackFloatx80(
4818 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4819
4820}
4821
4822/*----------------------------------------------------------------------------
4823| Returns the result of adding the extended double-precision floating-point
4824| values `a' and `b'. The operation is performed according to the IEC/IEEE
4825| Standard for Binary Floating-Point Arithmetic.
4826*----------------------------------------------------------------------------*/
4827
4828floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4829{
4830 flag aSign, bSign;
4831
4832 aSign = extractFloatx80Sign( a );
4833 bSign = extractFloatx80Sign( b );
4834 if ( aSign == bSign ) {
4835 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4836 }
4837 else {
4838 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4839 }
4840
4841}
4842
4843/*----------------------------------------------------------------------------
4844| Returns the result of subtracting the extended double-precision floating-
4845| point values `a' and `b'. The operation is performed according to the
4846| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4847*----------------------------------------------------------------------------*/
4848
4849floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4850{
4851 flag aSign, bSign;
4852
4853 aSign = extractFloatx80Sign( a );
4854 bSign = extractFloatx80Sign( b );
4855 if ( aSign == bSign ) {
4856 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4857 }
4858 else {
4859 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4860 }
4861
4862}
4863
4864/*----------------------------------------------------------------------------
4865| Returns the result of multiplying the extended double-precision floating-
4866| point values `a' and `b'. The operation is performed according to the
4867| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4868*----------------------------------------------------------------------------*/
4869
4870floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4871{
4872 flag aSign, bSign, zSign;
4873 int32 aExp, bExp, zExp;
bb98fe42 4874 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4875 floatx80 z;
4876
4877 aSig = extractFloatx80Frac( a );
4878 aExp = extractFloatx80Exp( a );
4879 aSign = extractFloatx80Sign( a );
4880 bSig = extractFloatx80Frac( b );
4881 bExp = extractFloatx80Exp( b );
4882 bSign = extractFloatx80Sign( b );
4883 zSign = aSign ^ bSign;
4884 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4885 if ( (uint64_t) ( aSig<<1 )
4886 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4887 return propagateFloatx80NaN( a, b STATUS_VAR );
4888 }
4889 if ( ( bExp | bSig ) == 0 ) goto invalid;
4890 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4891 }
4892 if ( bExp == 0x7FFF ) {
bb98fe42 4893 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4894 if ( ( aExp | aSig ) == 0 ) {
4895 invalid:
4896 float_raise( float_flag_invalid STATUS_VAR);
4897 z.low = floatx80_default_nan_low;
4898 z.high = floatx80_default_nan_high;
4899 return z;
4900 }
4901 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4902 }
4903 if ( aExp == 0 ) {
4904 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4905 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4906 }
4907 if ( bExp == 0 ) {
4908 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4909 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4910 }
4911 zExp = aExp + bExp - 0x3FFE;
4912 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 4913 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
4914 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4915 --zExp;
4916 }
4917 return
4918 roundAndPackFloatx80(
4919 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4920
4921}
4922
4923/*----------------------------------------------------------------------------
4924| Returns the result of dividing the extended double-precision floating-point
4925| value `a' by the corresponding value `b'. The operation is performed
4926| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4927*----------------------------------------------------------------------------*/
4928
4929floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4930{
4931 flag aSign, bSign, zSign;
4932 int32 aExp, bExp, zExp;
bb98fe42
AF
4933 uint64_t aSig, bSig, zSig0, zSig1;
4934 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
4935 floatx80 z;
4936
4937 aSig = extractFloatx80Frac( a );
4938 aExp = extractFloatx80Exp( a );
4939 aSign = extractFloatx80Sign( a );
4940 bSig = extractFloatx80Frac( b );
4941 bExp = extractFloatx80Exp( b );
4942 bSign = extractFloatx80Sign( b );
4943 zSign = aSign ^ bSign;
4944 if ( aExp == 0x7FFF ) {
bb98fe42 4945 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2 4946 if ( bExp == 0x7FFF ) {
bb98fe42 4947 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4948 goto invalid;
4949 }
4950 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4951 }
4952 if ( bExp == 0x7FFF ) {
bb98fe42 4953 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4954 return packFloatx80( zSign, 0, 0 );
4955 }
4956 if ( bExp == 0 ) {
4957 if ( bSig == 0 ) {
4958 if ( ( aExp | aSig ) == 0 ) {
4959 invalid:
4960 float_raise( float_flag_invalid STATUS_VAR);
4961 z.low = floatx80_default_nan_low;
4962 z.high = floatx80_default_nan_high;
4963 return z;
4964 }
4965 float_raise( float_flag_divbyzero STATUS_VAR);
4966 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4967 }
4968 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4969 }
4970 if ( aExp == 0 ) {
4971 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4972 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4973 }
4974 zExp = aExp - bExp + 0x3FFE;
4975 rem1 = 0;
4976 if ( bSig <= aSig ) {
4977 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4978 ++zExp;
4979 }
4980 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4981 mul64To128( bSig, zSig0, &term0, &term1 );
4982 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 4983 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4984 --zSig0;
4985 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4986 }
4987 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 4988 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
4989 mul64To128( bSig, zSig1, &term1, &term2 );
4990 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 4991 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
4992 --zSig1;
4993 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4994 }
4995 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4996 }
4997 return
4998 roundAndPackFloatx80(
4999 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5000
5001}
5002
5003/*----------------------------------------------------------------------------
5004| Returns the remainder of the extended double-precision floating-point value
5005| `a' with respect to the corresponding value `b'. The operation is performed
5006| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5007*----------------------------------------------------------------------------*/
5008
5009floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5010{
ed086f3d 5011 flag aSign, zSign;
158142c2 5012 int32 aExp, bExp, expDiff;
bb98fe42
AF
5013 uint64_t aSig0, aSig1, bSig;
5014 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
5015 floatx80 z;
5016
5017 aSig0 = extractFloatx80Frac( a );
5018 aExp = extractFloatx80Exp( a );
5019 aSign = extractFloatx80Sign( a );
5020 bSig = extractFloatx80Frac( b );
5021 bExp = extractFloatx80Exp( b );
158142c2 5022 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5023 if ( (uint64_t) ( aSig0<<1 )
5024 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
5025 return propagateFloatx80NaN( a, b STATUS_VAR );
5026 }
5027 goto invalid;
5028 }
5029 if ( bExp == 0x7FFF ) {
bb98fe42 5030 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5031 return a;
5032 }
5033 if ( bExp == 0 ) {
5034 if ( bSig == 0 ) {
5035 invalid:
5036 float_raise( float_flag_invalid STATUS_VAR);
5037 z.low = floatx80_default_nan_low;
5038 z.high = floatx80_default_nan_high;
5039 return z;
5040 }
5041 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5042 }
5043 if ( aExp == 0 ) {
bb98fe42 5044 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5045 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5046 }
5047 bSig |= LIT64( 0x8000000000000000 );
5048 zSign = aSign;
5049 expDiff = aExp - bExp;
5050 aSig1 = 0;
5051 if ( expDiff < 0 ) {
5052 if ( expDiff < -1 ) return a;
5053 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5054 expDiff = 0;
5055 }
5056 q = ( bSig <= aSig0 );
5057 if ( q ) aSig0 -= bSig;
5058 expDiff -= 64;
5059 while ( 0 < expDiff ) {
5060 q = estimateDiv128To64( aSig0, aSig1, bSig );
5061 q = ( 2 < q ) ? q - 2 : 0;
5062 mul64To128( bSig, q, &term0, &term1 );
5063 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5064 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5065 expDiff -= 62;
5066 }
5067 expDiff += 64;
5068 if ( 0 < expDiff ) {
5069 q = estimateDiv128To64( aSig0, aSig1, bSig );
5070 q = ( 2 < q ) ? q - 2 : 0;
5071 q >>= 64 - expDiff;
5072 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5073 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5074 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5075 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5076 ++q;
5077 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5078 }
5079 }
5080 else {
5081 term1 = 0;
5082 term0 = bSig;
5083 }
5084 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5085 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5086 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5087 && ( q & 1 ) )
5088 ) {
5089 aSig0 = alternateASig0;
5090 aSig1 = alternateASig1;
5091 zSign = ! zSign;
5092 }
5093 return
5094 normalizeRoundAndPackFloatx80(
5095 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5096
5097}
5098
5099/*----------------------------------------------------------------------------
5100| Returns the square root of the extended double-precision floating-point
5101| value `a'. The operation is performed according to the IEC/IEEE Standard
5102| for Binary Floating-Point Arithmetic.
5103*----------------------------------------------------------------------------*/
5104
5105floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5106{
5107 flag aSign;
5108 int32 aExp, zExp;
bb98fe42
AF
5109 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5110 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5111 floatx80 z;
5112
5113 aSig0 = extractFloatx80Frac( a );
5114 aExp = extractFloatx80Exp( a );
5115 aSign = extractFloatx80Sign( a );
5116 if ( aExp == 0x7FFF ) {
bb98fe42 5117 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
158142c2
FB
5118 if ( ! aSign ) return a;
5119 goto invalid;
5120 }
5121 if ( aSign ) {
5122 if ( ( aExp | aSig0 ) == 0 ) return a;
5123 invalid:
5124 float_raise( float_flag_invalid STATUS_VAR);
5125 z.low = floatx80_default_nan_low;
5126 z.high = floatx80_default_nan_high;
5127 return z;
5128 }
5129 if ( aExp == 0 ) {
5130 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5131 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5132 }
5133 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5134 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5135 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5136 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5137 doubleZSig0 = zSig0<<1;
5138 mul64To128( zSig0, zSig0, &term0, &term1 );
5139 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5140 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5141 --zSig0;
5142 doubleZSig0 -= 2;
5143 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5144 }
5145 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5146 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5147 if ( zSig1 == 0 ) zSig1 = 1;
5148 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5149 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5150 mul64To128( zSig1, zSig1, &term2, &term3 );
5151 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5152 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5153 --zSig1;
5154 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5155 term3 |= 1;
5156 term2 |= doubleZSig0;
5157 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5158 }
5159 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5160 }
5161 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5162 zSig0 |= doubleZSig0;
5163 return
5164 roundAndPackFloatx80(
5165 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5166
5167}
5168
5169/*----------------------------------------------------------------------------
b689362d
AJ
5170| Returns 1 if the extended double-precision floating-point value `a' is equal
5171| to the corresponding value `b', and 0 otherwise. The invalid exception is
5172| raised if either operand is a NaN. Otherwise, the comparison is performed
5173| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5174*----------------------------------------------------------------------------*/
5175
b689362d 5176int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5177{
5178
5179 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5180 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5181 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5182 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5183 ) {
b689362d 5184 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
5185 return 0;
5186 }
5187 return
5188 ( a.low == b.low )
5189 && ( ( a.high == b.high )
5190 || ( ( a.low == 0 )
bb98fe42 5191 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5192 );
5193
5194}
5195
5196/*----------------------------------------------------------------------------
5197| Returns 1 if the extended double-precision floating-point value `a' is
5198| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5199| invalid exception is raised if either operand is a NaN. The comparison is
5200| performed according to the IEC/IEEE Standard for Binary Floating-Point
5201| Arithmetic.
158142c2
FB
5202*----------------------------------------------------------------------------*/
5203
750afe93 5204int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5205{
5206 flag aSign, bSign;
5207
5208 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5209 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5210 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5211 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5212 ) {
5213 float_raise( float_flag_invalid STATUS_VAR);
5214 return 0;
5215 }
5216 aSign = extractFloatx80Sign( a );
5217 bSign = extractFloatx80Sign( b );
5218 if ( aSign != bSign ) {
5219 return
5220 aSign
bb98fe42 5221 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5222 == 0 );
5223 }
5224 return
5225 aSign ? le128( b.high, b.low, a.high, a.low )
5226 : le128( a.high, a.low, b.high, b.low );
5227
5228}
5229
5230/*----------------------------------------------------------------------------
5231| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5232| less than the corresponding value `b', and 0 otherwise. The invalid
5233| exception is raised if either operand is a NaN. The comparison is performed
5234| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5235*----------------------------------------------------------------------------*/
5236
750afe93 5237int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5238{
5239 flag aSign, bSign;
5240
5241 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5242 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5243 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5244 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5245 ) {
5246 float_raise( float_flag_invalid STATUS_VAR);
5247 return 0;
5248 }
5249 aSign = extractFloatx80Sign( a );
5250 bSign = extractFloatx80Sign( b );
5251 if ( aSign != bSign ) {
5252 return
5253 aSign
bb98fe42 5254 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5255 != 0 );
5256 }
5257 return
5258 aSign ? lt128( b.high, b.low, a.high, a.low )
5259 : lt128( a.high, a.low, b.high, b.low );
5260
5261}
5262
67b7861d
AJ
5263/*----------------------------------------------------------------------------
5264| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5265| cannot be compared, and 0 otherwise. The invalid exception is raised if
5266| either operand is a NaN. The comparison is performed according to the
5267| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
5268*----------------------------------------------------------------------------*/
5269int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5270{
5271 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5272 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5273 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5274 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5275 ) {
5276 float_raise( float_flag_invalid STATUS_VAR);
5277 return 1;
5278 }
5279 return 0;
5280}
5281
158142c2 5282/*----------------------------------------------------------------------------
b689362d 5283| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5284| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5285| cause an exception. The comparison is performed according to the IEC/IEEE
5286| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5287*----------------------------------------------------------------------------*/
5288
b689362d 5289int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5290{
5291
5292 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5293 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5294 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5295 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5296 ) {
b689362d
AJ
5297 if ( floatx80_is_signaling_nan( a )
5298 || floatx80_is_signaling_nan( b ) ) {
5299 float_raise( float_flag_invalid STATUS_VAR);
5300 }
158142c2
FB
5301 return 0;
5302 }
5303 return
5304 ( a.low == b.low )
5305 && ( ( a.high == b.high )
5306 || ( ( a.low == 0 )
bb98fe42 5307 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5308 );
5309
5310}
5311
5312/*----------------------------------------------------------------------------
5313| Returns 1 if the extended double-precision floating-point value `a' is less
5314| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5315| do not cause an exception. Otherwise, the comparison is performed according
5316| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5317*----------------------------------------------------------------------------*/
5318
750afe93 5319int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5320{
5321 flag aSign, bSign;
5322
5323 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5324 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5325 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5326 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5327 ) {
5328 if ( floatx80_is_signaling_nan( a )
5329 || floatx80_is_signaling_nan( b ) ) {
5330 float_raise( float_flag_invalid STATUS_VAR);
5331 }
5332 return 0;
5333 }
5334 aSign = extractFloatx80Sign( a );
5335 bSign = extractFloatx80Sign( b );
5336 if ( aSign != bSign ) {
5337 return
5338 aSign
bb98fe42 5339 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5340 == 0 );
5341 }
5342 return
5343 aSign ? le128( b.high, b.low, a.high, a.low )
5344 : le128( a.high, a.low, b.high, b.low );
5345
5346}
5347
5348/*----------------------------------------------------------------------------
5349| Returns 1 if the extended double-precision floating-point value `a' is less
5350| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5351| an exception. Otherwise, the comparison is performed according to the
5352| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5353*----------------------------------------------------------------------------*/
5354
750afe93 5355int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5356{
5357 flag aSign, bSign;
5358
5359 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5360 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5361 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5362 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5363 ) {
5364 if ( floatx80_is_signaling_nan( a )
5365 || floatx80_is_signaling_nan( b ) ) {
5366 float_raise( float_flag_invalid STATUS_VAR);
5367 }
5368 return 0;
5369 }
5370 aSign = extractFloatx80Sign( a );
5371 bSign = extractFloatx80Sign( b );
5372 if ( aSign != bSign ) {
5373 return
5374 aSign
bb98fe42 5375 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5376 != 0 );
5377 }
5378 return
5379 aSign ? lt128( b.high, b.low, a.high, a.low )
5380 : lt128( a.high, a.low, b.high, b.low );
5381
5382}
5383
67b7861d
AJ
5384/*----------------------------------------------------------------------------
5385| Returns 1 if the extended double-precision floating-point values `a' and `b'
5386| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5387| The comparison is performed according to the IEC/IEEE Standard for Binary
5388| Floating-Point Arithmetic.
5389*----------------------------------------------------------------------------*/
5390int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5391{
5392 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5393 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5394 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5395 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5396 ) {
5397 if ( floatx80_is_signaling_nan( a )
5398 || floatx80_is_signaling_nan( b ) ) {
5399 float_raise( float_flag_invalid STATUS_VAR);
5400 }
5401 return 1;
5402 }
5403 return 0;
5404}
5405
158142c2
FB
5406/*----------------------------------------------------------------------------
5407| Returns the result of converting the quadruple-precision floating-point
5408| value `a' to the 32-bit two's complement integer format. The conversion
5409| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5410| Arithmetic---which means in particular that the conversion is rounded
5411| according to the current rounding mode. If `a' is a NaN, the largest
5412| positive integer is returned. Otherwise, if the conversion overflows, the
5413| largest integer with the same sign as `a' is returned.
5414*----------------------------------------------------------------------------*/
5415
5416int32 float128_to_int32( float128 a STATUS_PARAM )
5417{
5418 flag aSign;
5419 int32 aExp, shiftCount;
bb98fe42 5420 uint64_t aSig0, aSig1;
158142c2
FB
5421
5422 aSig1 = extractFloat128Frac1( a );
5423 aSig0 = extractFloat128Frac0( a );
5424 aExp = extractFloat128Exp( a );
5425 aSign = extractFloat128Sign( a );
5426 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5427 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5428 aSig0 |= ( aSig1 != 0 );
5429 shiftCount = 0x4028 - aExp;
5430 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5431 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5432
5433}
5434
5435/*----------------------------------------------------------------------------
5436| Returns the result of converting the quadruple-precision floating-point
5437| value `a' to the 32-bit two's complement integer format. The conversion
5438| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5439| Arithmetic, except that the conversion is always rounded toward zero. If
5440| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5441| conversion overflows, the largest integer with the same sign as `a' is
5442| returned.
5443*----------------------------------------------------------------------------*/
5444
5445int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5446{
5447 flag aSign;
5448 int32 aExp, shiftCount;
bb98fe42 5449 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5450 int32_t z;
158142c2
FB
5451
5452 aSig1 = extractFloat128Frac1( a );
5453 aSig0 = extractFloat128Frac0( a );
5454 aExp = extractFloat128Exp( a );
5455 aSign = extractFloat128Sign( a );
5456 aSig0 |= ( aSig1 != 0 );
5457 if ( 0x401E < aExp ) {
5458 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5459 goto invalid;
5460 }
5461 else if ( aExp < 0x3FFF ) {
5462 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5463 return 0;
5464 }
5465 aSig0 |= LIT64( 0x0001000000000000 );
5466 shiftCount = 0x402F - aExp;
5467 savedASig = aSig0;
5468 aSig0 >>= shiftCount;
5469 z = aSig0;
5470 if ( aSign ) z = - z;
5471 if ( ( z < 0 ) ^ aSign ) {
5472 invalid:
5473 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 5474 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5475 }
5476 if ( ( aSig0<<shiftCount ) != savedASig ) {
5477 STATUS(float_exception_flags) |= float_flag_inexact;
5478 }
5479 return z;
5480
5481}
5482
5483/*----------------------------------------------------------------------------
5484| Returns the result of converting the quadruple-precision floating-point
5485| value `a' to the 64-bit two's complement integer format. The conversion
5486| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5487| Arithmetic---which means in particular that the conversion is rounded
5488| according to the current rounding mode. If `a' is a NaN, the largest
5489| positive integer is returned. Otherwise, if the conversion overflows, the
5490| largest integer with the same sign as `a' is returned.
5491*----------------------------------------------------------------------------*/
5492
5493int64 float128_to_int64( float128 a STATUS_PARAM )
5494{
5495 flag aSign;
5496 int32 aExp, shiftCount;
bb98fe42 5497 uint64_t aSig0, aSig1;
158142c2
FB
5498
5499 aSig1 = extractFloat128Frac1( a );
5500 aSig0 = extractFloat128Frac0( a );
5501 aExp = extractFloat128Exp( a );
5502 aSign = extractFloat128Sign( a );
5503 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5504 shiftCount = 0x402F - aExp;
5505 if ( shiftCount <= 0 ) {
5506 if ( 0x403E < aExp ) {
5507 float_raise( float_flag_invalid STATUS_VAR);
5508 if ( ! aSign
5509 || ( ( aExp == 0x7FFF )
5510 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5511 )
5512 ) {
5513 return LIT64( 0x7FFFFFFFFFFFFFFF );
5514 }
bb98fe42 5515 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5516 }
5517 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5518 }
5519 else {
5520 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5521 }
5522 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5523
5524}
5525
5526/*----------------------------------------------------------------------------
5527| Returns the result of converting the quadruple-precision floating-point
5528| value `a' to the 64-bit two's complement integer format. The conversion
5529| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5530| Arithmetic, except that the conversion is always rounded toward zero.
5531| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5532| the conversion overflows, the largest integer with the same sign as `a' is
5533| returned.
5534*----------------------------------------------------------------------------*/
5535
5536int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5537{
5538 flag aSign;
5539 int32 aExp, shiftCount;
bb98fe42 5540 uint64_t aSig0, aSig1;
158142c2
FB
5541 int64 z;
5542
5543 aSig1 = extractFloat128Frac1( a );
5544 aSig0 = extractFloat128Frac0( a );
5545 aExp = extractFloat128Exp( a );
5546 aSign = extractFloat128Sign( a );
5547 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5548 shiftCount = aExp - 0x402F;
5549 if ( 0 < shiftCount ) {
5550 if ( 0x403E <= aExp ) {
5551 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5552 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5553 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5554 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5555 }
5556 else {
5557 float_raise( float_flag_invalid STATUS_VAR);
5558 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5559 return LIT64( 0x7FFFFFFFFFFFFFFF );
5560 }
5561 }
bb98fe42 5562 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5563 }
5564 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 5565 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
158142c2
FB
5566 STATUS(float_exception_flags) |= float_flag_inexact;
5567 }
5568 }
5569 else {
5570 if ( aExp < 0x3FFF ) {
5571 if ( aExp | aSig0 | aSig1 ) {
5572 STATUS(float_exception_flags) |= float_flag_inexact;
5573 }
5574 return 0;
5575 }
5576 z = aSig0>>( - shiftCount );
5577 if ( aSig1
bb98fe42 5578 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
158142c2
FB
5579 STATUS(float_exception_flags) |= float_flag_inexact;
5580 }
5581 }
5582 if ( aSign ) z = - z;
5583 return z;
5584
5585}
5586
5587/*----------------------------------------------------------------------------
5588| Returns the result of converting the quadruple-precision floating-point
5589| value `a' to the single-precision floating-point format. The conversion
5590| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5591| Arithmetic.
5592*----------------------------------------------------------------------------*/
5593
5594float32 float128_to_float32( float128 a STATUS_PARAM )
5595{
5596 flag aSign;
5597 int32 aExp;
bb98fe42
AF
5598 uint64_t aSig0, aSig1;
5599 uint32_t zSig;
158142c2
FB
5600
5601 aSig1 = extractFloat128Frac1( a );
5602 aSig0 = extractFloat128Frac0( a );
5603 aExp = extractFloat128Exp( a );
5604 aSign = extractFloat128Sign( a );
5605 if ( aExp == 0x7FFF ) {
5606 if ( aSig0 | aSig1 ) {
bcd4d9af 5607 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5608 }
5609 return packFloat32( aSign, 0xFF, 0 );
5610 }
5611 aSig0 |= ( aSig1 != 0 );
5612 shift64RightJamming( aSig0, 18, &aSig0 );
5613 zSig = aSig0;
5614 if ( aExp || zSig ) {
5615 zSig |= 0x40000000;
5616 aExp -= 0x3F81;
5617 }
5618 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5619
5620}
5621
5622/*----------------------------------------------------------------------------
5623| Returns the result of converting the quadruple-precision floating-point
5624| value `a' to the double-precision floating-point format. The conversion
5625| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5626| Arithmetic.
5627*----------------------------------------------------------------------------*/
5628
5629float64 float128_to_float64( float128 a STATUS_PARAM )
5630{
5631 flag aSign;
5632 int32 aExp;
bb98fe42 5633 uint64_t aSig0, aSig1;
158142c2
FB
5634
5635 aSig1 = extractFloat128Frac1( a );
5636 aSig0 = extractFloat128Frac0( a );
5637 aExp = extractFloat128Exp( a );
5638 aSign = extractFloat128Sign( a );
5639 if ( aExp == 0x7FFF ) {
5640 if ( aSig0 | aSig1 ) {
bcd4d9af 5641 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5642 }
5643 return packFloat64( aSign, 0x7FF, 0 );
5644 }
5645 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5646 aSig0 |= ( aSig1 != 0 );
5647 if ( aExp || aSig0 ) {
5648 aSig0 |= LIT64( 0x4000000000000000 );
5649 aExp -= 0x3C01;
5650 }
5651 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5652
5653}
5654
158142c2
FB
5655/*----------------------------------------------------------------------------
5656| Returns the result of converting the quadruple-precision floating-point
5657| value `a' to the extended double-precision floating-point format. The
5658| conversion is performed according to the IEC/IEEE Standard for Binary
5659| Floating-Point Arithmetic.
5660*----------------------------------------------------------------------------*/
5661
5662floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5663{
5664 flag aSign;
5665 int32 aExp;
bb98fe42 5666 uint64_t aSig0, aSig1;
158142c2
FB
5667
5668 aSig1 = extractFloat128Frac1( a );
5669 aSig0 = extractFloat128Frac0( a );
5670 aExp = extractFloat128Exp( a );
5671 aSign = extractFloat128Sign( a );
5672 if ( aExp == 0x7FFF ) {
5673 if ( aSig0 | aSig1 ) {
bcd4d9af 5674 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5675 }
5676 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5677 }
5678 if ( aExp == 0 ) {
5679 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5680 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5681 }
5682 else {
5683 aSig0 |= LIT64( 0x0001000000000000 );
5684 }
5685 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5686 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5687
5688}
5689
158142c2
FB
5690/*----------------------------------------------------------------------------
5691| Rounds the quadruple-precision floating-point value `a' to an integer, and
5692| returns the result as a quadruple-precision floating-point value. The
5693| operation is performed according to the IEC/IEEE Standard for Binary
5694| Floating-Point Arithmetic.
5695*----------------------------------------------------------------------------*/
5696
5697float128 float128_round_to_int( float128 a STATUS_PARAM )
5698{
5699 flag aSign;
5700 int32 aExp;
bb98fe42 5701 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5702 int8 roundingMode;
5703 float128 z;
5704
5705 aExp = extractFloat128Exp( a );
5706 if ( 0x402F <= aExp ) {
5707 if ( 0x406F <= aExp ) {
5708 if ( ( aExp == 0x7FFF )
5709 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5710 ) {
5711 return propagateFloat128NaN( a, a STATUS_VAR );
5712 }
5713 return a;
5714 }
5715 lastBitMask = 1;
5716 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5717 roundBitsMask = lastBitMask - 1;
5718 z = a;
5719 roundingMode = STATUS(float_rounding_mode);
5720 if ( roundingMode == float_round_nearest_even ) {
5721 if ( lastBitMask ) {
5722 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5723 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5724 }
5725 else {
bb98fe42 5726 if ( (int64_t) z.low < 0 ) {
158142c2 5727 ++z.high;
bb98fe42 5728 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
5729 }
5730 }
5731 }
5732 else if ( roundingMode != float_round_to_zero ) {
5733 if ( extractFloat128Sign( z )
5734 ^ ( roundingMode == float_round_up ) ) {
5735 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5736 }
5737 }
5738 z.low &= ~ roundBitsMask;
5739 }
5740 else {
5741 if ( aExp < 0x3FFF ) {
bb98fe42 5742 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
158142c2
FB
5743 STATUS(float_exception_flags) |= float_flag_inexact;
5744 aSign = extractFloat128Sign( a );
5745 switch ( STATUS(float_rounding_mode) ) {
5746 case float_round_nearest_even:
5747 if ( ( aExp == 0x3FFE )
5748 && ( extractFloat128Frac0( a )
5749 | extractFloat128Frac1( a ) )
5750 ) {
5751 return packFloat128( aSign, 0x3FFF, 0, 0 );
5752 }
5753 break;
5754 case float_round_down:
5755 return
5756 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5757 : packFloat128( 0, 0, 0, 0 );
5758 case float_round_up:
5759 return
5760 aSign ? packFloat128( 1, 0, 0, 0 )
5761 : packFloat128( 0, 0x3FFF, 0, 0 );
5762 }
5763 return packFloat128( aSign, 0, 0, 0 );
5764 }
5765 lastBitMask = 1;
5766 lastBitMask <<= 0x402F - aExp;
5767 roundBitsMask = lastBitMask - 1;
5768 z.low = 0;
5769 z.high = a.high;
5770 roundingMode = STATUS(float_rounding_mode);
5771 if ( roundingMode == float_round_nearest_even ) {
5772 z.high += lastBitMask>>1;
5773 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5774 z.high &= ~ lastBitMask;
5775 }
5776 }
5777 else if ( roundingMode != float_round_to_zero ) {
5778 if ( extractFloat128Sign( z )
5779 ^ ( roundingMode == float_round_up ) ) {
5780 z.high |= ( a.low != 0 );
5781 z.high += roundBitsMask;
5782 }
5783 }
5784 z.high &= ~ roundBitsMask;
5785 }
5786 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5787 STATUS(float_exception_flags) |= float_flag_inexact;
5788 }
5789 return z;
5790
5791}
5792
5793/*----------------------------------------------------------------------------
5794| Returns the result of adding the absolute values of the quadruple-precision
5795| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5796| before being returned. `zSign' is ignored if the result is a NaN.
5797| The addition is performed according to the IEC/IEEE Standard for Binary
5798| Floating-Point Arithmetic.
5799*----------------------------------------------------------------------------*/
5800
5801static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5802{
5803 int32 aExp, bExp, zExp;
bb98fe42 5804 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
158142c2
FB
5805 int32 expDiff;
5806
5807 aSig1 = extractFloat128Frac1( a );
5808 aSig0 = extractFloat128Frac0( a );
5809 aExp = extractFloat128Exp( a );
5810 bSig1 = extractFloat128Frac1( b );
5811 bSig0 = extractFloat128Frac0( b );
5812 bExp = extractFloat128Exp( b );
5813 expDiff = aExp - bExp;
5814 if ( 0 < expDiff ) {
5815 if ( aExp == 0x7FFF ) {
5816 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5817 return a;
5818 }
5819 if ( bExp == 0 ) {
5820 --expDiff;
5821 }
5822 else {
5823 bSig0 |= LIT64( 0x0001000000000000 );
5824 }
5825 shift128ExtraRightJamming(
5826 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5827 zExp = aExp;
5828 }
5829 else if ( expDiff < 0 ) {
5830 if ( bExp == 0x7FFF ) {
5831 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5832 return packFloat128( zSign, 0x7FFF, 0, 0 );
5833 }
5834 if ( aExp == 0 ) {
5835 ++expDiff;
5836 }
5837 else {
5838 aSig0 |= LIT64( 0x0001000000000000 );
5839 }
5840 shift128ExtraRightJamming(
5841 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5842 zExp = bExp;
5843 }
5844 else {
5845 if ( aExp == 0x7FFF ) {
5846 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5847 return propagateFloat128NaN( a, b STATUS_VAR );
5848 }
5849 return a;
5850 }
5851 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 5852 if ( aExp == 0 ) {
e6afc87f
PM
5853 if (STATUS(flush_to_zero)) {
5854 if (zSig0 | zSig1) {
5855 float_raise(float_flag_output_denormal STATUS_VAR);
5856 }
5857 return packFloat128(zSign, 0, 0, 0);
5858 }
fe76d976
PB
5859 return packFloat128( zSign, 0, zSig0, zSig1 );
5860 }
158142c2
FB
5861 zSig2 = 0;
5862 zSig0 |= LIT64( 0x0002000000000000 );
5863 zExp = aExp;
5864 goto shiftRight1;
5865 }
5866 aSig0 |= LIT64( 0x0001000000000000 );
5867 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5868 --zExp;
5869 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5870 ++zExp;
5871 shiftRight1:
5872 shift128ExtraRightJamming(
5873 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5874 roundAndPack:
5875 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5876
5877}
5878
5879/*----------------------------------------------------------------------------
5880| Returns the result of subtracting the absolute values of the quadruple-
5881| precision floating-point values `a' and `b'. If `zSign' is 1, the
5882| difference is negated before being returned. `zSign' is ignored if the
5883| result is a NaN. The subtraction is performed according to the IEC/IEEE
5884| Standard for Binary Floating-Point Arithmetic.
5885*----------------------------------------------------------------------------*/
5886
5887static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5888{
5889 int32 aExp, bExp, zExp;
bb98fe42 5890 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
158142c2
FB
5891 int32 expDiff;
5892 float128 z;
5893
5894 aSig1 = extractFloat128Frac1( a );
5895 aSig0 = extractFloat128Frac0( a );
5896 aExp = extractFloat128Exp( a );
5897 bSig1 = extractFloat128Frac1( b );
5898 bSig0 = extractFloat128Frac0( b );
5899 bExp = extractFloat128Exp( b );
5900 expDiff = aExp - bExp;
5901 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5902 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5903 if ( 0 < expDiff ) goto aExpBigger;
5904 if ( expDiff < 0 ) goto bExpBigger;
5905 if ( aExp == 0x7FFF ) {
5906 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5907 return propagateFloat128NaN( a, b STATUS_VAR );
5908 }
5909 float_raise( float_flag_invalid STATUS_VAR);
5910 z.low = float128_default_nan_low;
5911 z.high = float128_default_nan_high;
5912 return z;
5913 }
5914 if ( aExp == 0 ) {
5915 aExp = 1;
5916 bExp = 1;
5917 }
5918 if ( bSig0 < aSig0 ) goto aBigger;
5919 if ( aSig0 < bSig0 ) goto bBigger;
5920 if ( bSig1 < aSig1 ) goto aBigger;
5921 if ( aSig1 < bSig1 ) goto bBigger;
5922 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5923 bExpBigger:
5924 if ( bExp == 0x7FFF ) {
5925 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5926 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5927 }
5928 if ( aExp == 0 ) {
5929 ++expDiff;
5930 }
5931 else {
5932 aSig0 |= LIT64( 0x4000000000000000 );
5933 }
5934 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5935 bSig0 |= LIT64( 0x4000000000000000 );
5936 bBigger:
5937 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5938 zExp = bExp;
5939 zSign ^= 1;
5940 goto normalizeRoundAndPack;
5941 aExpBigger:
5942 if ( aExp == 0x7FFF ) {
5943 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5944 return a;
5945 }
5946 if ( bExp == 0 ) {
5947 --expDiff;
5948 }
5949 else {
5950 bSig0 |= LIT64( 0x4000000000000000 );
5951 }
5952 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5953 aSig0 |= LIT64( 0x4000000000000000 );
5954 aBigger:
5955 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5956 zExp = aExp;
5957 normalizeRoundAndPack:
5958 --zExp;
5959 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5960
5961}
5962
5963/*----------------------------------------------------------------------------
5964| Returns the result of adding the quadruple-precision floating-point values
5965| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5966| for Binary Floating-Point Arithmetic.
5967*----------------------------------------------------------------------------*/
5968
5969float128 float128_add( float128 a, float128 b STATUS_PARAM )
5970{
5971 flag aSign, bSign;
5972
5973 aSign = extractFloat128Sign( a );
5974 bSign = extractFloat128Sign( b );
5975 if ( aSign == bSign ) {
5976 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5977 }
5978 else {
5979 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5980 }
5981
5982}
5983
5984/*----------------------------------------------------------------------------
5985| Returns the result of subtracting the quadruple-precision floating-point
5986| values `a' and `b'. The operation is performed according to the IEC/IEEE
5987| Standard for Binary Floating-Point Arithmetic.
5988*----------------------------------------------------------------------------*/
5989
5990float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5991{
5992 flag aSign, bSign;
5993
5994 aSign = extractFloat128Sign( a );
5995 bSign = extractFloat128Sign( b );
5996 if ( aSign == bSign ) {
5997 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5998 }
5999 else {
6000 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6001 }
6002
6003}
6004
6005/*----------------------------------------------------------------------------
6006| Returns the result of multiplying the quadruple-precision floating-point
6007| values `a' and `b'. The operation is performed according to the IEC/IEEE
6008| Standard for Binary Floating-Point Arithmetic.
6009*----------------------------------------------------------------------------*/
6010
6011float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6012{
6013 flag aSign, bSign, zSign;
6014 int32 aExp, bExp, zExp;
bb98fe42 6015 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6016 float128 z;
6017
6018 aSig1 = extractFloat128Frac1( a );
6019 aSig0 = extractFloat128Frac0( a );
6020 aExp = extractFloat128Exp( a );
6021 aSign = extractFloat128Sign( a );
6022 bSig1 = extractFloat128Frac1( b );
6023 bSig0 = extractFloat128Frac0( b );
6024 bExp = extractFloat128Exp( b );
6025 bSign = extractFloat128Sign( b );
6026 zSign = aSign ^ bSign;
6027 if ( aExp == 0x7FFF ) {
6028 if ( ( aSig0 | aSig1 )
6029 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6030 return propagateFloat128NaN( a, b STATUS_VAR );
6031 }
6032 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6033 return packFloat128( zSign, 0x7FFF, 0, 0 );
6034 }
6035 if ( bExp == 0x7FFF ) {
6036 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6037 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6038 invalid:
6039 float_raise( float_flag_invalid STATUS_VAR);
6040 z.low = float128_default_nan_low;
6041 z.high = float128_default_nan_high;
6042 return z;
6043 }
6044 return packFloat128( zSign, 0x7FFF, 0, 0 );
6045 }
6046 if ( aExp == 0 ) {
6047 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6048 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6049 }
6050 if ( bExp == 0 ) {
6051 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6052 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6053 }
6054 zExp = aExp + bExp - 0x4000;
6055 aSig0 |= LIT64( 0x0001000000000000 );
6056 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6057 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6058 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6059 zSig2 |= ( zSig3 != 0 );
6060 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6061 shift128ExtraRightJamming(
6062 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6063 ++zExp;
6064 }
6065 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6066
6067}
6068
6069/*----------------------------------------------------------------------------
6070| Returns the result of dividing the quadruple-precision floating-point value
6071| `a' by the corresponding value `b'. The operation is performed according to
6072| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6073*----------------------------------------------------------------------------*/
6074
6075float128 float128_div( float128 a, float128 b STATUS_PARAM )
6076{
6077 flag aSign, bSign, zSign;
6078 int32 aExp, bExp, zExp;
bb98fe42
AF
6079 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6080 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6081 float128 z;
6082
6083 aSig1 = extractFloat128Frac1( a );
6084 aSig0 = extractFloat128Frac0( a );
6085 aExp = extractFloat128Exp( a );
6086 aSign = extractFloat128Sign( a );
6087 bSig1 = extractFloat128Frac1( b );
6088 bSig0 = extractFloat128Frac0( b );
6089 bExp = extractFloat128Exp( b );
6090 bSign = extractFloat128Sign( b );
6091 zSign = aSign ^ bSign;
6092 if ( aExp == 0x7FFF ) {
6093 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6094 if ( bExp == 0x7FFF ) {
6095 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6096 goto invalid;
6097 }
6098 return packFloat128( zSign, 0x7FFF, 0, 0 );
6099 }
6100 if ( bExp == 0x7FFF ) {
6101 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6102 return packFloat128( zSign, 0, 0, 0 );
6103 }
6104 if ( bExp == 0 ) {
6105 if ( ( bSig0 | bSig1 ) == 0 ) {
6106 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6107 invalid:
6108 float_raise( float_flag_invalid STATUS_VAR);
6109 z.low = float128_default_nan_low;
6110 z.high = float128_default_nan_high;
6111 return z;
6112 }
6113 float_raise( float_flag_divbyzero STATUS_VAR);
6114 return packFloat128( zSign, 0x7FFF, 0, 0 );
6115 }
6116 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6117 }
6118 if ( aExp == 0 ) {
6119 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6120 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6121 }
6122 zExp = aExp - bExp + 0x3FFD;
6123 shortShift128Left(
6124 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6125 shortShift128Left(
6126 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6127 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6128 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6129 ++zExp;
6130 }
6131 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6132 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6133 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6134 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6135 --zSig0;
6136 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6137 }
6138 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6139 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6140 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6141 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6142 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6143 --zSig1;
6144 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6145 }
6146 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6147 }
6148 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6149 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6150
6151}
6152
6153/*----------------------------------------------------------------------------
6154| Returns the remainder of the quadruple-precision floating-point value `a'
6155| with respect to the corresponding value `b'. The operation is performed
6156| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6157*----------------------------------------------------------------------------*/
6158
6159float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6160{
ed086f3d 6161 flag aSign, zSign;
158142c2 6162 int32 aExp, bExp, expDiff;
bb98fe42
AF
6163 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6164 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6165 int64_t sigMean0;
158142c2
FB
6166 float128 z;
6167
6168 aSig1 = extractFloat128Frac1( a );
6169 aSig0 = extractFloat128Frac0( a );
6170 aExp = extractFloat128Exp( a );
6171 aSign = extractFloat128Sign( a );
6172 bSig1 = extractFloat128Frac1( b );
6173 bSig0 = extractFloat128Frac0( b );
6174 bExp = extractFloat128Exp( b );
158142c2
FB
6175 if ( aExp == 0x7FFF ) {
6176 if ( ( aSig0 | aSig1 )
6177 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6178 return propagateFloat128NaN( a, b STATUS_VAR );
6179 }
6180 goto invalid;
6181 }
6182 if ( bExp == 0x7FFF ) {
6183 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6184 return a;
6185 }
6186 if ( bExp == 0 ) {
6187 if ( ( bSig0 | bSig1 ) == 0 ) {
6188 invalid:
6189 float_raise( float_flag_invalid STATUS_VAR);
6190 z.low = float128_default_nan_low;
6191 z.high = float128_default_nan_high;
6192 return z;
6193 }
6194 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6195 }
6196 if ( aExp == 0 ) {
6197 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6198 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6199 }
6200 expDiff = aExp - bExp;
6201 if ( expDiff < -1 ) return a;
6202 shortShift128Left(
6203 aSig0 | LIT64( 0x0001000000000000 ),
6204 aSig1,
6205 15 - ( expDiff < 0 ),
6206 &aSig0,
6207 &aSig1
6208 );
6209 shortShift128Left(
6210 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6211 q = le128( bSig0, bSig1, aSig0, aSig1 );
6212 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6213 expDiff -= 64;
6214 while ( 0 < expDiff ) {
6215 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6216 q = ( 4 < q ) ? q - 4 : 0;
6217 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6218 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6219 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6220 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6221 expDiff -= 61;
6222 }
6223 if ( -64 < expDiff ) {
6224 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6225 q = ( 4 < q ) ? q - 4 : 0;
6226 q >>= - expDiff;
6227 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6228 expDiff += 52;
6229 if ( expDiff < 0 ) {
6230 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6231 }
6232 else {
6233 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6234 }
6235 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6236 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6237 }
6238 else {
6239 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6240 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6241 }
6242 do {
6243 alternateASig0 = aSig0;
6244 alternateASig1 = aSig1;
6245 ++q;
6246 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6247 } while ( 0 <= (int64_t) aSig0 );
158142c2 6248 add128(
bb98fe42 6249 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6250 if ( ( sigMean0 < 0 )
6251 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6252 aSig0 = alternateASig0;
6253 aSig1 = alternateASig1;
6254 }
bb98fe42 6255 zSign = ( (int64_t) aSig0 < 0 );
158142c2
FB
6256 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6257 return
6258 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6259
6260}
6261
6262/*----------------------------------------------------------------------------
6263| Returns the square root of the quadruple-precision floating-point value `a'.
6264| The operation is performed according to the IEC/IEEE Standard for Binary
6265| Floating-Point Arithmetic.
6266*----------------------------------------------------------------------------*/
6267
6268float128 float128_sqrt( float128 a STATUS_PARAM )
6269{
6270 flag aSign;
6271 int32 aExp, zExp;
bb98fe42
AF
6272 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6273 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6274 float128 z;
6275
6276 aSig1 = extractFloat128Frac1( a );
6277 aSig0 = extractFloat128Frac0( a );
6278 aExp = extractFloat128Exp( a );
6279 aSign = extractFloat128Sign( a );
6280 if ( aExp == 0x7FFF ) {
6281 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6282 if ( ! aSign ) return a;
6283 goto invalid;
6284 }
6285 if ( aSign ) {
6286 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6287 invalid:
6288 float_raise( float_flag_invalid STATUS_VAR);
6289 z.low = float128_default_nan_low;
6290 z.high = float128_default_nan_high;
6291 return z;
6292 }
6293 if ( aExp == 0 ) {
6294 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6295 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6296 }
6297 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6298 aSig0 |= LIT64( 0x0001000000000000 );
6299 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6300 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6301 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6302 doubleZSig0 = zSig0<<1;
6303 mul64To128( zSig0, zSig0, &term0, &term1 );
6304 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6305 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6306 --zSig0;
6307 doubleZSig0 -= 2;
6308 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6309 }
6310 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6311 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6312 if ( zSig1 == 0 ) zSig1 = 1;
6313 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6314 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6315 mul64To128( zSig1, zSig1, &term2, &term3 );
6316 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6317 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6318 --zSig1;
6319 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6320 term3 |= 1;
6321 term2 |= doubleZSig0;
6322 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6323 }
6324 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6325 }
6326 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6327 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6328
6329}
6330
6331/*----------------------------------------------------------------------------
6332| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6333| the corresponding value `b', and 0 otherwise. The invalid exception is
6334| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6335| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6336*----------------------------------------------------------------------------*/
6337
b689362d 6338int float128_eq( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6339{
6340
6341 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6342 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6343 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6344 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6345 ) {
b689362d 6346 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
6347 return 0;
6348 }
6349 return
6350 ( a.low == b.low )
6351 && ( ( a.high == b.high )
6352 || ( ( a.low == 0 )
bb98fe42 6353 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6354 );
6355
6356}
6357
6358/*----------------------------------------------------------------------------
6359| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6360| or equal to the corresponding value `b', and 0 otherwise. The invalid
6361| exception is raised if either operand is a NaN. The comparison is performed
6362| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6363*----------------------------------------------------------------------------*/
6364
750afe93 6365int float128_le( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6366{
6367 flag aSign, bSign;
6368
6369 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6370 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6371 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6372 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6373 ) {
6374 float_raise( float_flag_invalid STATUS_VAR);
6375 return 0;
6376 }
6377 aSign = extractFloat128Sign( a );
6378 bSign = extractFloat128Sign( b );
6379 if ( aSign != bSign ) {
6380 return
6381 aSign
bb98fe42 6382 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6383 == 0 );
6384 }
6385 return
6386 aSign ? le128( b.high, b.low, a.high, a.low )
6387 : le128( a.high, a.low, b.high, b.low );
6388
6389}
6390
6391/*----------------------------------------------------------------------------
6392| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6393| the corresponding value `b', and 0 otherwise. The invalid exception is
6394| raised if either operand is a NaN. The comparison is performed according
6395| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6396*----------------------------------------------------------------------------*/
6397
750afe93 6398int float128_lt( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6399{
6400 flag aSign, bSign;
6401
6402 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6403 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6404 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6405 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6406 ) {
6407 float_raise( float_flag_invalid STATUS_VAR);
6408 return 0;
6409 }
6410 aSign = extractFloat128Sign( a );
6411 bSign = extractFloat128Sign( b );
6412 if ( aSign != bSign ) {
6413 return
6414 aSign
bb98fe42 6415 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6416 != 0 );
6417 }
6418 return
6419 aSign ? lt128( b.high, b.low, a.high, a.low )
6420 : lt128( a.high, a.low, b.high, b.low );
6421
6422}
6423
67b7861d
AJ
6424/*----------------------------------------------------------------------------
6425| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6426| be compared, and 0 otherwise. The invalid exception is raised if either
6427| operand is a NaN. The comparison is performed according to the IEC/IEEE
6428| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6429*----------------------------------------------------------------------------*/
6430
6431int float128_unordered( float128 a, float128 b STATUS_PARAM )
6432{
6433 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6434 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6435 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6436 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6437 ) {
6438 float_raise( float_flag_invalid STATUS_VAR);
6439 return 1;
6440 }
6441 return 0;
6442}
6443
158142c2
FB
6444/*----------------------------------------------------------------------------
6445| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6446| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6447| exception. The comparison is performed according to the IEC/IEEE Standard
6448| for Binary Floating-Point Arithmetic.
158142c2
FB
6449*----------------------------------------------------------------------------*/
6450
b689362d 6451int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6452{
6453
6454 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6455 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6456 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6457 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6458 ) {
b689362d
AJ
6459 if ( float128_is_signaling_nan( a )
6460 || float128_is_signaling_nan( b ) ) {
6461 float_raise( float_flag_invalid STATUS_VAR);
6462 }
158142c2
FB
6463 return 0;
6464 }
6465 return
6466 ( a.low == b.low )
6467 && ( ( a.high == b.high )
6468 || ( ( a.low == 0 )
bb98fe42 6469 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6470 );
6471
6472}
6473
6474/*----------------------------------------------------------------------------
6475| Returns 1 if the quadruple-precision floating-point value `a' is less than
6476| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6477| cause an exception. Otherwise, the comparison is performed according to the
6478| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6479*----------------------------------------------------------------------------*/
6480
750afe93 6481int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6482{
6483 flag aSign, bSign;
6484
6485 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6486 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6487 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6488 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6489 ) {
6490 if ( float128_is_signaling_nan( a )
6491 || float128_is_signaling_nan( b ) ) {
6492 float_raise( float_flag_invalid STATUS_VAR);
6493 }
6494 return 0;
6495 }
6496 aSign = extractFloat128Sign( a );
6497 bSign = extractFloat128Sign( b );
6498 if ( aSign != bSign ) {
6499 return
6500 aSign
bb98fe42 6501 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6502 == 0 );
6503 }
6504 return
6505 aSign ? le128( b.high, b.low, a.high, a.low )
6506 : le128( a.high, a.low, b.high, b.low );
6507
6508}
6509
6510/*----------------------------------------------------------------------------
6511| Returns 1 if the quadruple-precision floating-point value `a' is less than
6512| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6513| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6514| Standard for Binary Floating-Point Arithmetic.
6515*----------------------------------------------------------------------------*/
6516
750afe93 6517int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6518{
6519 flag aSign, bSign;
6520
6521 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6522 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6523 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6524 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6525 ) {
6526 if ( float128_is_signaling_nan( a )
6527 || float128_is_signaling_nan( b ) ) {
6528 float_raise( float_flag_invalid STATUS_VAR);
6529 }
6530 return 0;
6531 }
6532 aSign = extractFloat128Sign( a );
6533 bSign = extractFloat128Sign( b );
6534 if ( aSign != bSign ) {
6535 return
6536 aSign
bb98fe42 6537 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6538 != 0 );
6539 }
6540 return
6541 aSign ? lt128( b.high, b.low, a.high, a.low )
6542 : lt128( a.high, a.low, b.high, b.low );
6543
6544}
6545
67b7861d
AJ
6546/*----------------------------------------------------------------------------
6547| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6548| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6549| comparison is performed according to the IEC/IEEE Standard for Binary
6550| Floating-Point Arithmetic.
6551*----------------------------------------------------------------------------*/
6552
6553int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6554{
6555 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6556 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6557 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6558 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6559 ) {
6560 if ( float128_is_signaling_nan( a )
6561 || float128_is_signaling_nan( b ) ) {
6562 float_raise( float_flag_invalid STATUS_VAR);
6563 }
6564 return 1;
6565 }
6566 return 0;
6567}
6568
1d6bda35 6569/* misc functions */
c4850f9e 6570float32 uint32_to_float32(uint32_t a STATUS_PARAM)
1d6bda35
FB
6571{
6572 return int64_to_float32(a STATUS_VAR);
6573}
6574
c4850f9e 6575float64 uint32_to_float64(uint32_t a STATUS_PARAM)
1d6bda35
FB
6576{
6577 return int64_to_float64(a STATUS_VAR);
6578}
6579
9f8d2a09 6580uint32 float32_to_uint32( float32 a STATUS_PARAM )
1d6bda35
FB
6581{
6582 int64_t v;
9f8d2a09 6583 uint32 res;
34e1c27b 6584 int old_exc_flags = get_float_exception_flags(status);
1d6bda35
FB
6585
6586 v = float32_to_int64(a STATUS_VAR);
6587 if (v < 0) {
6588 res = 0;
1d6bda35
FB
6589 } else if (v > 0xffffffff) {
6590 res = 0xffffffff;
1d6bda35 6591 } else {
34e1c27b 6592 return v;
1d6bda35 6593 }
34e1c27b
PM
6594 set_float_exception_flags(old_exc_flags, status);
6595 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6596 return res;
6597}
6598
9f8d2a09 6599uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
1d6bda35
FB
6600{
6601 int64_t v;
9f8d2a09 6602 uint32 res;
34e1c27b 6603 int old_exc_flags = get_float_exception_flags(status);
1d6bda35
FB
6604
6605 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6606 if (v < 0) {
6607 res = 0;
1d6bda35
FB
6608 } else if (v > 0xffffffff) {
6609 res = 0xffffffff;
1d6bda35 6610 } else {
34e1c27b 6611 return v;
1d6bda35 6612 }
34e1c27b
PM
6613 set_float_exception_flags(old_exc_flags, status);
6614 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6615 return res;
6616}
6617
f581bf54
WN
6618int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6619{
6620 int32_t v;
6621 int_fast16_t res;
6622 int old_exc_flags = get_float_exception_flags(status);
6623
6624 v = float32_to_int32(a STATUS_VAR);
6625 if (v < -0x8000) {
6626 res = -0x8000;
6627 } else if (v > 0x7fff) {
6628 res = 0x7fff;
6629 } else {
6630 return v;
6631 }
6632
6633 set_float_exception_flags(old_exc_flags, status);
6634 float_raise(float_flag_invalid STATUS_VAR);
6635 return res;
6636}
6637
6638uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6639{
6640 int32_t v;
6641 uint_fast16_t res;
6642 int old_exc_flags = get_float_exception_flags(status);
6643
6644 v = float32_to_int32(a STATUS_VAR);
6645 if (v < 0) {
6646 res = 0;
6647 } else if (v > 0xffff) {
6648 res = 0xffff;
6649 } else {
6650 return v;
6651 }
6652
6653 set_float_exception_flags(old_exc_flags, status);
6654 float_raise(float_flag_invalid STATUS_VAR);
6655 return res;
6656}
6657
5aea4c58 6658uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
6659{
6660 int64_t v;
5aea4c58 6661 uint_fast16_t res;
34e1c27b 6662 int old_exc_flags = get_float_exception_flags(status);
cbcef455
PM
6663
6664 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6665 if (v < 0) {
6666 res = 0;
cbcef455
PM
6667 } else if (v > 0xffff) {
6668 res = 0xffff;
cbcef455 6669 } else {
34e1c27b 6670 return v;
cbcef455 6671 }
34e1c27b
PM
6672 set_float_exception_flags(old_exc_flags, status);
6673 float_raise(float_flag_invalid STATUS_VAR);
cbcef455
PM
6674 return res;
6675}
6676
9f8d2a09 6677uint32 float64_to_uint32( float64 a STATUS_PARAM )
1d6bda35 6678{
5e7f654f 6679 uint64_t v;
9f8d2a09 6680 uint32 res;
5e7f654f 6681 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 6682
5e7f654f
TM
6683 v = float64_to_uint64(a STATUS_VAR);
6684 if (v > 0xffffffff) {
1d6bda35 6685 res = 0xffffffff;
1d6bda35 6686 } else {
5e7f654f 6687 return v;
1d6bda35 6688 }
5e7f654f
TM
6689 set_float_exception_flags(old_exc_flags, status);
6690 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6691 return res;
6692}
6693
9f8d2a09 6694uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
1d6bda35 6695{
fd728f2f 6696 uint64_t v;
9f8d2a09 6697 uint32 res;
fd728f2f 6698 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 6699
fd728f2f
TM
6700 v = float64_to_uint64_round_to_zero(a STATUS_VAR);
6701 if (v > 0xffffffff) {
1d6bda35 6702 res = 0xffffffff;
1d6bda35 6703 } else {
fd728f2f 6704 return v;
1d6bda35 6705 }
fd728f2f
TM
6706 set_float_exception_flags(old_exc_flags, status);
6707 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6708 return res;
6709}
6710
f581bf54
WN
6711int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6712{
6713 int64_t v;
6714 int_fast16_t res;
6715 int old_exc_flags = get_float_exception_flags(status);
6716
6717 v = float64_to_int32(a STATUS_VAR);
6718 if (v < -0x8000) {
6719 res = -0x8000;
6720 } else if (v > 0x7fff) {
6721 res = 0x7fff;
6722 } else {
6723 return v;
6724 }
6725
6726 set_float_exception_flags(old_exc_flags, status);
6727 float_raise(float_flag_invalid STATUS_VAR);
6728 return res;
6729}
6730
6731uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6732{
6733 int64_t v;
6734 uint_fast16_t res;
6735 int old_exc_flags = get_float_exception_flags(status);
6736
6737 v = float64_to_int32(a STATUS_VAR);
6738 if (v < 0) {
6739 res = 0;
6740 } else if (v > 0xffff) {
6741 res = 0xffff;
6742 } else {
6743 return v;
6744 }
6745
6746 set_float_exception_flags(old_exc_flags, status);
6747 float_raise(float_flag_invalid STATUS_VAR);
6748 return res;
6749}
6750
5aea4c58 6751uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
6752{
6753 int64_t v;
5aea4c58 6754 uint_fast16_t res;
34e1c27b 6755 int old_exc_flags = get_float_exception_flags(status);
cbcef455
PM
6756
6757 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6758 if (v < 0) {
6759 res = 0;
cbcef455
PM
6760 } else if (v > 0xffff) {
6761 res = 0xffff;
cbcef455 6762 } else {
34e1c27b 6763 return v;
cbcef455 6764 }
34e1c27b
PM
6765 set_float_exception_flags(old_exc_flags, status);
6766 float_raise(float_flag_invalid STATUS_VAR);
cbcef455
PM
6767 return res;
6768}
6769
fb3ea83a
TM
6770/*----------------------------------------------------------------------------
6771| Returns the result of converting the double-precision floating-point value
6772| `a' to the 64-bit unsigned integer format. The conversion is
6773| performed according to the IEC/IEEE Standard for Binary Floating-Point
6774| Arithmetic---which means in particular that the conversion is rounded
6775| according to the current rounding mode. If `a' is a NaN, the largest
6776| positive integer is returned. If the conversion overflows, the
6777| largest unsigned integer is returned. If 'a' is negative, the value is
6778| rounded and zero is returned; negative values that do not round to zero
6779| will raise the inexact exception.
6780*----------------------------------------------------------------------------*/
75d62a58 6781
fb3ea83a
TM
6782uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6783{
6784 flag aSign;
6785 int_fast16_t aExp, shiftCount;
6786 uint64_t aSig, aSigExtra;
6787 a = float64_squash_input_denormal(a STATUS_VAR);
75d62a58 6788
fb3ea83a
TM
6789 aSig = extractFloat64Frac(a);
6790 aExp = extractFloat64Exp(a);
6791 aSign = extractFloat64Sign(a);
6792 if (aSign && (aExp > 1022)) {
6793 float_raise(float_flag_invalid STATUS_VAR);
6794 if (float64_is_any_nan(a)) {
6795 return LIT64(0xFFFFFFFFFFFFFFFF);
6796 } else {
6797 return 0;
6798 }
6799 }
6800 if (aExp) {
6801 aSig |= LIT64(0x0010000000000000);
6802 }
6803 shiftCount = 0x433 - aExp;
6804 if (shiftCount <= 0) {
6805 if (0x43E < aExp) {
6806 float_raise(float_flag_invalid STATUS_VAR);
6807 return LIT64(0xFFFFFFFFFFFFFFFF);
6808 }
6809 aSigExtra = 0;
6810 aSig <<= -shiftCount;
6811 } else {
6812 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
6813 }
6814 return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
75d62a58
JM
6815}
6816
6817uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6818{
0a87a310
TM
6819 signed char current_rounding_mode = STATUS(float_rounding_mode);
6820 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
6821 int64_t v = float64_to_uint64(a STATUS_VAR);
6822 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
6823 return v;
75d62a58
JM
6824}
6825
1d6bda35 6826#define COMPARE(s, nan_exp) \
750afe93 6827INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
1d6bda35
FB
6828 int is_quiet STATUS_PARAM ) \
6829{ \
6830 flag aSign, bSign; \
bb98fe42 6831 uint ## s ## _t av, bv; \
37d18660
PM
6832 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6833 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
1d6bda35
FB
6834 \
6835 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6836 extractFloat ## s ## Frac( a ) ) || \
6837 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6838 extractFloat ## s ## Frac( b ) )) { \
6839 if (!is_quiet || \
6840 float ## s ## _is_signaling_nan( a ) || \
6841 float ## s ## _is_signaling_nan( b ) ) { \
6842 float_raise( float_flag_invalid STATUS_VAR); \
6843 } \
6844 return float_relation_unordered; \
6845 } \
6846 aSign = extractFloat ## s ## Sign( a ); \
6847 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 6848 av = float ## s ## _val(a); \
cd8a2533 6849 bv = float ## s ## _val(b); \
1d6bda35 6850 if ( aSign != bSign ) { \
bb98fe42 6851 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
6852 /* zero case */ \
6853 return float_relation_equal; \
6854 } else { \
6855 return 1 - (2 * aSign); \
6856 } \
6857 } else { \
f090c9d4 6858 if (av == bv) { \
1d6bda35
FB
6859 return float_relation_equal; \
6860 } else { \
f090c9d4 6861 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
6862 } \
6863 } \
6864} \
6865 \
750afe93 6866int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6867{ \
6868 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6869} \
6870 \
750afe93 6871int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6872{ \
6873 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6874}
6875
6876COMPARE(32, 0xff)
6877COMPARE(64, 0x7ff)
9ee6e8bb 6878
f6714d36
AJ
6879INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6880 int is_quiet STATUS_PARAM )
6881{
6882 flag aSign, bSign;
6883
6884 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6885 ( extractFloatx80Frac( a )<<1 ) ) ||
6886 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6887 ( extractFloatx80Frac( b )<<1 ) )) {
6888 if (!is_quiet ||
6889 floatx80_is_signaling_nan( a ) ||
6890 floatx80_is_signaling_nan( b ) ) {
6891 float_raise( float_flag_invalid STATUS_VAR);
6892 }
6893 return float_relation_unordered;
6894 }
6895 aSign = extractFloatx80Sign( a );
6896 bSign = extractFloatx80Sign( b );
6897 if ( aSign != bSign ) {
6898
6899 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6900 ( ( a.low | b.low ) == 0 ) ) {
6901 /* zero case */
6902 return float_relation_equal;
6903 } else {
6904 return 1 - (2 * aSign);
6905 }
6906 } else {
6907 if (a.low == b.low && a.high == b.high) {
6908 return float_relation_equal;
6909 } else {
6910 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6911 }
6912 }
6913}
6914
6915int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6916{
6917 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6918}
6919
6920int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6921{
6922 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6923}
6924
1f587329
BS
6925INLINE int float128_compare_internal( float128 a, float128 b,
6926 int is_quiet STATUS_PARAM )
6927{
6928 flag aSign, bSign;
6929
6930 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6931 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6932 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6933 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6934 if (!is_quiet ||
6935 float128_is_signaling_nan( a ) ||
6936 float128_is_signaling_nan( b ) ) {
6937 float_raise( float_flag_invalid STATUS_VAR);
6938 }
6939 return float_relation_unordered;
6940 }
6941 aSign = extractFloat128Sign( a );
6942 bSign = extractFloat128Sign( b );
6943 if ( aSign != bSign ) {
6944 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6945 /* zero case */
6946 return float_relation_equal;
6947 } else {
6948 return 1 - (2 * aSign);
6949 }
6950 } else {
6951 if (a.low == b.low && a.high == b.high) {
6952 return float_relation_equal;
6953 } else {
6954 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6955 }
6956 }
6957}
6958
6959int float128_compare( float128 a, float128 b STATUS_PARAM )
6960{
6961 return float128_compare_internal(a, b, 0 STATUS_VAR);
6962}
6963
6964int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6965{
6966 return float128_compare_internal(a, b, 1 STATUS_VAR);
6967}
6968
274f1b04
PM
6969/* min() and max() functions. These can't be implemented as
6970 * 'compare and pick one input' because that would mishandle
6971 * NaNs and +0 vs -0.
e17ab310
WN
6972 *
6973 * minnum() and maxnum() functions. These are similar to the min()
6974 * and max() functions but if one of the arguments is a QNaN and
6975 * the other is numerical then the numerical argument is returned.
6976 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
6977 * and maxNum() operations. min() and max() are the typical min/max
6978 * semantics provided by many CPUs which predate that specification.
274f1b04 6979 */
e70614ea 6980#define MINMAX(s) \
274f1b04 6981INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
e17ab310 6982 int ismin, int isieee STATUS_PARAM) \
274f1b04
PM
6983{ \
6984 flag aSign, bSign; \
6985 uint ## s ## _t av, bv; \
6986 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6987 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6988 if (float ## s ## _is_any_nan(a) || \
6989 float ## s ## _is_any_nan(b)) { \
e17ab310
WN
6990 if (isieee) { \
6991 if (float ## s ## _is_quiet_nan(a) && \
6992 !float ## s ##_is_any_nan(b)) { \
6993 return b; \
6994 } else if (float ## s ## _is_quiet_nan(b) && \
6995 !float ## s ## _is_any_nan(a)) { \
6996 return a; \
6997 } \
6998 } \
274f1b04
PM
6999 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
7000 } \
7001 aSign = extractFloat ## s ## Sign(a); \
7002 bSign = extractFloat ## s ## Sign(b); \
7003 av = float ## s ## _val(a); \
7004 bv = float ## s ## _val(b); \
7005 if (aSign != bSign) { \
7006 if (ismin) { \
7007 return aSign ? a : b; \
7008 } else { \
7009 return aSign ? b : a; \
7010 } \
7011 } else { \
7012 if (ismin) { \
7013 return (aSign ^ (av < bv)) ? a : b; \
7014 } else { \
7015 return (aSign ^ (av < bv)) ? b : a; \
7016 } \
7017 } \
7018} \
7019 \
7020float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
7021{ \
e17ab310 7022 return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
274f1b04
PM
7023} \
7024 \
7025float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7026{ \
e17ab310
WN
7027 return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
7028} \
7029 \
7030float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7031{ \
7032 return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
7033} \
7034 \
7035float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7036{ \
7037 return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
274f1b04
PM
7038}
7039
e70614ea
WN
7040MINMAX(32)
7041MINMAX(64)
274f1b04
PM
7042
7043
9ee6e8bb
PB
7044/* Multiply A by 2 raised to the power N. */
7045float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7046{
7047 flag aSign;
326b9e98 7048 int16_t aExp;
bb98fe42 7049 uint32_t aSig;
9ee6e8bb 7050
37d18660 7051 a = float32_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
7052 aSig = extractFloat32Frac( a );
7053 aExp = extractFloat32Exp( a );
7054 aSign = extractFloat32Sign( a );
7055
7056 if ( aExp == 0xFF ) {
326b9e98
AJ
7057 if ( aSig ) {
7058 return propagateFloat32NaN( a, a STATUS_VAR );
7059 }
9ee6e8bb
PB
7060 return a;
7061 }
3c85c37f 7062 if (aExp != 0) {
69397542 7063 aSig |= 0x00800000;
3c85c37f 7064 } else if (aSig == 0) {
69397542 7065 return a;
3c85c37f
PM
7066 } else {
7067 aExp++;
7068 }
69397542 7069
326b9e98
AJ
7070 if (n > 0x200) {
7071 n = 0x200;
7072 } else if (n < -0x200) {
7073 n = -0x200;
7074 }
7075
69397542
PB
7076 aExp += n - 1;
7077 aSig <<= 7;
7078 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
7079}
7080
7081float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7082{
7083 flag aSign;
326b9e98 7084 int16_t aExp;
bb98fe42 7085 uint64_t aSig;
9ee6e8bb 7086
37d18660 7087 a = float64_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
7088 aSig = extractFloat64Frac( a );
7089 aExp = extractFloat64Exp( a );
7090 aSign = extractFloat64Sign( a );
7091
7092 if ( aExp == 0x7FF ) {
326b9e98
AJ
7093 if ( aSig ) {
7094 return propagateFloat64NaN( a, a STATUS_VAR );
7095 }
9ee6e8bb
PB
7096 return a;
7097 }
3c85c37f 7098 if (aExp != 0) {
69397542 7099 aSig |= LIT64( 0x0010000000000000 );
3c85c37f 7100 } else if (aSig == 0) {
69397542 7101 return a;
3c85c37f
PM
7102 } else {
7103 aExp++;
7104 }
69397542 7105
326b9e98
AJ
7106 if (n > 0x1000) {
7107 n = 0x1000;
7108 } else if (n < -0x1000) {
7109 n = -0x1000;
7110 }
7111
69397542
PB
7112 aExp += n - 1;
7113 aSig <<= 10;
7114 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
7115}
7116
9ee6e8bb
PB
7117floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7118{
7119 flag aSign;
326b9e98 7120 int32_t aExp;
bb98fe42 7121 uint64_t aSig;
9ee6e8bb
PB
7122
7123 aSig = extractFloatx80Frac( a );
7124 aExp = extractFloatx80Exp( a );
7125 aSign = extractFloatx80Sign( a );
7126
326b9e98
AJ
7127 if ( aExp == 0x7FFF ) {
7128 if ( aSig<<1 ) {
7129 return propagateFloatx80NaN( a, a STATUS_VAR );
7130 }
9ee6e8bb
PB
7131 return a;
7132 }
326b9e98 7133
3c85c37f
PM
7134 if (aExp == 0) {
7135 if (aSig == 0) {
7136 return a;
7137 }
7138 aExp++;
7139 }
69397542 7140
326b9e98
AJ
7141 if (n > 0x10000) {
7142 n = 0x10000;
7143 } else if (n < -0x10000) {
7144 n = -0x10000;
7145 }
7146
9ee6e8bb 7147 aExp += n;
69397542
PB
7148 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7149 aSign, aExp, aSig, 0 STATUS_VAR );
9ee6e8bb 7150}
9ee6e8bb 7151
9ee6e8bb
PB
7152float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7153{
7154 flag aSign;
326b9e98 7155 int32_t aExp;
bb98fe42 7156 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7157
7158 aSig1 = extractFloat128Frac1( a );
7159 aSig0 = extractFloat128Frac0( a );
7160 aExp = extractFloat128Exp( a );
7161 aSign = extractFloat128Sign( a );
7162 if ( aExp == 0x7FFF ) {
326b9e98
AJ
7163 if ( aSig0 | aSig1 ) {
7164 return propagateFloat128NaN( a, a STATUS_VAR );
7165 }
9ee6e8bb
PB
7166 return a;
7167 }
3c85c37f 7168 if (aExp != 0) {
69397542 7169 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7170 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7171 return a;
3c85c37f
PM
7172 } else {
7173 aExp++;
7174 }
69397542 7175
326b9e98
AJ
7176 if (n > 0x10000) {
7177 n = 0x10000;
7178 } else if (n < -0x10000) {
7179 n = -0x10000;
7180 }
7181
69397542
PB
7182 aExp += n - 1;
7183 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7184 STATUS_VAR );
9ee6e8bb
PB
7185
7186}