]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
pseries: Update SLOF firmware image to qemu-slof-20140404
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
158142c2
FB
6
7/*============================================================================
8
9This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10Package, Release 2b.
11
12Written by John R. Hauser. This work was made possible in part by the
13International Computer Science Institute, located at Suite 600, 1947 Center
14Street, Berkeley, California 94704. Funding was partially provided by the
15National Science Foundation under grant MIP-9311980. The original version
16of this code was written as part of a project to build a fixed-point vector
17processor in collaboration with the University of California at Berkeley,
18overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20arithmetic/SoftFloat.html'.
21
22THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31Derivative works are acceptable, even for commercial purposes, so long as
32(1) the source code for the derivative work includes prominent notice that
33the work is derivative, and (2) the source code includes prominent notice with
34these four paragraphs for those parts of this code that are retained.
35
36=============================================================================*/
37
2ac8bd03
PM
38/* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
40 */
41#include "config.h"
42
6b4c305c 43#include "fpu/softfloat.h"
158142c2 44
dc355b76
PM
45/* We only need stdlib for abort() */
46#include <stdlib.h>
47
158142c2
FB
48/*----------------------------------------------------------------------------
49| Primitive arithmetic functions, including multi-word arithmetic, and
50| division and square root approximations. (Can be specialized to target if
51| desired.)
52*----------------------------------------------------------------------------*/
53#include "softfloat-macros.h"
54
55/*----------------------------------------------------------------------------
56| Functions and definitions to determine: (1) whether tininess for underflow
57| is detected before or after rounding by default, (2) what (if anything)
58| happens when exceptions are raised, (3) how signaling NaNs are distinguished
59| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60| are propagated from function inputs to output. These details are target-
61| specific.
62*----------------------------------------------------------------------------*/
63#include "softfloat-specialize.h"
64
bb4d4bb3
PM
65/*----------------------------------------------------------------------------
66| Returns the fraction bits of the half-precision floating-point value `a'.
67*----------------------------------------------------------------------------*/
68
69INLINE uint32_t extractFloat16Frac(float16 a)
70{
71 return float16_val(a) & 0x3ff;
72}
73
74/*----------------------------------------------------------------------------
75| Returns the exponent bits of the half-precision floating-point value `a'.
76*----------------------------------------------------------------------------*/
77
94a49d86 78INLINE int_fast16_t extractFloat16Exp(float16 a)
bb4d4bb3
PM
79{
80 return (float16_val(a) >> 10) & 0x1f;
81}
82
83/*----------------------------------------------------------------------------
84| Returns the sign bit of the half-precision floating-point value `a'.
85*----------------------------------------------------------------------------*/
86
87INLINE flag extractFloat16Sign(float16 a)
88{
89 return float16_val(a)>>15;
90}
91
158142c2
FB
92/*----------------------------------------------------------------------------
93| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
94| and 7, and returns the properly rounded 32-bit integer corresponding to the
95| input. If `zSign' is 1, the input is negated before being converted to an
96| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
97| is simply rounded to an integer, with the inexact exception raised if the
98| input cannot be represented exactly as an integer. However, if the fixed-
99| point input is too large, the invalid exception is raised and the largest
100| positive or negative integer is returned.
101*----------------------------------------------------------------------------*/
102
bb98fe42 103static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
158142c2
FB
104{
105 int8 roundingMode;
106 flag roundNearestEven;
107 int8 roundIncrement, roundBits;
760e1416 108 int32_t z;
158142c2
FB
109
110 roundingMode = STATUS(float_rounding_mode);
111 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
112 switch (roundingMode) {
113 case float_round_nearest_even:
f9288a76 114 case float_round_ties_away:
dc355b76
PM
115 roundIncrement = 0x40;
116 break;
117 case float_round_to_zero:
118 roundIncrement = 0;
119 break;
120 case float_round_up:
121 roundIncrement = zSign ? 0 : 0x7f;
122 break;
123 case float_round_down:
124 roundIncrement = zSign ? 0x7f : 0;
125 break;
126 default:
127 abort();
158142c2
FB
128 }
129 roundBits = absZ & 0x7F;
130 absZ = ( absZ + roundIncrement )>>7;
131 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
132 z = absZ;
133 if ( zSign ) z = - z;
134 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
135 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 136 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
137 }
138 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
139 return z;
140
141}
142
143/*----------------------------------------------------------------------------
144| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
145| `absZ1', with binary point between bits 63 and 64 (between the input words),
146| and returns the properly rounded 64-bit integer corresponding to the input.
147| If `zSign' is 1, the input is negated before being converted to an integer.
148| Ordinarily, the fixed-point input is simply rounded to an integer, with
149| the inexact exception raised if the input cannot be represented exactly as
150| an integer. However, if the fixed-point input is too large, the invalid
151| exception is raised and the largest positive or negative integer is
152| returned.
153*----------------------------------------------------------------------------*/
154
bb98fe42 155static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
158142c2
FB
156{
157 int8 roundingMode;
158 flag roundNearestEven, increment;
760e1416 159 int64_t z;
158142c2
FB
160
161 roundingMode = STATUS(float_rounding_mode);
162 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
163 switch (roundingMode) {
164 case float_round_nearest_even:
f9288a76 165 case float_round_ties_away:
dc355b76
PM
166 increment = ((int64_t) absZ1 < 0);
167 break;
168 case float_round_to_zero:
169 increment = 0;
170 break;
171 case float_round_up:
172 increment = !zSign && absZ1;
173 break;
174 case float_round_down:
175 increment = zSign && absZ1;
176 break;
177 default:
178 abort();
158142c2
FB
179 }
180 if ( increment ) {
181 ++absZ0;
182 if ( absZ0 == 0 ) goto overflow;
bb98fe42 183 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
184 }
185 z = absZ0;
186 if ( zSign ) z = - z;
187 if ( z && ( ( z < 0 ) ^ zSign ) ) {
188 overflow:
189 float_raise( float_flag_invalid STATUS_VAR);
190 return
bb98fe42 191 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
192 : LIT64( 0x7FFFFFFFFFFFFFFF );
193 }
194 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
195 return z;
196
197}
198
fb3ea83a
TM
199/*----------------------------------------------------------------------------
200| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
201| `absZ1', with binary point between bits 63 and 64 (between the input words),
202| and returns the properly rounded 64-bit unsigned integer corresponding to the
203| input. Ordinarily, the fixed-point input is simply rounded to an integer,
204| with the inexact exception raised if the input cannot be represented exactly
205| as an integer. However, if the fixed-point input is too large, the invalid
206| exception is raised and the largest unsigned integer is returned.
207*----------------------------------------------------------------------------*/
208
209static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
210 uint64_t absZ1 STATUS_PARAM)
211{
212 int8 roundingMode;
213 flag roundNearestEven, increment;
214
215 roundingMode = STATUS(float_rounding_mode);
216 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
217 switch (roundingMode) {
218 case float_round_nearest_even:
f9288a76 219 case float_round_ties_away:
dc355b76
PM
220 increment = ((int64_t)absZ1 < 0);
221 break;
222 case float_round_to_zero:
223 increment = 0;
224 break;
225 case float_round_up:
226 increment = !zSign && absZ1;
227 break;
228 case float_round_down:
229 increment = zSign && absZ1;
230 break;
231 default:
232 abort();
fb3ea83a
TM
233 }
234 if (increment) {
235 ++absZ0;
236 if (absZ0 == 0) {
237 float_raise(float_flag_invalid STATUS_VAR);
238 return LIT64(0xFFFFFFFFFFFFFFFF);
239 }
240 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
241 }
242
243 if (zSign && absZ0) {
244 float_raise(float_flag_invalid STATUS_VAR);
245 return 0;
246 }
247
248 if (absZ1) {
249 STATUS(float_exception_flags) |= float_flag_inexact;
250 }
251 return absZ0;
252}
253
158142c2
FB
254/*----------------------------------------------------------------------------
255| Returns the fraction bits of the single-precision floating-point value `a'.
256*----------------------------------------------------------------------------*/
257
bb98fe42 258INLINE uint32_t extractFloat32Frac( float32 a )
158142c2
FB
259{
260
f090c9d4 261 return float32_val(a) & 0x007FFFFF;
158142c2
FB
262
263}
264
265/*----------------------------------------------------------------------------
266| Returns the exponent bits of the single-precision floating-point value `a'.
267*----------------------------------------------------------------------------*/
268
94a49d86 269INLINE int_fast16_t extractFloat32Exp(float32 a)
158142c2
FB
270{
271
f090c9d4 272 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
273
274}
275
276/*----------------------------------------------------------------------------
277| Returns the sign bit of the single-precision floating-point value `a'.
278*----------------------------------------------------------------------------*/
279
280INLINE flag extractFloat32Sign( float32 a )
281{
282
f090c9d4 283 return float32_val(a)>>31;
158142c2
FB
284
285}
286
37d18660
PM
287/*----------------------------------------------------------------------------
288| If `a' is denormal and we are in flush-to-zero mode then set the
289| input-denormal exception and return zero. Otherwise just return the value.
290*----------------------------------------------------------------------------*/
7baeabce 291float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
37d18660
PM
292{
293 if (STATUS(flush_inputs_to_zero)) {
294 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
295 float_raise(float_flag_input_denormal STATUS_VAR);
296 return make_float32(float32_val(a) & 0x80000000);
297 }
298 }
299 return a;
300}
301
158142c2
FB
302/*----------------------------------------------------------------------------
303| Normalizes the subnormal single-precision floating-point value represented
304| by the denormalized significand `aSig'. The normalized exponent and
305| significand are stored at the locations pointed to by `zExpPtr' and
306| `zSigPtr', respectively.
307*----------------------------------------------------------------------------*/
308
309static void
94a49d86 310 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
158142c2
FB
311{
312 int8 shiftCount;
313
314 shiftCount = countLeadingZeros32( aSig ) - 8;
315 *zSigPtr = aSig<<shiftCount;
316 *zExpPtr = 1 - shiftCount;
317
318}
319
320/*----------------------------------------------------------------------------
321| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
322| single-precision floating-point value, returning the result. After being
323| shifted into the proper positions, the three fields are simply added
324| together to form the result. This means that any integer portion of `zSig'
325| will be added into the exponent. Since a properly normalized significand
326| will have an integer portion equal to 1, the `zExp' input should be 1 less
327| than the desired result exponent whenever `zSig' is a complete, normalized
328| significand.
329*----------------------------------------------------------------------------*/
330
94a49d86 331INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
158142c2
FB
332{
333
f090c9d4 334 return make_float32(
bb98fe42 335 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
336
337}
338
339/*----------------------------------------------------------------------------
340| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
341| and significand `zSig', and returns the proper single-precision floating-
342| point value corresponding to the abstract input. Ordinarily, the abstract
343| value is simply rounded and packed into the single-precision format, with
344| the inexact exception raised if the abstract input cannot be represented
345| exactly. However, if the abstract value is too large, the overflow and
346| inexact exceptions are raised and an infinity or maximal finite value is
347| returned. If the abstract value is too small, the input value is rounded to
348| a subnormal number, and the underflow and inexact exceptions are raised if
349| the abstract input cannot be represented exactly as a subnormal single-
350| precision floating-point number.
351| The input significand `zSig' has its binary point between bits 30
352| and 29, which is 7 bits to the left of the usual location. This shifted
353| significand must be normalized or smaller. If `zSig' is not normalized,
354| `zExp' must be 0; in that case, the result returned is a subnormal number,
355| and it must not require rounding. In the usual case that `zSig' is
356| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
357| The handling of underflow and overflow follows the IEC/IEEE Standard for
358| Binary Floating-Point Arithmetic.
359*----------------------------------------------------------------------------*/
360
94a49d86 361static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
362{
363 int8 roundingMode;
364 flag roundNearestEven;
365 int8 roundIncrement, roundBits;
366 flag isTiny;
367
368 roundingMode = STATUS(float_rounding_mode);
369 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
370 switch (roundingMode) {
371 case float_round_nearest_even:
f9288a76 372 case float_round_ties_away:
dc355b76
PM
373 roundIncrement = 0x40;
374 break;
375 case float_round_to_zero:
376 roundIncrement = 0;
377 break;
378 case float_round_up:
379 roundIncrement = zSign ? 0 : 0x7f;
380 break;
381 case float_round_down:
382 roundIncrement = zSign ? 0x7f : 0;
383 break;
384 default:
385 abort();
386 break;
158142c2
FB
387 }
388 roundBits = zSig & 0x7F;
bb98fe42 389 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
390 if ( ( 0xFD < zExp )
391 || ( ( zExp == 0xFD )
bb98fe42 392 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
393 ) {
394 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 395 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
396 }
397 if ( zExp < 0 ) {
e6afc87f
PM
398 if (STATUS(flush_to_zero)) {
399 float_raise(float_flag_output_denormal STATUS_VAR);
400 return packFloat32(zSign, 0, 0);
401 }
158142c2
FB
402 isTiny =
403 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
404 || ( zExp < -1 )
405 || ( zSig + roundIncrement < 0x80000000 );
406 shift32RightJamming( zSig, - zExp, &zSig );
407 zExp = 0;
408 roundBits = zSig & 0x7F;
409 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
410 }
411 }
412 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
413 zSig = ( zSig + roundIncrement )>>7;
414 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
415 if ( zSig == 0 ) zExp = 0;
416 return packFloat32( zSign, zExp, zSig );
417
418}
419
420/*----------------------------------------------------------------------------
421| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
422| and significand `zSig', and returns the proper single-precision floating-
423| point value corresponding to the abstract input. This routine is just like
424| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
425| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
426| floating-point exponent.
427*----------------------------------------------------------------------------*/
428
429static float32
94a49d86 430 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
431{
432 int8 shiftCount;
433
434 shiftCount = countLeadingZeros32( zSig ) - 1;
435 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
436
437}
438
439/*----------------------------------------------------------------------------
440| Returns the fraction bits of the double-precision floating-point value `a'.
441*----------------------------------------------------------------------------*/
442
bb98fe42 443INLINE uint64_t extractFloat64Frac( float64 a )
158142c2
FB
444{
445
f090c9d4 446 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
447
448}
449
450/*----------------------------------------------------------------------------
451| Returns the exponent bits of the double-precision floating-point value `a'.
452*----------------------------------------------------------------------------*/
453
94a49d86 454INLINE int_fast16_t extractFloat64Exp(float64 a)
158142c2
FB
455{
456
f090c9d4 457 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
458
459}
460
461/*----------------------------------------------------------------------------
462| Returns the sign bit of the double-precision floating-point value `a'.
463*----------------------------------------------------------------------------*/
464
465INLINE flag extractFloat64Sign( float64 a )
466{
467
f090c9d4 468 return float64_val(a)>>63;
158142c2
FB
469
470}
471
37d18660
PM
472/*----------------------------------------------------------------------------
473| If `a' is denormal and we are in flush-to-zero mode then set the
474| input-denormal exception and return zero. Otherwise just return the value.
475*----------------------------------------------------------------------------*/
7baeabce 476float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
37d18660
PM
477{
478 if (STATUS(flush_inputs_to_zero)) {
479 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
480 float_raise(float_flag_input_denormal STATUS_VAR);
481 return make_float64(float64_val(a) & (1ULL << 63));
482 }
483 }
484 return a;
485}
486
158142c2
FB
487/*----------------------------------------------------------------------------
488| Normalizes the subnormal double-precision floating-point value represented
489| by the denormalized significand `aSig'. The normalized exponent and
490| significand are stored at the locations pointed to by `zExpPtr' and
491| `zSigPtr', respectively.
492*----------------------------------------------------------------------------*/
493
494static void
94a49d86 495 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
158142c2
FB
496{
497 int8 shiftCount;
498
499 shiftCount = countLeadingZeros64( aSig ) - 11;
500 *zSigPtr = aSig<<shiftCount;
501 *zExpPtr = 1 - shiftCount;
502
503}
504
505/*----------------------------------------------------------------------------
506| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
507| double-precision floating-point value, returning the result. After being
508| shifted into the proper positions, the three fields are simply added
509| together to form the result. This means that any integer portion of `zSig'
510| will be added into the exponent. Since a properly normalized significand
511| will have an integer portion equal to 1, the `zExp' input should be 1 less
512| than the desired result exponent whenever `zSig' is a complete, normalized
513| significand.
514*----------------------------------------------------------------------------*/
515
94a49d86 516INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
158142c2
FB
517{
518
f090c9d4 519 return make_float64(
bb98fe42 520 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
521
522}
523
524/*----------------------------------------------------------------------------
525| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
526| and significand `zSig', and returns the proper double-precision floating-
527| point value corresponding to the abstract input. Ordinarily, the abstract
528| value is simply rounded and packed into the double-precision format, with
529| the inexact exception raised if the abstract input cannot be represented
530| exactly. However, if the abstract value is too large, the overflow and
531| inexact exceptions are raised and an infinity or maximal finite value is
532| returned. If the abstract value is too small, the input value is rounded
533| to a subnormal number, and the underflow and inexact exceptions are raised
534| if the abstract input cannot be represented exactly as a subnormal double-
535| precision floating-point number.
536| The input significand `zSig' has its binary point between bits 62
537| and 61, which is 10 bits to the left of the usual location. This shifted
538| significand must be normalized or smaller. If `zSig' is not normalized,
539| `zExp' must be 0; in that case, the result returned is a subnormal number,
540| and it must not require rounding. In the usual case that `zSig' is
541| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
542| The handling of underflow and overflow follows the IEC/IEEE Standard for
543| Binary Floating-Point Arithmetic.
544*----------------------------------------------------------------------------*/
545
94a49d86 546static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
547{
548 int8 roundingMode;
549 flag roundNearestEven;
94a49d86 550 int_fast16_t roundIncrement, roundBits;
158142c2
FB
551 flag isTiny;
552
553 roundingMode = STATUS(float_rounding_mode);
554 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
555 switch (roundingMode) {
556 case float_round_nearest_even:
f9288a76 557 case float_round_ties_away:
dc355b76
PM
558 roundIncrement = 0x200;
559 break;
560 case float_round_to_zero:
561 roundIncrement = 0;
562 break;
563 case float_round_up:
564 roundIncrement = zSign ? 0 : 0x3ff;
565 break;
566 case float_round_down:
567 roundIncrement = zSign ? 0x3ff : 0;
568 break;
569 default:
570 abort();
158142c2
FB
571 }
572 roundBits = zSig & 0x3FF;
bb98fe42 573 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
574 if ( ( 0x7FD < zExp )
575 || ( ( zExp == 0x7FD )
bb98fe42 576 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
577 ) {
578 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 579 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
580 }
581 if ( zExp < 0 ) {
e6afc87f
PM
582 if (STATUS(flush_to_zero)) {
583 float_raise(float_flag_output_denormal STATUS_VAR);
584 return packFloat64(zSign, 0, 0);
585 }
158142c2
FB
586 isTiny =
587 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
588 || ( zExp < -1 )
589 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
590 shift64RightJamming( zSig, - zExp, &zSig );
591 zExp = 0;
592 roundBits = zSig & 0x3FF;
593 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
594 }
595 }
596 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
597 zSig = ( zSig + roundIncrement )>>10;
598 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
599 if ( zSig == 0 ) zExp = 0;
600 return packFloat64( zSign, zExp, zSig );
601
602}
603
604/*----------------------------------------------------------------------------
605| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
606| and significand `zSig', and returns the proper double-precision floating-
607| point value corresponding to the abstract input. This routine is just like
608| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
609| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
610| floating-point exponent.
611*----------------------------------------------------------------------------*/
612
613static float64
94a49d86 614 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
615{
616 int8 shiftCount;
617
618 shiftCount = countLeadingZeros64( zSig ) - 1;
619 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
620
621}
622
158142c2
FB
623/*----------------------------------------------------------------------------
624| Returns the fraction bits of the extended double-precision floating-point
625| value `a'.
626*----------------------------------------------------------------------------*/
627
bb98fe42 628INLINE uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
629{
630
631 return a.low;
632
633}
634
635/*----------------------------------------------------------------------------
636| Returns the exponent bits of the extended double-precision floating-point
637| value `a'.
638*----------------------------------------------------------------------------*/
639
640INLINE int32 extractFloatx80Exp( floatx80 a )
641{
642
643 return a.high & 0x7FFF;
644
645}
646
647/*----------------------------------------------------------------------------
648| Returns the sign bit of the extended double-precision floating-point value
649| `a'.
650*----------------------------------------------------------------------------*/
651
652INLINE flag extractFloatx80Sign( floatx80 a )
653{
654
655 return a.high>>15;
656
657}
658
659/*----------------------------------------------------------------------------
660| Normalizes the subnormal extended double-precision floating-point value
661| represented by the denormalized significand `aSig'. The normalized exponent
662| and significand are stored at the locations pointed to by `zExpPtr' and
663| `zSigPtr', respectively.
664*----------------------------------------------------------------------------*/
665
666static void
bb98fe42 667 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
668{
669 int8 shiftCount;
670
671 shiftCount = countLeadingZeros64( aSig );
672 *zSigPtr = aSig<<shiftCount;
673 *zExpPtr = 1 - shiftCount;
674
675}
676
677/*----------------------------------------------------------------------------
678| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
679| extended double-precision floating-point value, returning the result.
680*----------------------------------------------------------------------------*/
681
bb98fe42 682INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
158142c2
FB
683{
684 floatx80 z;
685
686 z.low = zSig;
bb98fe42 687 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
688 return z;
689
690}
691
692/*----------------------------------------------------------------------------
693| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
694| and extended significand formed by the concatenation of `zSig0' and `zSig1',
695| and returns the proper extended double-precision floating-point value
696| corresponding to the abstract input. Ordinarily, the abstract value is
697| rounded and packed into the extended double-precision format, with the
698| inexact exception raised if the abstract input cannot be represented
699| exactly. However, if the abstract value is too large, the overflow and
700| inexact exceptions are raised and an infinity or maximal finite value is
701| returned. If the abstract value is too small, the input value is rounded to
702| a subnormal number, and the underflow and inexact exceptions are raised if
703| the abstract input cannot be represented exactly as a subnormal extended
704| double-precision floating-point number.
705| If `roundingPrecision' is 32 or 64, the result is rounded to the same
706| number of bits as single or double precision, respectively. Otherwise, the
707| result is rounded to the full precision of the extended double-precision
708| format.
709| The input significand must be normalized or smaller. If the input
710| significand is not normalized, `zExp' must be 0; in that case, the result
711| returned is a subnormal number, and it must not require rounding. The
712| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
713| Floating-Point Arithmetic.
714*----------------------------------------------------------------------------*/
715
716static floatx80
717 roundAndPackFloatx80(
bb98fe42 718 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
719 STATUS_PARAM)
720{
721 int8 roundingMode;
722 flag roundNearestEven, increment, isTiny;
723 int64 roundIncrement, roundMask, roundBits;
724
725 roundingMode = STATUS(float_rounding_mode);
726 roundNearestEven = ( roundingMode == float_round_nearest_even );
727 if ( roundingPrecision == 80 ) goto precision80;
728 if ( roundingPrecision == 64 ) {
729 roundIncrement = LIT64( 0x0000000000000400 );
730 roundMask = LIT64( 0x00000000000007FF );
731 }
732 else if ( roundingPrecision == 32 ) {
733 roundIncrement = LIT64( 0x0000008000000000 );
734 roundMask = LIT64( 0x000000FFFFFFFFFF );
735 }
736 else {
737 goto precision80;
738 }
739 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
740 switch (roundingMode) {
741 case float_round_nearest_even:
f9288a76 742 case float_round_ties_away:
dc355b76
PM
743 break;
744 case float_round_to_zero:
745 roundIncrement = 0;
746 break;
747 case float_round_up:
748 roundIncrement = zSign ? 0 : roundMask;
749 break;
750 case float_round_down:
751 roundIncrement = zSign ? roundMask : 0;
752 break;
753 default:
754 abort();
158142c2
FB
755 }
756 roundBits = zSig0 & roundMask;
bb98fe42 757 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
758 if ( ( 0x7FFE < zExp )
759 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
760 ) {
761 goto overflow;
762 }
763 if ( zExp <= 0 ) {
e6afc87f
PM
764 if (STATUS(flush_to_zero)) {
765 float_raise(float_flag_output_denormal STATUS_VAR);
766 return packFloatx80(zSign, 0, 0);
767 }
158142c2
FB
768 isTiny =
769 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
770 || ( zExp < 0 )
771 || ( zSig0 <= zSig0 + roundIncrement );
772 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
773 zExp = 0;
774 roundBits = zSig0 & roundMask;
775 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
776 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
777 zSig0 += roundIncrement;
bb98fe42 778 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
779 roundIncrement = roundMask + 1;
780 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
781 roundMask |= roundIncrement;
782 }
783 zSig0 &= ~ roundMask;
784 return packFloatx80( zSign, zExp, zSig0 );
785 }
786 }
787 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
788 zSig0 += roundIncrement;
789 if ( zSig0 < roundIncrement ) {
790 ++zExp;
791 zSig0 = LIT64( 0x8000000000000000 );
792 }
793 roundIncrement = roundMask + 1;
794 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
795 roundMask |= roundIncrement;
796 }
797 zSig0 &= ~ roundMask;
798 if ( zSig0 == 0 ) zExp = 0;
799 return packFloatx80( zSign, zExp, zSig0 );
800 precision80:
dc355b76
PM
801 switch (roundingMode) {
802 case float_round_nearest_even:
f9288a76 803 case float_round_ties_away:
dc355b76
PM
804 increment = ((int64_t)zSig1 < 0);
805 break;
806 case float_round_to_zero:
807 increment = 0;
808 break;
809 case float_round_up:
810 increment = !zSign && zSig1;
811 break;
812 case float_round_down:
813 increment = zSign && zSig1;
814 break;
815 default:
816 abort();
158142c2 817 }
bb98fe42 818 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
819 if ( ( 0x7FFE < zExp )
820 || ( ( zExp == 0x7FFE )
821 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
822 && increment
823 )
824 ) {
825 roundMask = 0;
826 overflow:
827 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
828 if ( ( roundingMode == float_round_to_zero )
829 || ( zSign && ( roundingMode == float_round_up ) )
830 || ( ! zSign && ( roundingMode == float_round_down ) )
831 ) {
832 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
833 }
834 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
835 }
836 if ( zExp <= 0 ) {
837 isTiny =
838 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
839 || ( zExp < 0 )
840 || ! increment
841 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
842 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
843 zExp = 0;
844 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
845 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
dc355b76
PM
846 switch (roundingMode) {
847 case float_round_nearest_even:
f9288a76 848 case float_round_ties_away:
dc355b76
PM
849 increment = ((int64_t)zSig1 < 0);
850 break;
851 case float_round_to_zero:
852 increment = 0;
853 break;
854 case float_round_up:
855 increment = !zSign && zSig1;
856 break;
857 case float_round_down:
858 increment = zSign && zSig1;
859 break;
860 default:
861 abort();
158142c2
FB
862 }
863 if ( increment ) {
864 ++zSig0;
865 zSig0 &=
bb98fe42
AF
866 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
867 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
868 }
869 return packFloatx80( zSign, zExp, zSig0 );
870 }
871 }
872 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
873 if ( increment ) {
874 ++zSig0;
875 if ( zSig0 == 0 ) {
876 ++zExp;
877 zSig0 = LIT64( 0x8000000000000000 );
878 }
879 else {
bb98fe42 880 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
881 }
882 }
883 else {
884 if ( zSig0 == 0 ) zExp = 0;
885 }
886 return packFloatx80( zSign, zExp, zSig0 );
887
888}
889
890/*----------------------------------------------------------------------------
891| Takes an abstract floating-point value having sign `zSign', exponent
892| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
893| and returns the proper extended double-precision floating-point value
894| corresponding to the abstract input. This routine is just like
895| `roundAndPackFloatx80' except that the input significand does not have to be
896| normalized.
897*----------------------------------------------------------------------------*/
898
899static floatx80
900 normalizeRoundAndPackFloatx80(
bb98fe42 901 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
902 STATUS_PARAM)
903{
904 int8 shiftCount;
905
906 if ( zSig0 == 0 ) {
907 zSig0 = zSig1;
908 zSig1 = 0;
909 zExp -= 64;
910 }
911 shiftCount = countLeadingZeros64( zSig0 );
912 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
913 zExp -= shiftCount;
914 return
915 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
916
917}
918
158142c2
FB
919/*----------------------------------------------------------------------------
920| Returns the least-significant 64 fraction bits of the quadruple-precision
921| floating-point value `a'.
922*----------------------------------------------------------------------------*/
923
bb98fe42 924INLINE uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
925{
926
927 return a.low;
928
929}
930
931/*----------------------------------------------------------------------------
932| Returns the most-significant 48 fraction bits of the quadruple-precision
933| floating-point value `a'.
934*----------------------------------------------------------------------------*/
935
bb98fe42 936INLINE uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
937{
938
939 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
940
941}
942
943/*----------------------------------------------------------------------------
944| Returns the exponent bits of the quadruple-precision floating-point value
945| `a'.
946*----------------------------------------------------------------------------*/
947
948INLINE int32 extractFloat128Exp( float128 a )
949{
950
951 return ( a.high>>48 ) & 0x7FFF;
952
953}
954
955/*----------------------------------------------------------------------------
956| Returns the sign bit of the quadruple-precision floating-point value `a'.
957*----------------------------------------------------------------------------*/
958
959INLINE flag extractFloat128Sign( float128 a )
960{
961
962 return a.high>>63;
963
964}
965
966/*----------------------------------------------------------------------------
967| Normalizes the subnormal quadruple-precision floating-point value
968| represented by the denormalized significand formed by the concatenation of
969| `aSig0' and `aSig1'. The normalized exponent is stored at the location
970| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
971| significand are stored at the location pointed to by `zSig0Ptr', and the
972| least significant 64 bits of the normalized significand are stored at the
973| location pointed to by `zSig1Ptr'.
974*----------------------------------------------------------------------------*/
975
976static void
977 normalizeFloat128Subnormal(
bb98fe42
AF
978 uint64_t aSig0,
979 uint64_t aSig1,
158142c2 980 int32 *zExpPtr,
bb98fe42
AF
981 uint64_t *zSig0Ptr,
982 uint64_t *zSig1Ptr
158142c2
FB
983 )
984{
985 int8 shiftCount;
986
987 if ( aSig0 == 0 ) {
988 shiftCount = countLeadingZeros64( aSig1 ) - 15;
989 if ( shiftCount < 0 ) {
990 *zSig0Ptr = aSig1>>( - shiftCount );
991 *zSig1Ptr = aSig1<<( shiftCount & 63 );
992 }
993 else {
994 *zSig0Ptr = aSig1<<shiftCount;
995 *zSig1Ptr = 0;
996 }
997 *zExpPtr = - shiftCount - 63;
998 }
999 else {
1000 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1001 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1002 *zExpPtr = 1 - shiftCount;
1003 }
1004
1005}
1006
1007/*----------------------------------------------------------------------------
1008| Packs the sign `zSign', the exponent `zExp', and the significand formed
1009| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1010| floating-point value, returning the result. After being shifted into the
1011| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1012| added together to form the most significant 32 bits of the result. This
1013| means that any integer portion of `zSig0' will be added into the exponent.
1014| Since a properly normalized significand will have an integer portion equal
1015| to 1, the `zExp' input should be 1 less than the desired result exponent
1016| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1017| significand.
1018*----------------------------------------------------------------------------*/
1019
1020INLINE float128
bb98fe42 1021 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
1022{
1023 float128 z;
1024
1025 z.low = zSig1;
bb98fe42 1026 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
1027 return z;
1028
1029}
1030
1031/*----------------------------------------------------------------------------
1032| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1033| and extended significand formed by the concatenation of `zSig0', `zSig1',
1034| and `zSig2', and returns the proper quadruple-precision floating-point value
1035| corresponding to the abstract input. Ordinarily, the abstract value is
1036| simply rounded and packed into the quadruple-precision format, with the
1037| inexact exception raised if the abstract input cannot be represented
1038| exactly. However, if the abstract value is too large, the overflow and
1039| inexact exceptions are raised and an infinity or maximal finite value is
1040| returned. If the abstract value is too small, the input value is rounded to
1041| a subnormal number, and the underflow and inexact exceptions are raised if
1042| the abstract input cannot be represented exactly as a subnormal quadruple-
1043| precision floating-point number.
1044| The input significand must be normalized or smaller. If the input
1045| significand is not normalized, `zExp' must be 0; in that case, the result
1046| returned is a subnormal number, and it must not require rounding. In the
1047| usual case that the input significand is normalized, `zExp' must be 1 less
1048| than the ``true'' floating-point exponent. The handling of underflow and
1049| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1050*----------------------------------------------------------------------------*/
1051
1052static float128
1053 roundAndPackFloat128(
bb98fe42 1054 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
158142c2
FB
1055{
1056 int8 roundingMode;
1057 flag roundNearestEven, increment, isTiny;
1058
1059 roundingMode = STATUS(float_rounding_mode);
1060 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1061 switch (roundingMode) {
1062 case float_round_nearest_even:
f9288a76 1063 case float_round_ties_away:
dc355b76
PM
1064 increment = ((int64_t)zSig2 < 0);
1065 break;
1066 case float_round_to_zero:
1067 increment = 0;
1068 break;
1069 case float_round_up:
1070 increment = !zSign && zSig2;
1071 break;
1072 case float_round_down:
1073 increment = zSign && zSig2;
1074 break;
1075 default:
1076 abort();
158142c2 1077 }
bb98fe42 1078 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1079 if ( ( 0x7FFD < zExp )
1080 || ( ( zExp == 0x7FFD )
1081 && eq128(
1082 LIT64( 0x0001FFFFFFFFFFFF ),
1083 LIT64( 0xFFFFFFFFFFFFFFFF ),
1084 zSig0,
1085 zSig1
1086 )
1087 && increment
1088 )
1089 ) {
1090 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1091 if ( ( roundingMode == float_round_to_zero )
1092 || ( zSign && ( roundingMode == float_round_up ) )
1093 || ( ! zSign && ( roundingMode == float_round_down ) )
1094 ) {
1095 return
1096 packFloat128(
1097 zSign,
1098 0x7FFE,
1099 LIT64( 0x0000FFFFFFFFFFFF ),
1100 LIT64( 0xFFFFFFFFFFFFFFFF )
1101 );
1102 }
1103 return packFloat128( zSign, 0x7FFF, 0, 0 );
1104 }
1105 if ( zExp < 0 ) {
e6afc87f
PM
1106 if (STATUS(flush_to_zero)) {
1107 float_raise(float_flag_output_denormal STATUS_VAR);
1108 return packFloat128(zSign, 0, 0, 0);
1109 }
158142c2
FB
1110 isTiny =
1111 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1112 || ( zExp < -1 )
1113 || ! increment
1114 || lt128(
1115 zSig0,
1116 zSig1,
1117 LIT64( 0x0001FFFFFFFFFFFF ),
1118 LIT64( 0xFFFFFFFFFFFFFFFF )
1119 );
1120 shift128ExtraRightJamming(
1121 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1122 zExp = 0;
1123 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
dc355b76
PM
1124 switch (roundingMode) {
1125 case float_round_nearest_even:
f9288a76 1126 case float_round_ties_away:
dc355b76
PM
1127 increment = ((int64_t)zSig2 < 0);
1128 break;
1129 case float_round_to_zero:
1130 increment = 0;
1131 break;
1132 case float_round_up:
1133 increment = !zSign && zSig2;
1134 break;
1135 case float_round_down:
1136 increment = zSign && zSig2;
1137 break;
1138 default:
1139 abort();
158142c2
FB
1140 }
1141 }
1142 }
1143 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1144 if ( increment ) {
1145 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1146 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1147 }
1148 else {
1149 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1150 }
1151 return packFloat128( zSign, zExp, zSig0, zSig1 );
1152
1153}
1154
1155/*----------------------------------------------------------------------------
1156| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1157| and significand formed by the concatenation of `zSig0' and `zSig1', and
1158| returns the proper quadruple-precision floating-point value corresponding
1159| to the abstract input. This routine is just like `roundAndPackFloat128'
1160| except that the input significand has fewer bits and does not have to be
1161| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1162| point exponent.
1163*----------------------------------------------------------------------------*/
1164
1165static float128
1166 normalizeRoundAndPackFloat128(
bb98fe42 1167 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
158142c2
FB
1168{
1169 int8 shiftCount;
bb98fe42 1170 uint64_t zSig2;
158142c2
FB
1171
1172 if ( zSig0 == 0 ) {
1173 zSig0 = zSig1;
1174 zSig1 = 0;
1175 zExp -= 64;
1176 }
1177 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1178 if ( 0 <= shiftCount ) {
1179 zSig2 = 0;
1180 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1181 }
1182 else {
1183 shift128ExtraRightJamming(
1184 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1185 }
1186 zExp -= shiftCount;
1187 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1188
1189}
1190
158142c2
FB
1191/*----------------------------------------------------------------------------
1192| Returns the result of converting the 32-bit two's complement integer `a'
1193| to the single-precision floating-point format. The conversion is performed
1194| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195*----------------------------------------------------------------------------*/
1196
c4850f9e 1197float32 int32_to_float32(int32_t a STATUS_PARAM)
158142c2
FB
1198{
1199 flag zSign;
1200
f090c9d4 1201 if ( a == 0 ) return float32_zero;
bb98fe42 1202 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2
FB
1203 zSign = ( a < 0 );
1204 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1205
1206}
1207
1208/*----------------------------------------------------------------------------
1209| Returns the result of converting the 32-bit two's complement integer `a'
1210| to the double-precision floating-point format. The conversion is performed
1211| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212*----------------------------------------------------------------------------*/
1213
c4850f9e 1214float64 int32_to_float64(int32_t a STATUS_PARAM)
158142c2
FB
1215{
1216 flag zSign;
1217 uint32 absA;
1218 int8 shiftCount;
bb98fe42 1219 uint64_t zSig;
158142c2 1220
f090c9d4 1221 if ( a == 0 ) return float64_zero;
158142c2
FB
1222 zSign = ( a < 0 );
1223 absA = zSign ? - a : a;
1224 shiftCount = countLeadingZeros32( absA ) + 21;
1225 zSig = absA;
1226 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1227
1228}
1229
158142c2
FB
1230/*----------------------------------------------------------------------------
1231| Returns the result of converting the 32-bit two's complement integer `a'
1232| to the extended double-precision floating-point format. The conversion
1233| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1234| Arithmetic.
1235*----------------------------------------------------------------------------*/
1236
c4850f9e 1237floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
158142c2
FB
1238{
1239 flag zSign;
1240 uint32 absA;
1241 int8 shiftCount;
bb98fe42 1242 uint64_t zSig;
158142c2
FB
1243
1244 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1245 zSign = ( a < 0 );
1246 absA = zSign ? - a : a;
1247 shiftCount = countLeadingZeros32( absA ) + 32;
1248 zSig = absA;
1249 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1250
1251}
1252
158142c2
FB
1253/*----------------------------------------------------------------------------
1254| Returns the result of converting the 32-bit two's complement integer `a' to
1255| the quadruple-precision floating-point format. The conversion is performed
1256| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257*----------------------------------------------------------------------------*/
1258
c4850f9e 1259float128 int32_to_float128(int32_t a STATUS_PARAM)
158142c2
FB
1260{
1261 flag zSign;
1262 uint32 absA;
1263 int8 shiftCount;
bb98fe42 1264 uint64_t zSig0;
158142c2
FB
1265
1266 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1267 zSign = ( a < 0 );
1268 absA = zSign ? - a : a;
1269 shiftCount = countLeadingZeros32( absA ) + 17;
1270 zSig0 = absA;
1271 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1272
1273}
1274
158142c2
FB
1275/*----------------------------------------------------------------------------
1276| Returns the result of converting the 64-bit two's complement integer `a'
1277| to the single-precision floating-point format. The conversion is performed
1278| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1279*----------------------------------------------------------------------------*/
1280
c4850f9e 1281float32 int64_to_float32(int64_t a STATUS_PARAM)
158142c2
FB
1282{
1283 flag zSign;
1284 uint64 absA;
1285 int8 shiftCount;
1286
f090c9d4 1287 if ( a == 0 ) return float32_zero;
158142c2
FB
1288 zSign = ( a < 0 );
1289 absA = zSign ? - a : a;
1290 shiftCount = countLeadingZeros64( absA ) - 40;
1291 if ( 0 <= shiftCount ) {
1292 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1293 }
1294 else {
1295 shiftCount += 7;
1296 if ( shiftCount < 0 ) {
1297 shift64RightJamming( absA, - shiftCount, &absA );
1298 }
1299 else {
1300 absA <<= shiftCount;
1301 }
1302 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1303 }
1304
1305}
1306
c4850f9e 1307float32 uint64_to_float32(uint64_t a STATUS_PARAM)
75d62a58
JM
1308{
1309 int8 shiftCount;
1310
f090c9d4 1311 if ( a == 0 ) return float32_zero;
75d62a58
JM
1312 shiftCount = countLeadingZeros64( a ) - 40;
1313 if ( 0 <= shiftCount ) {
e744c06f 1314 return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
75d62a58
JM
1315 }
1316 else {
1317 shiftCount += 7;
1318 if ( shiftCount < 0 ) {
1319 shift64RightJamming( a, - shiftCount, &a );
1320 }
1321 else {
1322 a <<= shiftCount;
1323 }
e744c06f 1324 return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
75d62a58
JM
1325 }
1326}
1327
158142c2
FB
1328/*----------------------------------------------------------------------------
1329| Returns the result of converting the 64-bit two's complement integer `a'
1330| to the double-precision floating-point format. The conversion is performed
1331| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1332*----------------------------------------------------------------------------*/
1333
c4850f9e 1334float64 int64_to_float64(int64_t a STATUS_PARAM)
158142c2
FB
1335{
1336 flag zSign;
1337
f090c9d4 1338 if ( a == 0 ) return float64_zero;
bb98fe42 1339 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1340 return packFloat64( 1, 0x43E, 0 );
1341 }
1342 zSign = ( a < 0 );
1343 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1344
1345}
1346
c4850f9e 1347float64 uint64_to_float64(uint64_t a STATUS_PARAM)
75d62a58 1348{
17ed2293 1349 int exp = 0x43C;
75d62a58 1350
17ed2293
RH
1351 if (a == 0) {
1352 return float64_zero;
1353 }
1354 if ((int64_t)a < 0) {
1355 shift64RightJamming(a, 1, &a);
1356 exp += 1;
1357 }
1358 return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
75d62a58
JM
1359}
1360
158142c2
FB
1361/*----------------------------------------------------------------------------
1362| Returns the result of converting the 64-bit two's complement integer `a'
1363| to the extended double-precision floating-point format. The conversion
1364| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1365| Arithmetic.
1366*----------------------------------------------------------------------------*/
1367
c4850f9e 1368floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
158142c2
FB
1369{
1370 flag zSign;
1371 uint64 absA;
1372 int8 shiftCount;
1373
1374 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1375 zSign = ( a < 0 );
1376 absA = zSign ? - a : a;
1377 shiftCount = countLeadingZeros64( absA );
1378 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1379
1380}
1381
158142c2
FB
1382/*----------------------------------------------------------------------------
1383| Returns the result of converting the 64-bit two's complement integer `a' to
1384| the quadruple-precision floating-point format. The conversion is performed
1385| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1386*----------------------------------------------------------------------------*/
1387
c4850f9e 1388float128 int64_to_float128(int64_t a STATUS_PARAM)
158142c2
FB
1389{
1390 flag zSign;
1391 uint64 absA;
1392 int8 shiftCount;
1393 int32 zExp;
bb98fe42 1394 uint64_t zSig0, zSig1;
158142c2
FB
1395
1396 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1397 zSign = ( a < 0 );
1398 absA = zSign ? - a : a;
1399 shiftCount = countLeadingZeros64( absA ) + 49;
1400 zExp = 0x406E - shiftCount;
1401 if ( 64 <= shiftCount ) {
1402 zSig1 = 0;
1403 zSig0 = absA;
1404 shiftCount -= 64;
1405 }
1406 else {
1407 zSig1 = absA;
1408 zSig0 = 0;
1409 }
1410 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1411 return packFloat128( zSign, zExp, zSig0, zSig1 );
1412
1413}
1414
c4850f9e 1415float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1e397ead
RH
1416{
1417 if (a == 0) {
1418 return float128_zero;
1419 }
1420 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1421}
1422
158142c2
FB
1423/*----------------------------------------------------------------------------
1424| Returns the result of converting the single-precision floating-point value
1425| `a' to the 32-bit two's complement integer format. The conversion is
1426| performed according to the IEC/IEEE Standard for Binary Floating-Point
1427| Arithmetic---which means in particular that the conversion is rounded
1428| according to the current rounding mode. If `a' is a NaN, the largest
1429| positive integer is returned. Otherwise, if the conversion overflows, the
1430| largest integer with the same sign as `a' is returned.
1431*----------------------------------------------------------------------------*/
1432
1433int32 float32_to_int32( float32 a STATUS_PARAM )
1434{
1435 flag aSign;
94a49d86 1436 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1437 uint32_t aSig;
1438 uint64_t aSig64;
158142c2 1439
37d18660 1440 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1441 aSig = extractFloat32Frac( a );
1442 aExp = extractFloat32Exp( a );
1443 aSign = extractFloat32Sign( a );
1444 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1445 if ( aExp ) aSig |= 0x00800000;
1446 shiftCount = 0xAF - aExp;
1447 aSig64 = aSig;
1448 aSig64 <<= 32;
1449 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1450 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1451
1452}
1453
1454/*----------------------------------------------------------------------------
1455| Returns the result of converting the single-precision floating-point value
1456| `a' to the 32-bit two's complement integer format. The conversion is
1457| performed according to the IEC/IEEE Standard for Binary Floating-Point
1458| Arithmetic, except that the conversion is always rounded toward zero.
1459| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1460| the conversion overflows, the largest integer with the same sign as `a' is
1461| returned.
1462*----------------------------------------------------------------------------*/
1463
1464int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1465{
1466 flag aSign;
94a49d86 1467 int_fast16_t aExp, shiftCount;
bb98fe42 1468 uint32_t aSig;
b3a6a2e0 1469 int32_t z;
37d18660 1470 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1471
1472 aSig = extractFloat32Frac( a );
1473 aExp = extractFloat32Exp( a );
1474 aSign = extractFloat32Sign( a );
1475 shiftCount = aExp - 0x9E;
1476 if ( 0 <= shiftCount ) {
f090c9d4 1477 if ( float32_val(a) != 0xCF000000 ) {
158142c2
FB
1478 float_raise( float_flag_invalid STATUS_VAR);
1479 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1480 }
bb98fe42 1481 return (int32_t) 0x80000000;
158142c2
FB
1482 }
1483 else if ( aExp <= 0x7E ) {
1484 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1485 return 0;
1486 }
1487 aSig = ( aSig | 0x00800000 )<<8;
1488 z = aSig>>( - shiftCount );
bb98fe42 1489 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
158142c2
FB
1490 STATUS(float_exception_flags) |= float_flag_inexact;
1491 }
1492 if ( aSign ) z = - z;
1493 return z;
1494
1495}
1496
cbcef455
PM
1497/*----------------------------------------------------------------------------
1498| Returns the result of converting the single-precision floating-point value
1499| `a' to the 16-bit two's complement integer format. The conversion is
1500| performed according to the IEC/IEEE Standard for Binary Floating-Point
1501| Arithmetic, except that the conversion is always rounded toward zero.
1502| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1503| the conversion overflows, the largest integer with the same sign as `a' is
1504| returned.
1505*----------------------------------------------------------------------------*/
1506
94a49d86 1507int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
1508{
1509 flag aSign;
94a49d86 1510 int_fast16_t aExp, shiftCount;
bb98fe42 1511 uint32_t aSig;
cbcef455
PM
1512 int32 z;
1513
1514 aSig = extractFloat32Frac( a );
1515 aExp = extractFloat32Exp( a );
1516 aSign = extractFloat32Sign( a );
1517 shiftCount = aExp - 0x8E;
1518 if ( 0 <= shiftCount ) {
1519 if ( float32_val(a) != 0xC7000000 ) {
1520 float_raise( float_flag_invalid STATUS_VAR);
1521 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522 return 0x7FFF;
1523 }
1524 }
bb98fe42 1525 return (int32_t) 0xffff8000;
cbcef455
PM
1526 }
1527 else if ( aExp <= 0x7E ) {
1528 if ( aExp | aSig ) {
1529 STATUS(float_exception_flags) |= float_flag_inexact;
1530 }
1531 return 0;
1532 }
1533 shiftCount -= 0x10;
1534 aSig = ( aSig | 0x00800000 )<<8;
1535 z = aSig>>( - shiftCount );
bb98fe42 1536 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
cbcef455
PM
1537 STATUS(float_exception_flags) |= float_flag_inexact;
1538 }
1539 if ( aSign ) {
1540 z = - z;
1541 }
1542 return z;
1543
1544}
1545
158142c2
FB
1546/*----------------------------------------------------------------------------
1547| Returns the result of converting the single-precision floating-point value
1548| `a' to the 64-bit two's complement integer format. The conversion is
1549| performed according to the IEC/IEEE Standard for Binary Floating-Point
1550| Arithmetic---which means in particular that the conversion is rounded
1551| according to the current rounding mode. If `a' is a NaN, the largest
1552| positive integer is returned. Otherwise, if the conversion overflows, the
1553| largest integer with the same sign as `a' is returned.
1554*----------------------------------------------------------------------------*/
1555
1556int64 float32_to_int64( float32 a STATUS_PARAM )
1557{
1558 flag aSign;
94a49d86 1559 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1560 uint32_t aSig;
1561 uint64_t aSig64, aSigExtra;
37d18660 1562 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1563
1564 aSig = extractFloat32Frac( a );
1565 aExp = extractFloat32Exp( a );
1566 aSign = extractFloat32Sign( a );
1567 shiftCount = 0xBE - aExp;
1568 if ( shiftCount < 0 ) {
1569 float_raise( float_flag_invalid STATUS_VAR);
1570 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1571 return LIT64( 0x7FFFFFFFFFFFFFFF );
1572 }
bb98fe42 1573 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1574 }
1575 if ( aExp ) aSig |= 0x00800000;
1576 aSig64 = aSig;
1577 aSig64 <<= 40;
1578 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1579 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1580
1581}
1582
2f18bbf9
TM
1583/*----------------------------------------------------------------------------
1584| Returns the result of converting the single-precision floating-point value
1585| `a' to the 64-bit unsigned integer format. The conversion is
1586| performed according to the IEC/IEEE Standard for Binary Floating-Point
1587| Arithmetic---which means in particular that the conversion is rounded
1588| according to the current rounding mode. If `a' is a NaN, the largest
1589| unsigned integer is returned. Otherwise, if the conversion overflows, the
1590| largest unsigned integer is returned. If the 'a' is negative, the result
1591| is rounded and zero is returned; values that do not round to zero will
1592| raise the inexact exception flag.
1593*----------------------------------------------------------------------------*/
1594
1595uint64 float32_to_uint64(float32 a STATUS_PARAM)
1596{
1597 flag aSign;
1598 int_fast16_t aExp, shiftCount;
1599 uint32_t aSig;
1600 uint64_t aSig64, aSigExtra;
1601 a = float32_squash_input_denormal(a STATUS_VAR);
1602
1603 aSig = extractFloat32Frac(a);
1604 aExp = extractFloat32Exp(a);
1605 aSign = extractFloat32Sign(a);
1606 if ((aSign) && (aExp > 126)) {
1607 float_raise(float_flag_invalid STATUS_VAR);
1608 if (float32_is_any_nan(a)) {
1609 return LIT64(0xFFFFFFFFFFFFFFFF);
1610 } else {
1611 return 0;
1612 }
1613 }
1614 shiftCount = 0xBE - aExp;
1615 if (aExp) {
1616 aSig |= 0x00800000;
1617 }
1618 if (shiftCount < 0) {
1619 float_raise(float_flag_invalid STATUS_VAR);
1620 return LIT64(0xFFFFFFFFFFFFFFFF);
1621 }
1622
1623 aSig64 = aSig;
1624 aSig64 <<= 40;
1625 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1626 return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1627}
1628
158142c2
FB
1629/*----------------------------------------------------------------------------
1630| Returns the result of converting the single-precision floating-point value
1631| `a' to the 64-bit two's complement integer format. The conversion is
1632| performed according to the IEC/IEEE Standard for Binary Floating-Point
1633| Arithmetic, except that the conversion is always rounded toward zero. If
1634| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1635| conversion overflows, the largest integer with the same sign as `a' is
1636| returned.
1637*----------------------------------------------------------------------------*/
1638
1639int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1640{
1641 flag aSign;
94a49d86 1642 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1643 uint32_t aSig;
1644 uint64_t aSig64;
158142c2 1645 int64 z;
37d18660 1646 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1647
1648 aSig = extractFloat32Frac( a );
1649 aExp = extractFloat32Exp( a );
1650 aSign = extractFloat32Sign( a );
1651 shiftCount = aExp - 0xBE;
1652 if ( 0 <= shiftCount ) {
f090c9d4 1653 if ( float32_val(a) != 0xDF000000 ) {
158142c2
FB
1654 float_raise( float_flag_invalid STATUS_VAR);
1655 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1656 return LIT64( 0x7FFFFFFFFFFFFFFF );
1657 }
1658 }
bb98fe42 1659 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1660 }
1661 else if ( aExp <= 0x7E ) {
1662 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1663 return 0;
1664 }
1665 aSig64 = aSig | 0x00800000;
1666 aSig64 <<= 40;
1667 z = aSig64>>( - shiftCount );
bb98fe42 1668 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
158142c2
FB
1669 STATUS(float_exception_flags) |= float_flag_inexact;
1670 }
1671 if ( aSign ) z = - z;
1672 return z;
1673
1674}
1675
1676/*----------------------------------------------------------------------------
1677| Returns the result of converting the single-precision floating-point value
1678| `a' to the double-precision floating-point format. The conversion is
1679| performed according to the IEC/IEEE Standard for Binary Floating-Point
1680| Arithmetic.
1681*----------------------------------------------------------------------------*/
1682
1683float64 float32_to_float64( float32 a STATUS_PARAM )
1684{
1685 flag aSign;
94a49d86 1686 int_fast16_t aExp;
bb98fe42 1687 uint32_t aSig;
37d18660 1688 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1689
1690 aSig = extractFloat32Frac( a );
1691 aExp = extractFloat32Exp( a );
1692 aSign = extractFloat32Sign( a );
1693 if ( aExp == 0xFF ) {
bcd4d9af 1694 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1695 return packFloat64( aSign, 0x7FF, 0 );
1696 }
1697 if ( aExp == 0 ) {
1698 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1699 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1700 --aExp;
1701 }
bb98fe42 1702 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1703
1704}
1705
158142c2
FB
1706/*----------------------------------------------------------------------------
1707| Returns the result of converting the single-precision floating-point value
1708| `a' to the extended double-precision floating-point format. The conversion
1709| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1710| Arithmetic.
1711*----------------------------------------------------------------------------*/
1712
1713floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1714{
1715 flag aSign;
94a49d86 1716 int_fast16_t aExp;
bb98fe42 1717 uint32_t aSig;
158142c2 1718
37d18660 1719 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1720 aSig = extractFloat32Frac( a );
1721 aExp = extractFloat32Exp( a );
1722 aSign = extractFloat32Sign( a );
1723 if ( aExp == 0xFF ) {
bcd4d9af 1724 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1725 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1726 }
1727 if ( aExp == 0 ) {
1728 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1729 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1730 }
1731 aSig |= 0x00800000;
bb98fe42 1732 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1733
1734}
1735
158142c2
FB
1736/*----------------------------------------------------------------------------
1737| Returns the result of converting the single-precision floating-point value
1738| `a' to the double-precision floating-point format. The conversion is
1739| performed according to the IEC/IEEE Standard for Binary Floating-Point
1740| Arithmetic.
1741*----------------------------------------------------------------------------*/
1742
1743float128 float32_to_float128( float32 a STATUS_PARAM )
1744{
1745 flag aSign;
94a49d86 1746 int_fast16_t aExp;
bb98fe42 1747 uint32_t aSig;
158142c2 1748
37d18660 1749 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1750 aSig = extractFloat32Frac( a );
1751 aExp = extractFloat32Exp( a );
1752 aSign = extractFloat32Sign( a );
1753 if ( aExp == 0xFF ) {
bcd4d9af 1754 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1755 return packFloat128( aSign, 0x7FFF, 0, 0 );
1756 }
1757 if ( aExp == 0 ) {
1758 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1759 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1760 --aExp;
1761 }
bb98fe42 1762 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1763
1764}
1765
158142c2
FB
1766/*----------------------------------------------------------------------------
1767| Rounds the single-precision floating-point value `a' to an integer, and
1768| returns the result as a single-precision floating-point value. The
1769| operation is performed according to the IEC/IEEE Standard for Binary
1770| Floating-Point Arithmetic.
1771*----------------------------------------------------------------------------*/
1772
1773float32 float32_round_to_int( float32 a STATUS_PARAM)
1774{
1775 flag aSign;
94a49d86 1776 int_fast16_t aExp;
bb98fe42 1777 uint32_t lastBitMask, roundBitsMask;
bb98fe42 1778 uint32_t z;
37d18660 1779 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1780
1781 aExp = extractFloat32Exp( a );
1782 if ( 0x96 <= aExp ) {
1783 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1784 return propagateFloat32NaN( a, a STATUS_VAR );
1785 }
1786 return a;
1787 }
1788 if ( aExp <= 0x7E ) {
bb98fe42 1789 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
158142c2
FB
1790 STATUS(float_exception_flags) |= float_flag_inexact;
1791 aSign = extractFloat32Sign( a );
1792 switch ( STATUS(float_rounding_mode) ) {
1793 case float_round_nearest_even:
1794 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1795 return packFloat32( aSign, 0x7F, 0 );
1796 }
1797 break;
f9288a76
PM
1798 case float_round_ties_away:
1799 if (aExp == 0x7E) {
1800 return packFloat32(aSign, 0x7F, 0);
1801 }
1802 break;
158142c2 1803 case float_round_down:
f090c9d4 1804 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1805 case float_round_up:
f090c9d4 1806 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1807 }
1808 return packFloat32( aSign, 0, 0 );
1809 }
1810 lastBitMask = 1;
1811 lastBitMask <<= 0x96 - aExp;
1812 roundBitsMask = lastBitMask - 1;
f090c9d4 1813 z = float32_val(a);
dc355b76
PM
1814 switch (STATUS(float_rounding_mode)) {
1815 case float_round_nearest_even:
158142c2 1816 z += lastBitMask>>1;
dc355b76
PM
1817 if ((z & roundBitsMask) == 0) {
1818 z &= ~lastBitMask;
1819 }
1820 break;
f9288a76
PM
1821 case float_round_ties_away:
1822 z += lastBitMask >> 1;
1823 break;
dc355b76
PM
1824 case float_round_to_zero:
1825 break;
1826 case float_round_up:
1827 if (!extractFloat32Sign(make_float32(z))) {
1828 z += roundBitsMask;
1829 }
1830 break;
1831 case float_round_down:
1832 if (extractFloat32Sign(make_float32(z))) {
158142c2
FB
1833 z += roundBitsMask;
1834 }
dc355b76
PM
1835 break;
1836 default:
1837 abort();
158142c2
FB
1838 }
1839 z &= ~ roundBitsMask;
f090c9d4
PB
1840 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1841 return make_float32(z);
158142c2
FB
1842
1843}
1844
1845/*----------------------------------------------------------------------------
1846| Returns the result of adding the absolute values of the single-precision
1847| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1848| before being returned. `zSign' is ignored if the result is a NaN.
1849| The addition is performed according to the IEC/IEEE Standard for Binary
1850| Floating-Point Arithmetic.
1851*----------------------------------------------------------------------------*/
1852
1853static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1854{
94a49d86 1855 int_fast16_t aExp, bExp, zExp;
bb98fe42 1856 uint32_t aSig, bSig, zSig;
94a49d86 1857 int_fast16_t expDiff;
158142c2
FB
1858
1859 aSig = extractFloat32Frac( a );
1860 aExp = extractFloat32Exp( a );
1861 bSig = extractFloat32Frac( b );
1862 bExp = extractFloat32Exp( b );
1863 expDiff = aExp - bExp;
1864 aSig <<= 6;
1865 bSig <<= 6;
1866 if ( 0 < expDiff ) {
1867 if ( aExp == 0xFF ) {
1868 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1869 return a;
1870 }
1871 if ( bExp == 0 ) {
1872 --expDiff;
1873 }
1874 else {
1875 bSig |= 0x20000000;
1876 }
1877 shift32RightJamming( bSig, expDiff, &bSig );
1878 zExp = aExp;
1879 }
1880 else if ( expDiff < 0 ) {
1881 if ( bExp == 0xFF ) {
1882 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1883 return packFloat32( zSign, 0xFF, 0 );
1884 }
1885 if ( aExp == 0 ) {
1886 ++expDiff;
1887 }
1888 else {
1889 aSig |= 0x20000000;
1890 }
1891 shift32RightJamming( aSig, - expDiff, &aSig );
1892 zExp = bExp;
1893 }
1894 else {
1895 if ( aExp == 0xFF ) {
1896 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897 return a;
1898 }
fe76d976 1899 if ( aExp == 0 ) {
e6afc87f
PM
1900 if (STATUS(flush_to_zero)) {
1901 if (aSig | bSig) {
1902 float_raise(float_flag_output_denormal STATUS_VAR);
1903 }
1904 return packFloat32(zSign, 0, 0);
1905 }
fe76d976
PB
1906 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1907 }
158142c2
FB
1908 zSig = 0x40000000 + aSig + bSig;
1909 zExp = aExp;
1910 goto roundAndPack;
1911 }
1912 aSig |= 0x20000000;
1913 zSig = ( aSig + bSig )<<1;
1914 --zExp;
bb98fe42 1915 if ( (int32_t) zSig < 0 ) {
158142c2
FB
1916 zSig = aSig + bSig;
1917 ++zExp;
1918 }
1919 roundAndPack:
1920 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1921
1922}
1923
1924/*----------------------------------------------------------------------------
1925| Returns the result of subtracting the absolute values of the single-
1926| precision floating-point values `a' and `b'. If `zSign' is 1, the
1927| difference is negated before being returned. `zSign' is ignored if the
1928| result is a NaN. The subtraction is performed according to the IEC/IEEE
1929| Standard for Binary Floating-Point Arithmetic.
1930*----------------------------------------------------------------------------*/
1931
1932static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1933{
94a49d86 1934 int_fast16_t aExp, bExp, zExp;
bb98fe42 1935 uint32_t aSig, bSig, zSig;
94a49d86 1936 int_fast16_t expDiff;
158142c2
FB
1937
1938 aSig = extractFloat32Frac( a );
1939 aExp = extractFloat32Exp( a );
1940 bSig = extractFloat32Frac( b );
1941 bExp = extractFloat32Exp( b );
1942 expDiff = aExp - bExp;
1943 aSig <<= 7;
1944 bSig <<= 7;
1945 if ( 0 < expDiff ) goto aExpBigger;
1946 if ( expDiff < 0 ) goto bExpBigger;
1947 if ( aExp == 0xFF ) {
1948 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1949 float_raise( float_flag_invalid STATUS_VAR);
1950 return float32_default_nan;
1951 }
1952 if ( aExp == 0 ) {
1953 aExp = 1;
1954 bExp = 1;
1955 }
1956 if ( bSig < aSig ) goto aBigger;
1957 if ( aSig < bSig ) goto bBigger;
1958 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1959 bExpBigger:
1960 if ( bExp == 0xFF ) {
1961 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1962 return packFloat32( zSign ^ 1, 0xFF, 0 );
1963 }
1964 if ( aExp == 0 ) {
1965 ++expDiff;
1966 }
1967 else {
1968 aSig |= 0x40000000;
1969 }
1970 shift32RightJamming( aSig, - expDiff, &aSig );
1971 bSig |= 0x40000000;
1972 bBigger:
1973 zSig = bSig - aSig;
1974 zExp = bExp;
1975 zSign ^= 1;
1976 goto normalizeRoundAndPack;
1977 aExpBigger:
1978 if ( aExp == 0xFF ) {
1979 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1980 return a;
1981 }
1982 if ( bExp == 0 ) {
1983 --expDiff;
1984 }
1985 else {
1986 bSig |= 0x40000000;
1987 }
1988 shift32RightJamming( bSig, expDiff, &bSig );
1989 aSig |= 0x40000000;
1990 aBigger:
1991 zSig = aSig - bSig;
1992 zExp = aExp;
1993 normalizeRoundAndPack:
1994 --zExp;
1995 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1996
1997}
1998
1999/*----------------------------------------------------------------------------
2000| Returns the result of adding the single-precision floating-point values `a'
2001| and `b'. The operation is performed according to the IEC/IEEE Standard for
2002| Binary Floating-Point Arithmetic.
2003*----------------------------------------------------------------------------*/
2004
2005float32 float32_add( float32 a, float32 b STATUS_PARAM )
2006{
2007 flag aSign, bSign;
37d18660
PM
2008 a = float32_squash_input_denormal(a STATUS_VAR);
2009 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2010
2011 aSign = extractFloat32Sign( a );
2012 bSign = extractFloat32Sign( b );
2013 if ( aSign == bSign ) {
2014 return addFloat32Sigs( a, b, aSign STATUS_VAR);
2015 }
2016 else {
2017 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2018 }
2019
2020}
2021
2022/*----------------------------------------------------------------------------
2023| Returns the result of subtracting the single-precision floating-point values
2024| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2025| for Binary Floating-Point Arithmetic.
2026*----------------------------------------------------------------------------*/
2027
2028float32 float32_sub( float32 a, float32 b STATUS_PARAM )
2029{
2030 flag aSign, bSign;
37d18660
PM
2031 a = float32_squash_input_denormal(a STATUS_VAR);
2032 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2033
2034 aSign = extractFloat32Sign( a );
2035 bSign = extractFloat32Sign( b );
2036 if ( aSign == bSign ) {
2037 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2038 }
2039 else {
2040 return addFloat32Sigs( a, b, aSign STATUS_VAR );
2041 }
2042
2043}
2044
2045/*----------------------------------------------------------------------------
2046| Returns the result of multiplying the single-precision floating-point values
2047| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2048| for Binary Floating-Point Arithmetic.
2049*----------------------------------------------------------------------------*/
2050
2051float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2052{
2053 flag aSign, bSign, zSign;
94a49d86 2054 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
2055 uint32_t aSig, bSig;
2056 uint64_t zSig64;
2057 uint32_t zSig;
158142c2 2058
37d18660
PM
2059 a = float32_squash_input_denormal(a STATUS_VAR);
2060 b = float32_squash_input_denormal(b STATUS_VAR);
2061
158142c2
FB
2062 aSig = extractFloat32Frac( a );
2063 aExp = extractFloat32Exp( a );
2064 aSign = extractFloat32Sign( a );
2065 bSig = extractFloat32Frac( b );
2066 bExp = extractFloat32Exp( b );
2067 bSign = extractFloat32Sign( b );
2068 zSign = aSign ^ bSign;
2069 if ( aExp == 0xFF ) {
2070 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2071 return propagateFloat32NaN( a, b STATUS_VAR );
2072 }
2073 if ( ( bExp | bSig ) == 0 ) {
2074 float_raise( float_flag_invalid STATUS_VAR);
2075 return float32_default_nan;
2076 }
2077 return packFloat32( zSign, 0xFF, 0 );
2078 }
2079 if ( bExp == 0xFF ) {
2080 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2081 if ( ( aExp | aSig ) == 0 ) {
2082 float_raise( float_flag_invalid STATUS_VAR);
2083 return float32_default_nan;
2084 }
2085 return packFloat32( zSign, 0xFF, 0 );
2086 }
2087 if ( aExp == 0 ) {
2088 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2089 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2090 }
2091 if ( bExp == 0 ) {
2092 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2093 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2094 }
2095 zExp = aExp + bExp - 0x7F;
2096 aSig = ( aSig | 0x00800000 )<<7;
2097 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 2098 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 2099 zSig = zSig64;
bb98fe42 2100 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
2101 zSig <<= 1;
2102 --zExp;
2103 }
2104 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2105
2106}
2107
2108/*----------------------------------------------------------------------------
2109| Returns the result of dividing the single-precision floating-point value `a'
2110| by the corresponding value `b'. The operation is performed according to the
2111| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2112*----------------------------------------------------------------------------*/
2113
2114float32 float32_div( float32 a, float32 b STATUS_PARAM )
2115{
2116 flag aSign, bSign, zSign;
94a49d86 2117 int_fast16_t aExp, bExp, zExp;
bb98fe42 2118 uint32_t aSig, bSig, zSig;
37d18660
PM
2119 a = float32_squash_input_denormal(a STATUS_VAR);
2120 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2121
2122 aSig = extractFloat32Frac( a );
2123 aExp = extractFloat32Exp( a );
2124 aSign = extractFloat32Sign( a );
2125 bSig = extractFloat32Frac( b );
2126 bExp = extractFloat32Exp( b );
2127 bSign = extractFloat32Sign( b );
2128 zSign = aSign ^ bSign;
2129 if ( aExp == 0xFF ) {
2130 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2131 if ( bExp == 0xFF ) {
2132 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2133 float_raise( float_flag_invalid STATUS_VAR);
2134 return float32_default_nan;
2135 }
2136 return packFloat32( zSign, 0xFF, 0 );
2137 }
2138 if ( bExp == 0xFF ) {
2139 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2140 return packFloat32( zSign, 0, 0 );
2141 }
2142 if ( bExp == 0 ) {
2143 if ( bSig == 0 ) {
2144 if ( ( aExp | aSig ) == 0 ) {
2145 float_raise( float_flag_invalid STATUS_VAR);
2146 return float32_default_nan;
2147 }
2148 float_raise( float_flag_divbyzero STATUS_VAR);
2149 return packFloat32( zSign, 0xFF, 0 );
2150 }
2151 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2152 }
2153 if ( aExp == 0 ) {
2154 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2155 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2156 }
2157 zExp = aExp - bExp + 0x7D;
2158 aSig = ( aSig | 0x00800000 )<<7;
2159 bSig = ( bSig | 0x00800000 )<<8;
2160 if ( bSig <= ( aSig + aSig ) ) {
2161 aSig >>= 1;
2162 ++zExp;
2163 }
bb98fe42 2164 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2165 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2166 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2
FB
2167 }
2168 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2169
2170}
2171
2172/*----------------------------------------------------------------------------
2173| Returns the remainder of the single-precision floating-point value `a'
2174| with respect to the corresponding value `b'. The operation is performed
2175| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176*----------------------------------------------------------------------------*/
2177
2178float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2179{
ed086f3d 2180 flag aSign, zSign;
94a49d86 2181 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
2182 uint32_t aSig, bSig;
2183 uint32_t q;
2184 uint64_t aSig64, bSig64, q64;
2185 uint32_t alternateASig;
2186 int32_t sigMean;
37d18660
PM
2187 a = float32_squash_input_denormal(a STATUS_VAR);
2188 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2189
2190 aSig = extractFloat32Frac( a );
2191 aExp = extractFloat32Exp( a );
2192 aSign = extractFloat32Sign( a );
2193 bSig = extractFloat32Frac( b );
2194 bExp = extractFloat32Exp( b );
158142c2
FB
2195 if ( aExp == 0xFF ) {
2196 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2197 return propagateFloat32NaN( a, b STATUS_VAR );
2198 }
2199 float_raise( float_flag_invalid STATUS_VAR);
2200 return float32_default_nan;
2201 }
2202 if ( bExp == 0xFF ) {
2203 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2204 return a;
2205 }
2206 if ( bExp == 0 ) {
2207 if ( bSig == 0 ) {
2208 float_raise( float_flag_invalid STATUS_VAR);
2209 return float32_default_nan;
2210 }
2211 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2212 }
2213 if ( aExp == 0 ) {
2214 if ( aSig == 0 ) return a;
2215 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2216 }
2217 expDiff = aExp - bExp;
2218 aSig |= 0x00800000;
2219 bSig |= 0x00800000;
2220 if ( expDiff < 32 ) {
2221 aSig <<= 8;
2222 bSig <<= 8;
2223 if ( expDiff < 0 ) {
2224 if ( expDiff < -1 ) return a;
2225 aSig >>= 1;
2226 }
2227 q = ( bSig <= aSig );
2228 if ( q ) aSig -= bSig;
2229 if ( 0 < expDiff ) {
bb98fe42 2230 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2231 q >>= 32 - expDiff;
2232 bSig >>= 2;
2233 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2234 }
2235 else {
2236 aSig >>= 2;
2237 bSig >>= 2;
2238 }
2239 }
2240 else {
2241 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2242 aSig64 = ( (uint64_t) aSig )<<40;
2243 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2244 expDiff -= 64;
2245 while ( 0 < expDiff ) {
2246 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2247 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2248 aSig64 = - ( ( bSig * q64 )<<38 );
2249 expDiff -= 62;
2250 }
2251 expDiff += 64;
2252 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2253 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2254 q = q64>>( 64 - expDiff );
2255 bSig <<= 6;
2256 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2257 }
2258 do {
2259 alternateASig = aSig;
2260 ++q;
2261 aSig -= bSig;
bb98fe42 2262 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2263 sigMean = aSig + alternateASig;
2264 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2265 aSig = alternateASig;
2266 }
bb98fe42 2267 zSign = ( (int32_t) aSig < 0 );
158142c2
FB
2268 if ( zSign ) aSig = - aSig;
2269 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2270
2271}
2272
369be8f6
PM
2273/*----------------------------------------------------------------------------
2274| Returns the result of multiplying the single-precision floating-point values
2275| `a' and `b' then adding 'c', with no intermediate rounding step after the
2276| multiplication. The operation is performed according to the IEC/IEEE
2277| Standard for Binary Floating-Point Arithmetic 754-2008.
2278| The flags argument allows the caller to select negation of the
2279| addend, the intermediate product, or the final result. (The difference
2280| between this and having the caller do a separate negation is that negating
2281| externally will flip the sign bit on NaNs.)
2282*----------------------------------------------------------------------------*/
2283
2284float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2285{
2286 flag aSign, bSign, cSign, zSign;
94a49d86 2287 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2288 uint32_t aSig, bSig, cSig;
2289 flag pInf, pZero, pSign;
2290 uint64_t pSig64, cSig64, zSig64;
2291 uint32_t pSig;
2292 int shiftcount;
2293 flag signflip, infzero;
2294
2295 a = float32_squash_input_denormal(a STATUS_VAR);
2296 b = float32_squash_input_denormal(b STATUS_VAR);
2297 c = float32_squash_input_denormal(c STATUS_VAR);
2298 aSig = extractFloat32Frac(a);
2299 aExp = extractFloat32Exp(a);
2300 aSign = extractFloat32Sign(a);
2301 bSig = extractFloat32Frac(b);
2302 bExp = extractFloat32Exp(b);
2303 bSign = extractFloat32Sign(b);
2304 cSig = extractFloat32Frac(c);
2305 cExp = extractFloat32Exp(c);
2306 cSign = extractFloat32Sign(c);
2307
2308 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2309 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2310
2311 /* It is implementation-defined whether the cases of (0,inf,qnan)
2312 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2313 * they return if they do), so we have to hand this information
2314 * off to the target-specific pick-a-NaN routine.
2315 */
2316 if (((aExp == 0xff) && aSig) ||
2317 ((bExp == 0xff) && bSig) ||
2318 ((cExp == 0xff) && cSig)) {
2319 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2320 }
2321
2322 if (infzero) {
2323 float_raise(float_flag_invalid STATUS_VAR);
2324 return float32_default_nan;
2325 }
2326
2327 if (flags & float_muladd_negate_c) {
2328 cSign ^= 1;
2329 }
2330
2331 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2332
2333 /* Work out the sign and type of the product */
2334 pSign = aSign ^ bSign;
2335 if (flags & float_muladd_negate_product) {
2336 pSign ^= 1;
2337 }
2338 pInf = (aExp == 0xff) || (bExp == 0xff);
2339 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2340
2341 if (cExp == 0xff) {
2342 if (pInf && (pSign ^ cSign)) {
2343 /* addition of opposite-signed infinities => InvalidOperation */
2344 float_raise(float_flag_invalid STATUS_VAR);
2345 return float32_default_nan;
2346 }
2347 /* Otherwise generate an infinity of the same sign */
2348 return packFloat32(cSign ^ signflip, 0xff, 0);
2349 }
2350
2351 if (pInf) {
2352 return packFloat32(pSign ^ signflip, 0xff, 0);
2353 }
2354
2355 if (pZero) {
2356 if (cExp == 0) {
2357 if (cSig == 0) {
2358 /* Adding two exact zeroes */
2359 if (pSign == cSign) {
2360 zSign = pSign;
2361 } else if (STATUS(float_rounding_mode) == float_round_down) {
2362 zSign = 1;
2363 } else {
2364 zSign = 0;
2365 }
2366 return packFloat32(zSign ^ signflip, 0, 0);
2367 }
2368 /* Exact zero plus a denorm */
2369 if (STATUS(flush_to_zero)) {
2370 float_raise(float_flag_output_denormal STATUS_VAR);
2371 return packFloat32(cSign ^ signflip, 0, 0);
2372 }
2373 }
2374 /* Zero plus something non-zero : just return the something */
67d43538
PM
2375 if (flags & float_muladd_halve_result) {
2376 if (cExp == 0) {
2377 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2378 }
2379 /* Subtract one to halve, and one again because roundAndPackFloat32
2380 * wants one less than the true exponent.
2381 */
2382 cExp -= 2;
2383 cSig = (cSig | 0x00800000) << 7;
2384 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
2385 }
a6e7c184 2386 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2387 }
2388
2389 if (aExp == 0) {
2390 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2391 }
2392 if (bExp == 0) {
2393 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2394 }
2395
2396 /* Calculate the actual result a * b + c */
2397
2398 /* Multiply first; this is easy. */
2399 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2400 * because we want the true exponent, not the "one-less-than"
2401 * flavour that roundAndPackFloat32() takes.
2402 */
2403 pExp = aExp + bExp - 0x7e;
2404 aSig = (aSig | 0x00800000) << 7;
2405 bSig = (bSig | 0x00800000) << 8;
2406 pSig64 = (uint64_t)aSig * bSig;
2407 if ((int64_t)(pSig64 << 1) >= 0) {
2408 pSig64 <<= 1;
2409 pExp--;
2410 }
2411
2412 zSign = pSign ^ signflip;
2413
2414 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2415 * position 62.
2416 */
2417 if (cExp == 0) {
2418 if (!cSig) {
2419 /* Throw out the special case of c being an exact zero now */
2420 shift64RightJamming(pSig64, 32, &pSig64);
2421 pSig = pSig64;
67d43538
PM
2422 if (flags & float_muladd_halve_result) {
2423 pExp--;
2424 }
369be8f6
PM
2425 return roundAndPackFloat32(zSign, pExp - 1,
2426 pSig STATUS_VAR);
2427 }
2428 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2429 }
2430
2431 cSig64 = (uint64_t)cSig << (62 - 23);
2432 cSig64 |= LIT64(0x4000000000000000);
2433 expDiff = pExp - cExp;
2434
2435 if (pSign == cSign) {
2436 /* Addition */
2437 if (expDiff > 0) {
2438 /* scale c to match p */
2439 shift64RightJamming(cSig64, expDiff, &cSig64);
2440 zExp = pExp;
2441 } else if (expDiff < 0) {
2442 /* scale p to match c */
2443 shift64RightJamming(pSig64, -expDiff, &pSig64);
2444 zExp = cExp;
2445 } else {
2446 /* no scaling needed */
2447 zExp = cExp;
2448 }
2449 /* Add significands and make sure explicit bit ends up in posn 62 */
2450 zSig64 = pSig64 + cSig64;
2451 if ((int64_t)zSig64 < 0) {
2452 shift64RightJamming(zSig64, 1, &zSig64);
2453 } else {
2454 zExp--;
2455 }
2456 } else {
2457 /* Subtraction */
2458 if (expDiff > 0) {
2459 shift64RightJamming(cSig64, expDiff, &cSig64);
2460 zSig64 = pSig64 - cSig64;
2461 zExp = pExp;
2462 } else if (expDiff < 0) {
2463 shift64RightJamming(pSig64, -expDiff, &pSig64);
2464 zSig64 = cSig64 - pSig64;
2465 zExp = cExp;
2466 zSign ^= 1;
2467 } else {
2468 zExp = pExp;
2469 if (cSig64 < pSig64) {
2470 zSig64 = pSig64 - cSig64;
2471 } else if (pSig64 < cSig64) {
2472 zSig64 = cSig64 - pSig64;
2473 zSign ^= 1;
2474 } else {
2475 /* Exact zero */
2476 zSign = signflip;
2477 if (STATUS(float_rounding_mode) == float_round_down) {
2478 zSign ^= 1;
2479 }
2480 return packFloat32(zSign, 0, 0);
2481 }
2482 }
2483 --zExp;
2484 /* Normalize to put the explicit bit back into bit 62. */
2485 shiftcount = countLeadingZeros64(zSig64) - 1;
2486 zSig64 <<= shiftcount;
2487 zExp -= shiftcount;
2488 }
67d43538
PM
2489 if (flags & float_muladd_halve_result) {
2490 zExp--;
2491 }
2492
369be8f6
PM
2493 shift64RightJamming(zSig64, 32, &zSig64);
2494 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2495}
2496
2497
158142c2
FB
2498/*----------------------------------------------------------------------------
2499| Returns the square root of the single-precision floating-point value `a'.
2500| The operation is performed according to the IEC/IEEE Standard for Binary
2501| Floating-Point Arithmetic.
2502*----------------------------------------------------------------------------*/
2503
2504float32 float32_sqrt( float32 a STATUS_PARAM )
2505{
2506 flag aSign;
94a49d86 2507 int_fast16_t aExp, zExp;
bb98fe42
AF
2508 uint32_t aSig, zSig;
2509 uint64_t rem, term;
37d18660 2510 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2511
2512 aSig = extractFloat32Frac( a );
2513 aExp = extractFloat32Exp( a );
2514 aSign = extractFloat32Sign( a );
2515 if ( aExp == 0xFF ) {
f090c9d4 2516 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
158142c2
FB
2517 if ( ! aSign ) return a;
2518 float_raise( float_flag_invalid STATUS_VAR);
2519 return float32_default_nan;
2520 }
2521 if ( aSign ) {
2522 if ( ( aExp | aSig ) == 0 ) return a;
2523 float_raise( float_flag_invalid STATUS_VAR);
2524 return float32_default_nan;
2525 }
2526 if ( aExp == 0 ) {
f090c9d4 2527 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2528 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2529 }
2530 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2531 aSig = ( aSig | 0x00800000 )<<8;
2532 zSig = estimateSqrt32( aExp, aSig ) + 2;
2533 if ( ( zSig & 0x7F ) <= 5 ) {
2534 if ( zSig < 2 ) {
2535 zSig = 0x7FFFFFFF;
2536 goto roundAndPack;
2537 }
2538 aSig >>= aExp & 1;
bb98fe42
AF
2539 term = ( (uint64_t) zSig ) * zSig;
2540 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2541 while ( (int64_t) rem < 0 ) {
158142c2 2542 --zSig;
bb98fe42 2543 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2544 }
2545 zSig |= ( rem != 0 );
2546 }
2547 shift32RightJamming( zSig, 1, &zSig );
2548 roundAndPack:
2549 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2550
2551}
2552
8229c991
AJ
2553/*----------------------------------------------------------------------------
2554| Returns the binary exponential of the single-precision floating-point value
2555| `a'. The operation is performed according to the IEC/IEEE Standard for
2556| Binary Floating-Point Arithmetic.
2557|
2558| Uses the following identities:
2559|
2560| 1. -------------------------------------------------------------------------
2561|      x    x*ln(2)
2562|     2  = e
2563|
2564| 2. -------------------------------------------------------------------------
2565|                      2     3     4     5           n
2566|      x        x     x     x     x     x           x
2567|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2568|               1!    2!    3!    4!    5!          n!
2569*----------------------------------------------------------------------------*/
2570
2571static const float64 float32_exp2_coefficients[15] =
2572{
d5138cf4
PM
2573 const_float64( 0x3ff0000000000000ll ), /* 1 */
2574 const_float64( 0x3fe0000000000000ll ), /* 2 */
2575 const_float64( 0x3fc5555555555555ll ), /* 3 */
2576 const_float64( 0x3fa5555555555555ll ), /* 4 */
2577 const_float64( 0x3f81111111111111ll ), /* 5 */
2578 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2579 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2580 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2581 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2582 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2583 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2584 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2585 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2586 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2587 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2588};
2589
2590float32 float32_exp2( float32 a STATUS_PARAM )
2591{
2592 flag aSign;
94a49d86 2593 int_fast16_t aExp;
bb98fe42 2594 uint32_t aSig;
8229c991
AJ
2595 float64 r, x, xn;
2596 int i;
37d18660 2597 a = float32_squash_input_denormal(a STATUS_VAR);
8229c991
AJ
2598
2599 aSig = extractFloat32Frac( a );
2600 aExp = extractFloat32Exp( a );
2601 aSign = extractFloat32Sign( a );
2602
2603 if ( aExp == 0xFF) {
2604 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2605 return (aSign) ? float32_zero : a;
2606 }
2607 if (aExp == 0) {
2608 if (aSig == 0) return float32_one;
2609 }
2610
2611 float_raise( float_flag_inexact STATUS_VAR);
2612
2613 /* ******************************* */
2614 /* using float64 for approximation */
2615 /* ******************************* */
2616 x = float32_to_float64(a STATUS_VAR);
2617 x = float64_mul(x, float64_ln2 STATUS_VAR);
2618
2619 xn = x;
2620 r = float64_one;
2621 for (i = 0 ; i < 15 ; i++) {
2622 float64 f;
2623
2624 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2625 r = float64_add(r, f STATUS_VAR);
2626
2627 xn = float64_mul(xn, x STATUS_VAR);
2628 }
2629
2630 return float64_to_float32(r, status);
2631}
2632
374dfc33
AJ
2633/*----------------------------------------------------------------------------
2634| Returns the binary log of the single-precision floating-point value `a'.
2635| The operation is performed according to the IEC/IEEE Standard for Binary
2636| Floating-Point Arithmetic.
2637*----------------------------------------------------------------------------*/
2638float32 float32_log2( float32 a STATUS_PARAM )
2639{
2640 flag aSign, zSign;
94a49d86 2641 int_fast16_t aExp;
bb98fe42 2642 uint32_t aSig, zSig, i;
374dfc33 2643
37d18660 2644 a = float32_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
2645 aSig = extractFloat32Frac( a );
2646 aExp = extractFloat32Exp( a );
2647 aSign = extractFloat32Sign( a );
2648
2649 if ( aExp == 0 ) {
2650 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2651 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2652 }
2653 if ( aSign ) {
2654 float_raise( float_flag_invalid STATUS_VAR);
2655 return float32_default_nan;
2656 }
2657 if ( aExp == 0xFF ) {
2658 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2659 return a;
2660 }
2661
2662 aExp -= 0x7F;
2663 aSig |= 0x00800000;
2664 zSign = aExp < 0;
2665 zSig = aExp << 23;
2666
2667 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2668 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2669 if ( aSig & 0x01000000 ) {
2670 aSig >>= 1;
2671 zSig |= i;
2672 }
2673 }
2674
2675 if ( zSign )
2676 zSig = -zSig;
2677
2678 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2679}
2680
158142c2
FB
2681/*----------------------------------------------------------------------------
2682| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2683| the corresponding value `b', and 0 otherwise. The invalid exception is
2684| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2685| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2686*----------------------------------------------------------------------------*/
2687
b689362d 2688int float32_eq( float32 a, float32 b STATUS_PARAM )
158142c2 2689{
b689362d 2690 uint32_t av, bv;
37d18660
PM
2691 a = float32_squash_input_denormal(a STATUS_VAR);
2692 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2693
2694 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2695 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2696 ) {
b689362d 2697 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
2698 return 0;
2699 }
b689362d
AJ
2700 av = float32_val(a);
2701 bv = float32_val(b);
2702 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2703}
2704
2705/*----------------------------------------------------------------------------
2706| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2707| or equal to the corresponding value `b', and 0 otherwise. The invalid
2708| exception is raised if either operand is a NaN. The comparison is performed
2709| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2710*----------------------------------------------------------------------------*/
2711
750afe93 2712int float32_le( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2713{
2714 flag aSign, bSign;
bb98fe42 2715 uint32_t av, bv;
37d18660
PM
2716 a = float32_squash_input_denormal(a STATUS_VAR);
2717 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2718
2719 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2720 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2721 ) {
2722 float_raise( float_flag_invalid STATUS_VAR);
2723 return 0;
2724 }
2725 aSign = extractFloat32Sign( a );
2726 bSign = extractFloat32Sign( b );
f090c9d4
PB
2727 av = float32_val(a);
2728 bv = float32_val(b);
bb98fe42 2729 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2730 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2731
2732}
2733
2734/*----------------------------------------------------------------------------
2735| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2736| the corresponding value `b', and 0 otherwise. The invalid exception is
2737| raised if either operand is a NaN. The comparison is performed according
2738| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2739*----------------------------------------------------------------------------*/
2740
750afe93 2741int float32_lt( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2742{
2743 flag aSign, bSign;
bb98fe42 2744 uint32_t av, bv;
37d18660
PM
2745 a = float32_squash_input_denormal(a STATUS_VAR);
2746 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2747
2748 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2749 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2750 ) {
2751 float_raise( float_flag_invalid STATUS_VAR);
2752 return 0;
2753 }
2754 aSign = extractFloat32Sign( a );
2755 bSign = extractFloat32Sign( b );
f090c9d4
PB
2756 av = float32_val(a);
2757 bv = float32_val(b);
bb98fe42 2758 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2759 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2760
2761}
2762
67b7861d
AJ
2763/*----------------------------------------------------------------------------
2764| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2765| be compared, and 0 otherwise. The invalid exception is raised if either
2766| operand is a NaN. The comparison is performed according to the IEC/IEEE
2767| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2768*----------------------------------------------------------------------------*/
2769
2770int float32_unordered( float32 a, float32 b STATUS_PARAM )
2771{
2772 a = float32_squash_input_denormal(a STATUS_VAR);
2773 b = float32_squash_input_denormal(b STATUS_VAR);
2774
2775 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2776 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2777 ) {
2778 float_raise( float_flag_invalid STATUS_VAR);
2779 return 1;
2780 }
2781 return 0;
2782}
b689362d 2783
158142c2
FB
2784/*----------------------------------------------------------------------------
2785| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2786| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2787| exception. The comparison is performed according to the IEC/IEEE Standard
2788| for Binary Floating-Point Arithmetic.
158142c2
FB
2789*----------------------------------------------------------------------------*/
2790
b689362d 2791int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
158142c2 2792{
37d18660
PM
2793 a = float32_squash_input_denormal(a STATUS_VAR);
2794 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2795
2796 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2797 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2798 ) {
b689362d
AJ
2799 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2800 float_raise( float_flag_invalid STATUS_VAR);
2801 }
158142c2
FB
2802 return 0;
2803 }
b689362d
AJ
2804 return ( float32_val(a) == float32_val(b) ) ||
2805 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2806}
2807
2808/*----------------------------------------------------------------------------
2809| Returns 1 if the single-precision floating-point value `a' is less than or
2810| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2811| cause an exception. Otherwise, the comparison is performed according to the
2812| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2813*----------------------------------------------------------------------------*/
2814
750afe93 2815int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2816{
2817 flag aSign, bSign;
bb98fe42 2818 uint32_t av, bv;
37d18660
PM
2819 a = float32_squash_input_denormal(a STATUS_VAR);
2820 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2821
2822 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2823 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2824 ) {
2825 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2826 float_raise( float_flag_invalid STATUS_VAR);
2827 }
2828 return 0;
2829 }
2830 aSign = extractFloat32Sign( a );
2831 bSign = extractFloat32Sign( b );
f090c9d4
PB
2832 av = float32_val(a);
2833 bv = float32_val(b);
bb98fe42 2834 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2835 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2836
2837}
2838
2839/*----------------------------------------------------------------------------
2840| Returns 1 if the single-precision floating-point value `a' is less than
2841| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2842| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2843| Standard for Binary Floating-Point Arithmetic.
2844*----------------------------------------------------------------------------*/
2845
750afe93 2846int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2847{
2848 flag aSign, bSign;
bb98fe42 2849 uint32_t av, bv;
37d18660
PM
2850 a = float32_squash_input_denormal(a STATUS_VAR);
2851 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2852
2853 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2854 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2855 ) {
2856 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2857 float_raise( float_flag_invalid STATUS_VAR);
2858 }
2859 return 0;
2860 }
2861 aSign = extractFloat32Sign( a );
2862 bSign = extractFloat32Sign( b );
f090c9d4
PB
2863 av = float32_val(a);
2864 bv = float32_val(b);
bb98fe42 2865 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2866 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2867
2868}
2869
67b7861d
AJ
2870/*----------------------------------------------------------------------------
2871| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2872| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2873| comparison is performed according to the IEC/IEEE Standard for Binary
2874| Floating-Point Arithmetic.
2875*----------------------------------------------------------------------------*/
2876
2877int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2878{
2879 a = float32_squash_input_denormal(a STATUS_VAR);
2880 b = float32_squash_input_denormal(b STATUS_VAR);
2881
2882 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2883 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2884 ) {
2885 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2886 float_raise( float_flag_invalid STATUS_VAR);
2887 }
2888 return 1;
2889 }
2890 return 0;
2891}
2892
158142c2
FB
2893/*----------------------------------------------------------------------------
2894| Returns the result of converting the double-precision floating-point value
2895| `a' to the 32-bit two's complement integer format. The conversion is
2896| performed according to the IEC/IEEE Standard for Binary Floating-Point
2897| Arithmetic---which means in particular that the conversion is rounded
2898| according to the current rounding mode. If `a' is a NaN, the largest
2899| positive integer is returned. Otherwise, if the conversion overflows, the
2900| largest integer with the same sign as `a' is returned.
2901*----------------------------------------------------------------------------*/
2902
2903int32 float64_to_int32( float64 a STATUS_PARAM )
2904{
2905 flag aSign;
94a49d86 2906 int_fast16_t aExp, shiftCount;
bb98fe42 2907 uint64_t aSig;
37d18660 2908 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2909
2910 aSig = extractFloat64Frac( a );
2911 aExp = extractFloat64Exp( a );
2912 aSign = extractFloat64Sign( a );
2913 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2914 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2915 shiftCount = 0x42C - aExp;
2916 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2917 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2918
2919}
2920
2921/*----------------------------------------------------------------------------
2922| Returns the result of converting the double-precision floating-point value
2923| `a' to the 32-bit two's complement integer format. The conversion is
2924| performed according to the IEC/IEEE Standard for Binary Floating-Point
2925| Arithmetic, except that the conversion is always rounded toward zero.
2926| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2927| the conversion overflows, the largest integer with the same sign as `a' is
2928| returned.
2929*----------------------------------------------------------------------------*/
2930
2931int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2932{
2933 flag aSign;
94a49d86 2934 int_fast16_t aExp, shiftCount;
bb98fe42 2935 uint64_t aSig, savedASig;
b3a6a2e0 2936 int32_t z;
37d18660 2937 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2938
2939 aSig = extractFloat64Frac( a );
2940 aExp = extractFloat64Exp( a );
2941 aSign = extractFloat64Sign( a );
2942 if ( 0x41E < aExp ) {
2943 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2944 goto invalid;
2945 }
2946 else if ( aExp < 0x3FF ) {
2947 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2948 return 0;
2949 }
2950 aSig |= LIT64( 0x0010000000000000 );
2951 shiftCount = 0x433 - aExp;
2952 savedASig = aSig;
2953 aSig >>= shiftCount;
2954 z = aSig;
2955 if ( aSign ) z = - z;
2956 if ( ( z < 0 ) ^ aSign ) {
2957 invalid:
2958 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2959 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
2960 }
2961 if ( ( aSig<<shiftCount ) != savedASig ) {
2962 STATUS(float_exception_flags) |= float_flag_inexact;
2963 }
2964 return z;
2965
2966}
2967
cbcef455
PM
2968/*----------------------------------------------------------------------------
2969| Returns the result of converting the double-precision floating-point value
2970| `a' to the 16-bit two's complement integer format. The conversion is
2971| performed according to the IEC/IEEE Standard for Binary Floating-Point
2972| Arithmetic, except that the conversion is always rounded toward zero.
2973| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2974| the conversion overflows, the largest integer with the same sign as `a' is
2975| returned.
2976*----------------------------------------------------------------------------*/
2977
94a49d86 2978int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
2979{
2980 flag aSign;
94a49d86 2981 int_fast16_t aExp, shiftCount;
bb98fe42 2982 uint64_t aSig, savedASig;
cbcef455
PM
2983 int32 z;
2984
2985 aSig = extractFloat64Frac( a );
2986 aExp = extractFloat64Exp( a );
2987 aSign = extractFloat64Sign( a );
2988 if ( 0x40E < aExp ) {
2989 if ( ( aExp == 0x7FF ) && aSig ) {
2990 aSign = 0;
2991 }
2992 goto invalid;
2993 }
2994 else if ( aExp < 0x3FF ) {
2995 if ( aExp || aSig ) {
2996 STATUS(float_exception_flags) |= float_flag_inexact;
2997 }
2998 return 0;
2999 }
3000 aSig |= LIT64( 0x0010000000000000 );
3001 shiftCount = 0x433 - aExp;
3002 savedASig = aSig;
3003 aSig >>= shiftCount;
3004 z = aSig;
3005 if ( aSign ) {
3006 z = - z;
3007 }
3008 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3009 invalid:
3010 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 3011 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
3012 }
3013 if ( ( aSig<<shiftCount ) != savedASig ) {
3014 STATUS(float_exception_flags) |= float_flag_inexact;
3015 }
3016 return z;
3017}
3018
158142c2
FB
3019/*----------------------------------------------------------------------------
3020| Returns the result of converting the double-precision floating-point value
3021| `a' to the 64-bit two's complement integer format. The conversion is
3022| performed according to the IEC/IEEE Standard for Binary Floating-Point
3023| Arithmetic---which means in particular that the conversion is rounded
3024| according to the current rounding mode. If `a' is a NaN, the largest
3025| positive integer is returned. Otherwise, if the conversion overflows, the
3026| largest integer with the same sign as `a' is returned.
3027*----------------------------------------------------------------------------*/
3028
3029int64 float64_to_int64( float64 a STATUS_PARAM )
3030{
3031 flag aSign;
94a49d86 3032 int_fast16_t aExp, shiftCount;
bb98fe42 3033 uint64_t aSig, aSigExtra;
37d18660 3034 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3035
3036 aSig = extractFloat64Frac( a );
3037 aExp = extractFloat64Exp( a );
3038 aSign = extractFloat64Sign( a );
3039 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3040 shiftCount = 0x433 - aExp;
3041 if ( shiftCount <= 0 ) {
3042 if ( 0x43E < aExp ) {
3043 float_raise( float_flag_invalid STATUS_VAR);
3044 if ( ! aSign
3045 || ( ( aExp == 0x7FF )
3046 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3047 ) {
3048 return LIT64( 0x7FFFFFFFFFFFFFFF );
3049 }
bb98fe42 3050 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3051 }
3052 aSigExtra = 0;
3053 aSig <<= - shiftCount;
3054 }
3055 else {
3056 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3057 }
3058 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3059
3060}
3061
3062/*----------------------------------------------------------------------------
3063| Returns the result of converting the double-precision floating-point value
3064| `a' to the 64-bit two's complement integer format. The conversion is
3065| performed according to the IEC/IEEE Standard for Binary Floating-Point
3066| Arithmetic, except that the conversion is always rounded toward zero.
3067| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3068| the conversion overflows, the largest integer with the same sign as `a' is
3069| returned.
3070*----------------------------------------------------------------------------*/
3071
3072int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3073{
3074 flag aSign;
94a49d86 3075 int_fast16_t aExp, shiftCount;
bb98fe42 3076 uint64_t aSig;
158142c2 3077 int64 z;
37d18660 3078 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3079
3080 aSig = extractFloat64Frac( a );
3081 aExp = extractFloat64Exp( a );
3082 aSign = extractFloat64Sign( a );
3083 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3084 shiftCount = aExp - 0x433;
3085 if ( 0 <= shiftCount ) {
3086 if ( 0x43E <= aExp ) {
f090c9d4 3087 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
158142c2
FB
3088 float_raise( float_flag_invalid STATUS_VAR);
3089 if ( ! aSign
3090 || ( ( aExp == 0x7FF )
3091 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3092 ) {
3093 return LIT64( 0x7FFFFFFFFFFFFFFF );
3094 }
3095 }
bb98fe42 3096 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3097 }
3098 z = aSig<<shiftCount;
3099 }
3100 else {
3101 if ( aExp < 0x3FE ) {
3102 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3103 return 0;
3104 }
3105 z = aSig>>( - shiftCount );
bb98fe42 3106 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
3107 STATUS(float_exception_flags) |= float_flag_inexact;
3108 }
3109 }
3110 if ( aSign ) z = - z;
3111 return z;
3112
3113}
3114
3115/*----------------------------------------------------------------------------
3116| Returns the result of converting the double-precision floating-point value
3117| `a' to the single-precision floating-point format. The conversion is
3118| performed according to the IEC/IEEE Standard for Binary Floating-Point
3119| Arithmetic.
3120*----------------------------------------------------------------------------*/
3121
3122float32 float64_to_float32( float64 a STATUS_PARAM )
3123{
3124 flag aSign;
94a49d86 3125 int_fast16_t aExp;
bb98fe42
AF
3126 uint64_t aSig;
3127 uint32_t zSig;
37d18660 3128 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3129
3130 aSig = extractFloat64Frac( a );
3131 aExp = extractFloat64Exp( a );
3132 aSign = extractFloat64Sign( a );
3133 if ( aExp == 0x7FF ) {
bcd4d9af 3134 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3135 return packFloat32( aSign, 0xFF, 0 );
3136 }
3137 shift64RightJamming( aSig, 22, &aSig );
3138 zSig = aSig;
3139 if ( aExp || zSig ) {
3140 zSig |= 0x40000000;
3141 aExp -= 0x381;
3142 }
3143 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3144
3145}
3146
60011498
PB
3147
3148/*----------------------------------------------------------------------------
3149| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3150| half-precision floating-point value, returning the result. After being
3151| shifted into the proper positions, the three fields are simply added
3152| together to form the result. This means that any integer portion of `zSig'
3153| will be added into the exponent. Since a properly normalized significand
3154| will have an integer portion equal to 1, the `zExp' input should be 1 less
3155| than the desired result exponent whenever `zSig' is a complete, normalized
3156| significand.
3157*----------------------------------------------------------------------------*/
94a49d86 3158static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
60011498 3159{
bb4d4bb3 3160 return make_float16(
bb98fe42 3161 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3162}
3163
c4a1c5e7
PM
3164/*----------------------------------------------------------------------------
3165| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3166| and significand `zSig', and returns the proper half-precision floating-
3167| point value corresponding to the abstract input. Ordinarily, the abstract
3168| value is simply rounded and packed into the half-precision format, with
3169| the inexact exception raised if the abstract input cannot be represented
3170| exactly. However, if the abstract value is too large, the overflow and
3171| inexact exceptions are raised and an infinity or maximal finite value is
3172| returned. If the abstract value is too small, the input value is rounded to
3173| a subnormal number, and the underflow and inexact exceptions are raised if
3174| the abstract input cannot be represented exactly as a subnormal half-
3175| precision floating-point number.
3176| The `ieee' flag indicates whether to use IEEE standard half precision, or
3177| ARM-style "alternative representation", which omits the NaN and Inf
3178| encodings in order to raise the maximum representable exponent by one.
3179| The input significand `zSig' has its binary point between bits 22
3180| and 23, which is 13 bits to the left of the usual location. This shifted
3181| significand must be normalized or smaller. If `zSig' is not normalized,
3182| `zExp' must be 0; in that case, the result returned is a subnormal number,
3183| and it must not require rounding. In the usual case that `zSig' is
3184| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3185| Note the slightly odd position of the binary point in zSig compared with the
3186| other roundAndPackFloat functions. This should probably be fixed if we
3187| need to implement more float16 routines than just conversion.
3188| The handling of underflow and overflow follows the IEC/IEEE Standard for
3189| Binary Floating-Point Arithmetic.
3190*----------------------------------------------------------------------------*/
3191
3192static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3193 uint32_t zSig, flag ieee STATUS_PARAM)
3194{
3195 int maxexp = ieee ? 29 : 30;
3196 uint32_t mask;
3197 uint32_t increment;
c4a1c5e7
PM
3198 bool rounding_bumps_exp;
3199 bool is_tiny = false;
3200
3201 /* Calculate the mask of bits of the mantissa which are not
3202 * representable in half-precision and will be lost.
3203 */
3204 if (zExp < 1) {
3205 /* Will be denormal in halfprec */
3206 mask = 0x00ffffff;
3207 if (zExp >= -11) {
3208 mask >>= 11 + zExp;
3209 }
3210 } else {
3211 /* Normal number in halfprec */
3212 mask = 0x00001fff;
3213 }
3214
dc355b76 3215 switch (STATUS(float_rounding_mode)) {
c4a1c5e7
PM
3216 case float_round_nearest_even:
3217 increment = (mask + 1) >> 1;
3218 if ((zSig & mask) == increment) {
3219 increment = zSig & (increment << 1);
3220 }
3221 break;
f9288a76
PM
3222 case float_round_ties_away:
3223 increment = (mask + 1) >> 1;
3224 break;
c4a1c5e7
PM
3225 case float_round_up:
3226 increment = zSign ? 0 : mask;
3227 break;
3228 case float_round_down:
3229 increment = zSign ? mask : 0;
3230 break;
3231 default: /* round_to_zero */
3232 increment = 0;
3233 break;
3234 }
3235
3236 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3237
3238 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3239 if (ieee) {
3240 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3241 return packFloat16(zSign, 0x1f, 0);
3242 } else {
3243 float_raise(float_flag_invalid STATUS_VAR);
3244 return packFloat16(zSign, 0x1f, 0x3ff);
3245 }
3246 }
3247
3248 if (zExp < 0) {
3249 /* Note that flush-to-zero does not affect half-precision results */
3250 is_tiny =
3251 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3252 || (zExp < -1)
3253 || (!rounding_bumps_exp);
3254 }
3255 if (zSig & mask) {
3256 float_raise(float_flag_inexact STATUS_VAR);
3257 if (is_tiny) {
3258 float_raise(float_flag_underflow STATUS_VAR);
3259 }
3260 }
3261
3262 zSig += increment;
3263 if (rounding_bumps_exp) {
3264 zSig >>= 1;
3265 zExp++;
3266 }
3267
3268 if (zExp < -10) {
3269 return packFloat16(zSign, 0, 0);
3270 }
3271 if (zExp < 0) {
3272 zSig >>= -zExp;
3273 zExp = 0;
3274 }
3275 return packFloat16(zSign, zExp, zSig >> 13);
3276}
3277
/* Normalizes the half-precision subnormal significand `aSig': the value is
 * shifted left by (leading zero count - 21), and the shifted significand and
 * matching exponent (1 - shift) are stored through `zSigPtr' and `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t shift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3285
60011498
PB
3286/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3287 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3
PM
3288
3289float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
60011498
PB
3290{
3291 flag aSign;
94a49d86 3292 int_fast16_t aExp;
bb98fe42 3293 uint32_t aSig;
60011498 3294
bb4d4bb3
PM
3295 aSign = extractFloat16Sign(a);
3296 aExp = extractFloat16Exp(a);
3297 aSig = extractFloat16Frac(a);
60011498
PB
3298
3299 if (aExp == 0x1f && ieee) {
3300 if (aSig) {
f591e1be 3301 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3302 }
4be8eeac 3303 return packFloat32(aSign, 0xff, 0);
60011498
PB
3304 }
3305 if (aExp == 0) {
60011498
PB
3306 if (aSig == 0) {
3307 return packFloat32(aSign, 0, 0);
3308 }
3309
c4a1c5e7
PM
3310 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3311 aExp--;
60011498
PB
3312 }
3313 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3314}
3315
bb4d4bb3 3316float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
60011498
PB
3317{
3318 flag aSign;
94a49d86 3319 int_fast16_t aExp;
bb98fe42 3320 uint32_t aSig;
38970efa 3321
37d18660 3322 a = float32_squash_input_denormal(a STATUS_VAR);
60011498
PB
3323
3324 aSig = extractFloat32Frac( a );
3325 aExp = extractFloat32Exp( a );
3326 aSign = extractFloat32Sign( a );
3327 if ( aExp == 0xFF ) {
3328 if (aSig) {
600e30d2 3329 /* Input is a NaN */
600e30d2 3330 if (!ieee) {
38970efa 3331 float_raise(float_flag_invalid STATUS_VAR);
600e30d2
PM
3332 return packFloat16(aSign, 0, 0);
3333 }
38970efa
PM
3334 return commonNaNToFloat16(
3335 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3336 }
600e30d2
PM
3337 /* Infinity */
3338 if (!ieee) {
3339 float_raise(float_flag_invalid STATUS_VAR);
3340 return packFloat16(aSign, 0x1f, 0x3ff);
3341 }
3342 return packFloat16(aSign, 0x1f, 0);
60011498 3343 }
600e30d2 3344 if (aExp == 0 && aSig == 0) {
60011498
PB
3345 return packFloat16(aSign, 0, 0);
3346 }
38970efa
PM
3347 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3348 * even if the input is denormal; however this is harmless because
3349 * the largest possible single-precision denormal is still smaller
3350 * than the smallest representable half-precision denormal, and so we
3351 * will end up ignoring aSig and returning via the "always return zero"
3352 * codepath.
3353 */
60011498 3354 aSig |= 0x00800000;
c4a1c5e7 3355 aExp -= 0x71;
60011498 3356
c4a1c5e7 3357 return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
60011498
PB
3358}
3359
14c9a07e
PM
3360float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3361{
3362 flag aSign;
3363 int_fast16_t aExp;
3364 uint32_t aSig;
3365
3366 aSign = extractFloat16Sign(a);
3367 aExp = extractFloat16Exp(a);
3368 aSig = extractFloat16Frac(a);
3369
3370 if (aExp == 0x1f && ieee) {
3371 if (aSig) {
3372 return commonNaNToFloat64(
3373 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3374 }
3375 return packFloat64(aSign, 0x7ff, 0);
3376 }
3377 if (aExp == 0) {
3378 if (aSig == 0) {
3379 return packFloat64(aSign, 0, 0);
3380 }
3381
3382 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3383 aExp--;
3384 }
3385 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3386}
3387
3388float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
3389{
3390 flag aSign;
3391 int_fast16_t aExp;
3392 uint64_t aSig;
3393 uint32_t zSig;
3394
3395 a = float64_squash_input_denormal(a STATUS_VAR);
3396
3397 aSig = extractFloat64Frac(a);
3398 aExp = extractFloat64Exp(a);
3399 aSign = extractFloat64Sign(a);
3400 if (aExp == 0x7FF) {
3401 if (aSig) {
3402 /* Input is a NaN */
3403 if (!ieee) {
3404 float_raise(float_flag_invalid STATUS_VAR);
3405 return packFloat16(aSign, 0, 0);
3406 }
3407 return commonNaNToFloat16(
3408 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3409 }
3410 /* Infinity */
3411 if (!ieee) {
3412 float_raise(float_flag_invalid STATUS_VAR);
3413 return packFloat16(aSign, 0x1f, 0x3ff);
3414 }
3415 return packFloat16(aSign, 0x1f, 0);
3416 }
3417 shift64RightJamming(aSig, 29, &aSig);
3418 zSig = aSig;
3419 if (aExp == 0 && zSig == 0) {
3420 return packFloat16(aSign, 0, 0);
3421 }
3422 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3423 * even if the input is denormal; however this is harmless because
3424 * the largest possible single-precision denormal is still smaller
3425 * than the smallest representable half-precision denormal, and so we
3426 * will end up ignoring aSig and returning via the "always return zero"
3427 * codepath.
3428 */
3429 zSig |= 0x00800000;
3430 aExp -= 0x3F1;
3431
3432 return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3433}
3434
158142c2
FB
3435/*----------------------------------------------------------------------------
3436| Returns the result of converting the double-precision floating-point value
3437| `a' to the extended double-precision floating-point format. The conversion
3438| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3439| Arithmetic.
3440*----------------------------------------------------------------------------*/
3441
3442floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3443{
3444 flag aSign;
94a49d86 3445 int_fast16_t aExp;
bb98fe42 3446 uint64_t aSig;
158142c2 3447
37d18660 3448 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3449 aSig = extractFloat64Frac( a );
3450 aExp = extractFloat64Exp( a );
3451 aSign = extractFloat64Sign( a );
3452 if ( aExp == 0x7FF ) {
bcd4d9af 3453 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3454 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3455 }
3456 if ( aExp == 0 ) {
3457 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3458 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3459 }
3460 return
3461 packFloatx80(
3462 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3463
3464}
3465
158142c2
FB
3466/*----------------------------------------------------------------------------
3467| Returns the result of converting the double-precision floating-point value
3468| `a' to the quadruple-precision floating-point format. The conversion is
3469| performed according to the IEC/IEEE Standard for Binary Floating-Point
3470| Arithmetic.
3471*----------------------------------------------------------------------------*/
3472
3473float128 float64_to_float128( float64 a STATUS_PARAM )
3474{
3475 flag aSign;
94a49d86 3476 int_fast16_t aExp;
bb98fe42 3477 uint64_t aSig, zSig0, zSig1;
158142c2 3478
37d18660 3479 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3480 aSig = extractFloat64Frac( a );
3481 aExp = extractFloat64Exp( a );
3482 aSign = extractFloat64Sign( a );
3483 if ( aExp == 0x7FF ) {
bcd4d9af 3484 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3485 return packFloat128( aSign, 0x7FFF, 0, 0 );
3486 }
3487 if ( aExp == 0 ) {
3488 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3489 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3490 --aExp;
3491 }
3492 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3493 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3494
3495}
3496
158142c2
FB
3497/*----------------------------------------------------------------------------
3498| Rounds the double-precision floating-point value `a' to an integer, and
3499| returns the result as a double-precision floating-point value. The
3500| operation is performed according to the IEC/IEEE Standard for Binary
3501| Floating-Point Arithmetic.
3502*----------------------------------------------------------------------------*/
3503
3504float64 float64_round_to_int( float64 a STATUS_PARAM )
3505{
3506 flag aSign;
94a49d86 3507 int_fast16_t aExp;
bb98fe42 3508 uint64_t lastBitMask, roundBitsMask;
bb98fe42 3509 uint64_t z;
37d18660 3510 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3511
3512 aExp = extractFloat64Exp( a );
3513 if ( 0x433 <= aExp ) {
3514 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3515 return propagateFloat64NaN( a, a STATUS_VAR );
3516 }
3517 return a;
3518 }
3519 if ( aExp < 0x3FF ) {
bb98fe42 3520 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
158142c2
FB
3521 STATUS(float_exception_flags) |= float_flag_inexact;
3522 aSign = extractFloat64Sign( a );
3523 switch ( STATUS(float_rounding_mode) ) {
3524 case float_round_nearest_even:
3525 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3526 return packFloat64( aSign, 0x3FF, 0 );
3527 }
3528 break;
f9288a76
PM
3529 case float_round_ties_away:
3530 if (aExp == 0x3FE) {
3531 return packFloat64(aSign, 0x3ff, 0);
3532 }
3533 break;
158142c2 3534 case float_round_down:
f090c9d4 3535 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3536 case float_round_up:
f090c9d4
PB
3537 return make_float64(
3538 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3539 }
3540 return packFloat64( aSign, 0, 0 );
3541 }
3542 lastBitMask = 1;
3543 lastBitMask <<= 0x433 - aExp;
3544 roundBitsMask = lastBitMask - 1;
f090c9d4 3545 z = float64_val(a);
dc355b76
PM
3546 switch (STATUS(float_rounding_mode)) {
3547 case float_round_nearest_even:
3548 z += lastBitMask >> 1;
3549 if ((z & roundBitsMask) == 0) {
3550 z &= ~lastBitMask;
3551 }
3552 break;
f9288a76
PM
3553 case float_round_ties_away:
3554 z += lastBitMask >> 1;
3555 break;
dc355b76
PM
3556 case float_round_to_zero:
3557 break;
3558 case float_round_up:
3559 if (!extractFloat64Sign(make_float64(z))) {
3560 z += roundBitsMask;
3561 }
3562 break;
3563 case float_round_down:
3564 if (extractFloat64Sign(make_float64(z))) {
158142c2
FB
3565 z += roundBitsMask;
3566 }
dc355b76
PM
3567 break;
3568 default:
3569 abort();
158142c2
FB
3570 }
3571 z &= ~ roundBitsMask;
f090c9d4
PB
3572 if ( z != float64_val(a) )
3573 STATUS(float_exception_flags) |= float_flag_inexact;
3574 return make_float64(z);
158142c2
FB
3575
3576}
3577
e6e5906b
PB
3578float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3579{
3580 int oldmode;
3581 float64 res;
3582 oldmode = STATUS(float_rounding_mode);
3583 STATUS(float_rounding_mode) = float_round_to_zero;
3584 res = float64_round_to_int(a STATUS_VAR);
3585 STATUS(float_rounding_mode) = oldmode;
3586 return res;
3587}
3588
158142c2
FB
3589/*----------------------------------------------------------------------------
3590| Returns the result of adding the absolute values of the double-precision
3591| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3592| before being returned. `zSign' is ignored if the result is a NaN.
3593| The addition is performed according to the IEC/IEEE Standard for Binary
3594| Floating-Point Arithmetic.
3595*----------------------------------------------------------------------------*/
3596
3597static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3598{
94a49d86 3599 int_fast16_t aExp, bExp, zExp;
bb98fe42 3600 uint64_t aSig, bSig, zSig;
94a49d86 3601 int_fast16_t expDiff;
158142c2
FB
3602
3603 aSig = extractFloat64Frac( a );
3604 aExp = extractFloat64Exp( a );
3605 bSig = extractFloat64Frac( b );
3606 bExp = extractFloat64Exp( b );
3607 expDiff = aExp - bExp;
3608 aSig <<= 9;
3609 bSig <<= 9;
3610 if ( 0 < expDiff ) {
3611 if ( aExp == 0x7FF ) {
3612 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3613 return a;
3614 }
3615 if ( bExp == 0 ) {
3616 --expDiff;
3617 }
3618 else {
3619 bSig |= LIT64( 0x2000000000000000 );
3620 }
3621 shift64RightJamming( bSig, expDiff, &bSig );
3622 zExp = aExp;
3623 }
3624 else if ( expDiff < 0 ) {
3625 if ( bExp == 0x7FF ) {
3626 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3627 return packFloat64( zSign, 0x7FF, 0 );
3628 }
3629 if ( aExp == 0 ) {
3630 ++expDiff;
3631 }
3632 else {
3633 aSig |= LIT64( 0x2000000000000000 );
3634 }
3635 shift64RightJamming( aSig, - expDiff, &aSig );
3636 zExp = bExp;
3637 }
3638 else {
3639 if ( aExp == 0x7FF ) {
3640 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3641 return a;
3642 }
fe76d976 3643 if ( aExp == 0 ) {
e6afc87f
PM
3644 if (STATUS(flush_to_zero)) {
3645 if (aSig | bSig) {
3646 float_raise(float_flag_output_denormal STATUS_VAR);
3647 }
3648 return packFloat64(zSign, 0, 0);
3649 }
fe76d976
PB
3650 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3651 }
158142c2
FB
3652 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3653 zExp = aExp;
3654 goto roundAndPack;
3655 }
3656 aSig |= LIT64( 0x2000000000000000 );
3657 zSig = ( aSig + bSig )<<1;
3658 --zExp;
bb98fe42 3659 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3660 zSig = aSig + bSig;
3661 ++zExp;
3662 }
3663 roundAndPack:
3664 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3665
3666}
3667
3668/*----------------------------------------------------------------------------
3669| Returns the result of subtracting the absolute values of the double-
3670| precision floating-point values `a' and `b'. If `zSign' is 1, the
3671| difference is negated before being returned. `zSign' is ignored if the
3672| result is a NaN. The subtraction is performed according to the IEC/IEEE
3673| Standard for Binary Floating-Point Arithmetic.
3674*----------------------------------------------------------------------------*/
3675
3676static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3677{
94a49d86 3678 int_fast16_t aExp, bExp, zExp;
bb98fe42 3679 uint64_t aSig, bSig, zSig;
94a49d86 3680 int_fast16_t expDiff;
158142c2
FB
3681
3682 aSig = extractFloat64Frac( a );
3683 aExp = extractFloat64Exp( a );
3684 bSig = extractFloat64Frac( b );
3685 bExp = extractFloat64Exp( b );
3686 expDiff = aExp - bExp;
3687 aSig <<= 10;
3688 bSig <<= 10;
3689 if ( 0 < expDiff ) goto aExpBigger;
3690 if ( expDiff < 0 ) goto bExpBigger;
3691 if ( aExp == 0x7FF ) {
3692 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3693 float_raise( float_flag_invalid STATUS_VAR);
3694 return float64_default_nan;
3695 }
3696 if ( aExp == 0 ) {
3697 aExp = 1;
3698 bExp = 1;
3699 }
3700 if ( bSig < aSig ) goto aBigger;
3701 if ( aSig < bSig ) goto bBigger;
3702 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3703 bExpBigger:
3704 if ( bExp == 0x7FF ) {
3705 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3706 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3707 }
3708 if ( aExp == 0 ) {
3709 ++expDiff;
3710 }
3711 else {
3712 aSig |= LIT64( 0x4000000000000000 );
3713 }
3714 shift64RightJamming( aSig, - expDiff, &aSig );
3715 bSig |= LIT64( 0x4000000000000000 );
3716 bBigger:
3717 zSig = bSig - aSig;
3718 zExp = bExp;
3719 zSign ^= 1;
3720 goto normalizeRoundAndPack;
3721 aExpBigger:
3722 if ( aExp == 0x7FF ) {
3723 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3724 return a;
3725 }
3726 if ( bExp == 0 ) {
3727 --expDiff;
3728 }
3729 else {
3730 bSig |= LIT64( 0x4000000000000000 );
3731 }
3732 shift64RightJamming( bSig, expDiff, &bSig );
3733 aSig |= LIT64( 0x4000000000000000 );
3734 aBigger:
3735 zSig = aSig - bSig;
3736 zExp = aExp;
3737 normalizeRoundAndPack:
3738 --zExp;
3739 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3740
3741}
3742
3743/*----------------------------------------------------------------------------
3744| Returns the result of adding the double-precision floating-point values `a'
3745| and `b'. The operation is performed according to the IEC/IEEE Standard for
3746| Binary Floating-Point Arithmetic.
3747*----------------------------------------------------------------------------*/
3748
3749float64 float64_add( float64 a, float64 b STATUS_PARAM )
3750{
3751 flag aSign, bSign;
37d18660
PM
3752 a = float64_squash_input_denormal(a STATUS_VAR);
3753 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3754
3755 aSign = extractFloat64Sign( a );
3756 bSign = extractFloat64Sign( b );
3757 if ( aSign == bSign ) {
3758 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3759 }
3760 else {
3761 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3762 }
3763
3764}
3765
3766/*----------------------------------------------------------------------------
3767| Returns the result of subtracting the double-precision floating-point values
3768| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3769| for Binary Floating-Point Arithmetic.
3770*----------------------------------------------------------------------------*/
3771
3772float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3773{
3774 flag aSign, bSign;
37d18660
PM
3775 a = float64_squash_input_denormal(a STATUS_VAR);
3776 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3777
3778 aSign = extractFloat64Sign( a );
3779 bSign = extractFloat64Sign( b );
3780 if ( aSign == bSign ) {
3781 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3782 }
3783 else {
3784 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3785 }
3786
3787}
3788
3789/*----------------------------------------------------------------------------
3790| Returns the result of multiplying the double-precision floating-point values
3791| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3792| for Binary Floating-Point Arithmetic.
3793*----------------------------------------------------------------------------*/
3794
3795float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3796{
3797 flag aSign, bSign, zSign;
94a49d86 3798 int_fast16_t aExp, bExp, zExp;
bb98fe42 3799 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3800
37d18660
PM
3801 a = float64_squash_input_denormal(a STATUS_VAR);
3802 b = float64_squash_input_denormal(b STATUS_VAR);
3803
158142c2
FB
3804 aSig = extractFloat64Frac( a );
3805 aExp = extractFloat64Exp( a );
3806 aSign = extractFloat64Sign( a );
3807 bSig = extractFloat64Frac( b );
3808 bExp = extractFloat64Exp( b );
3809 bSign = extractFloat64Sign( b );
3810 zSign = aSign ^ bSign;
3811 if ( aExp == 0x7FF ) {
3812 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3813 return propagateFloat64NaN( a, b STATUS_VAR );
3814 }
3815 if ( ( bExp | bSig ) == 0 ) {
3816 float_raise( float_flag_invalid STATUS_VAR);
3817 return float64_default_nan;
3818 }
3819 return packFloat64( zSign, 0x7FF, 0 );
3820 }
3821 if ( bExp == 0x7FF ) {
3822 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3823 if ( ( aExp | aSig ) == 0 ) {
3824 float_raise( float_flag_invalid STATUS_VAR);
3825 return float64_default_nan;
3826 }
3827 return packFloat64( zSign, 0x7FF, 0 );
3828 }
3829 if ( aExp == 0 ) {
3830 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3831 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3832 }
3833 if ( bExp == 0 ) {
3834 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3835 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3836 }
3837 zExp = aExp + bExp - 0x3FF;
3838 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3839 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3840 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3841 zSig0 |= ( zSig1 != 0 );
bb98fe42 3842 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
3843 zSig0 <<= 1;
3844 --zExp;
3845 }
3846 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3847
3848}
3849
3850/*----------------------------------------------------------------------------
3851| Returns the result of dividing the double-precision floating-point value `a'
3852| by the corresponding value `b'. The operation is performed according to
3853| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3854*----------------------------------------------------------------------------*/
3855
3856float64 float64_div( float64 a, float64 b STATUS_PARAM )
3857{
3858 flag aSign, bSign, zSign;
94a49d86 3859 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
3860 uint64_t aSig, bSig, zSig;
3861 uint64_t rem0, rem1;
3862 uint64_t term0, term1;
37d18660
PM
3863 a = float64_squash_input_denormal(a STATUS_VAR);
3864 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3865
3866 aSig = extractFloat64Frac( a );
3867 aExp = extractFloat64Exp( a );
3868 aSign = extractFloat64Sign( a );
3869 bSig = extractFloat64Frac( b );
3870 bExp = extractFloat64Exp( b );
3871 bSign = extractFloat64Sign( b );
3872 zSign = aSign ^ bSign;
3873 if ( aExp == 0x7FF ) {
3874 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3875 if ( bExp == 0x7FF ) {
3876 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3877 float_raise( float_flag_invalid STATUS_VAR);
3878 return float64_default_nan;
3879 }
3880 return packFloat64( zSign, 0x7FF, 0 );
3881 }
3882 if ( bExp == 0x7FF ) {
3883 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3884 return packFloat64( zSign, 0, 0 );
3885 }
3886 if ( bExp == 0 ) {
3887 if ( bSig == 0 ) {
3888 if ( ( aExp | aSig ) == 0 ) {
3889 float_raise( float_flag_invalid STATUS_VAR);
3890 return float64_default_nan;
3891 }
3892 float_raise( float_flag_divbyzero STATUS_VAR);
3893 return packFloat64( zSign, 0x7FF, 0 );
3894 }
3895 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3896 }
3897 if ( aExp == 0 ) {
3898 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3899 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3900 }
3901 zExp = aExp - bExp + 0x3FD;
3902 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3903 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3904 if ( bSig <= ( aSig + aSig ) ) {
3905 aSig >>= 1;
3906 ++zExp;
3907 }
3908 zSig = estimateDiv128To64( aSig, 0, bSig );
3909 if ( ( zSig & 0x1FF ) <= 2 ) {
3910 mul64To128( bSig, zSig, &term0, &term1 );
3911 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3912 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3913 --zSig;
3914 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3915 }
3916 zSig |= ( rem1 != 0 );
3917 }
3918 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3919
3920}
3921
3922/*----------------------------------------------------------------------------
3923| Returns the remainder of the double-precision floating-point value `a'
3924| with respect to the corresponding value `b'. The operation is performed
3925| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3926*----------------------------------------------------------------------------*/
3927
3928float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3929{
ed086f3d 3930 flag aSign, zSign;
94a49d86 3931 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
3932 uint64_t aSig, bSig;
3933 uint64_t q, alternateASig;
3934 int64_t sigMean;
158142c2 3935
37d18660
PM
3936 a = float64_squash_input_denormal(a STATUS_VAR);
3937 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3938 aSig = extractFloat64Frac( a );
3939 aExp = extractFloat64Exp( a );
3940 aSign = extractFloat64Sign( a );
3941 bSig = extractFloat64Frac( b );
3942 bExp = extractFloat64Exp( b );
158142c2
FB
3943 if ( aExp == 0x7FF ) {
3944 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3945 return propagateFloat64NaN( a, b STATUS_VAR );
3946 }
3947 float_raise( float_flag_invalid STATUS_VAR);
3948 return float64_default_nan;
3949 }
3950 if ( bExp == 0x7FF ) {
3951 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3952 return a;
3953 }
3954 if ( bExp == 0 ) {
3955 if ( bSig == 0 ) {
3956 float_raise( float_flag_invalid STATUS_VAR);
3957 return float64_default_nan;
3958 }
3959 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3960 }
3961 if ( aExp == 0 ) {
3962 if ( aSig == 0 ) return a;
3963 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3964 }
3965 expDiff = aExp - bExp;
3966 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3967 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3968 if ( expDiff < 0 ) {
3969 if ( expDiff < -1 ) return a;
3970 aSig >>= 1;
3971 }
3972 q = ( bSig <= aSig );
3973 if ( q ) aSig -= bSig;
3974 expDiff -= 64;
3975 while ( 0 < expDiff ) {
3976 q = estimateDiv128To64( aSig, 0, bSig );
3977 q = ( 2 < q ) ? q - 2 : 0;
3978 aSig = - ( ( bSig>>2 ) * q );
3979 expDiff -= 62;
3980 }
3981 expDiff += 64;
3982 if ( 0 < expDiff ) {
3983 q = estimateDiv128To64( aSig, 0, bSig );
3984 q = ( 2 < q ) ? q - 2 : 0;
3985 q >>= 64 - expDiff;
3986 bSig >>= 2;
3987 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3988 }
3989 else {
3990 aSig >>= 2;
3991 bSig >>= 2;
3992 }
3993 do {
3994 alternateASig = aSig;
3995 ++q;
3996 aSig -= bSig;
bb98fe42 3997 } while ( 0 <= (int64_t) aSig );
158142c2
FB
3998 sigMean = aSig + alternateASig;
3999 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4000 aSig = alternateASig;
4001 }
bb98fe42 4002 zSign = ( (int64_t) aSig < 0 );
158142c2
FB
4003 if ( zSign ) aSig = - aSig;
4004 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
4005
4006}
4007
369be8f6
PM
4008/*----------------------------------------------------------------------------
4009| Returns the result of multiplying the double-precision floating-point values
4010| `a' and `b' then adding 'c', with no intermediate rounding step after the
4011| multiplication. The operation is performed according to the IEC/IEEE
4012| Standard for Binary Floating-Point Arithmetic 754-2008.
4013| The flags argument allows the caller to select negation of the
4014| addend, the intermediate product, or the final result. (The difference
4015| between this and having the caller do a separate negation is that negating
4016| externally will flip the sign bit on NaNs.)
4017*----------------------------------------------------------------------------*/
4018
4019float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
4020{
4021 flag aSign, bSign, cSign, zSign;
94a49d86 4022 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
4023 uint64_t aSig, bSig, cSig;
4024 flag pInf, pZero, pSign;
4025 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4026 int shiftcount;
4027 flag signflip, infzero;
4028
4029 a = float64_squash_input_denormal(a STATUS_VAR);
4030 b = float64_squash_input_denormal(b STATUS_VAR);
4031 c = float64_squash_input_denormal(c STATUS_VAR);
4032 aSig = extractFloat64Frac(a);
4033 aExp = extractFloat64Exp(a);
4034 aSign = extractFloat64Sign(a);
4035 bSig = extractFloat64Frac(b);
4036 bExp = extractFloat64Exp(b);
4037 bSign = extractFloat64Sign(b);
4038 cSig = extractFloat64Frac(c);
4039 cExp = extractFloat64Exp(c);
4040 cSign = extractFloat64Sign(c);
4041
4042 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4043 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4044
4045 /* It is implementation-defined whether the cases of (0,inf,qnan)
4046 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4047 * they return if they do), so we have to hand this information
4048 * off to the target-specific pick-a-NaN routine.
4049 */
4050 if (((aExp == 0x7ff) && aSig) ||
4051 ((bExp == 0x7ff) && bSig) ||
4052 ((cExp == 0x7ff) && cSig)) {
4053 return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
4054 }
4055
4056 if (infzero) {
4057 float_raise(float_flag_invalid STATUS_VAR);
4058 return float64_default_nan;
4059 }
4060
4061 if (flags & float_muladd_negate_c) {
4062 cSign ^= 1;
4063 }
4064
4065 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4066
4067 /* Work out the sign and type of the product */
4068 pSign = aSign ^ bSign;
4069 if (flags & float_muladd_negate_product) {
4070 pSign ^= 1;
4071 }
4072 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4073 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4074
4075 if (cExp == 0x7ff) {
4076 if (pInf && (pSign ^ cSign)) {
4077 /* addition of opposite-signed infinities => InvalidOperation */
4078 float_raise(float_flag_invalid STATUS_VAR);
4079 return float64_default_nan;
4080 }
4081 /* Otherwise generate an infinity of the same sign */
4082 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4083 }
4084
4085 if (pInf) {
4086 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4087 }
4088
4089 if (pZero) {
4090 if (cExp == 0) {
4091 if (cSig == 0) {
4092 /* Adding two exact zeroes */
4093 if (pSign == cSign) {
4094 zSign = pSign;
4095 } else if (STATUS(float_rounding_mode) == float_round_down) {
4096 zSign = 1;
4097 } else {
4098 zSign = 0;
4099 }
4100 return packFloat64(zSign ^ signflip, 0, 0);
4101 }
4102 /* Exact zero plus a denorm */
4103 if (STATUS(flush_to_zero)) {
4104 float_raise(float_flag_output_denormal STATUS_VAR);
4105 return packFloat64(cSign ^ signflip, 0, 0);
4106 }
4107 }
4108 /* Zero plus something non-zero : just return the something */
67d43538
PM
4109 if (flags & float_muladd_halve_result) {
4110 if (cExp == 0) {
4111 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4112 }
4113 /* Subtract one to halve, and one again because roundAndPackFloat64
4114 * wants one less than the true exponent.
4115 */
4116 cExp -= 2;
4117 cSig = (cSig | 0x0010000000000000ULL) << 10;
4118 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig STATUS_VAR);
4119 }
a6e7c184 4120 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
4121 }
4122
4123 if (aExp == 0) {
4124 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4125 }
4126 if (bExp == 0) {
4127 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4128 }
4129
4130 /* Calculate the actual result a * b + c */
4131
4132 /* Multiply first; this is easy. */
4133 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4134 * because we want the true exponent, not the "one-less-than"
4135 * flavour that roundAndPackFloat64() takes.
4136 */
4137 pExp = aExp + bExp - 0x3fe;
4138 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4139 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4140 mul64To128(aSig, bSig, &pSig0, &pSig1);
4141 if ((int64_t)(pSig0 << 1) >= 0) {
4142 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4143 pExp--;
4144 }
4145
4146 zSign = pSign ^ signflip;
4147
4148 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4149 * bit in position 126.
4150 */
4151 if (cExp == 0) {
4152 if (!cSig) {
4153 /* Throw out the special case of c being an exact zero now */
4154 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
67d43538
PM
4155 if (flags & float_muladd_halve_result) {
4156 pExp--;
4157 }
369be8f6
PM
4158 return roundAndPackFloat64(zSign, pExp - 1,
4159 pSig1 STATUS_VAR);
4160 }
4161 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4162 }
4163
4164 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4165 * significand of the addend, with the explicit bit in position 126.
4166 */
4167 cSig0 = cSig << (126 - 64 - 52);
4168 cSig1 = 0;
4169 cSig0 |= LIT64(0x4000000000000000);
4170 expDiff = pExp - cExp;
4171
4172 if (pSign == cSign) {
4173 /* Addition */
4174 if (expDiff > 0) {
4175 /* scale c to match p */
4176 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4177 zExp = pExp;
4178 } else if (expDiff < 0) {
4179 /* scale p to match c */
4180 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4181 zExp = cExp;
4182 } else {
4183 /* no scaling needed */
4184 zExp = cExp;
4185 }
4186 /* Add significands and make sure explicit bit ends up in posn 126 */
4187 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4188 if ((int64_t)zSig0 < 0) {
4189 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4190 } else {
4191 zExp--;
4192 }
4193 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
67d43538
PM
4194 if (flags & float_muladd_halve_result) {
4195 zExp--;
4196 }
369be8f6
PM
4197 return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4198 } else {
4199 /* Subtraction */
4200 if (expDiff > 0) {
4201 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4202 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4203 zExp = pExp;
4204 } else if (expDiff < 0) {
4205 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4206 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4207 zExp = cExp;
4208 zSign ^= 1;
4209 } else {
4210 zExp = pExp;
4211 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4212 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4213 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4214 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4215 zSign ^= 1;
4216 } else {
4217 /* Exact zero */
4218 zSign = signflip;
4219 if (STATUS(float_rounding_mode) == float_round_down) {
4220 zSign ^= 1;
4221 }
4222 return packFloat64(zSign, 0, 0);
4223 }
4224 }
4225 --zExp;
4226 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4227 * starting with the significand in a pair of uint64_t.
4228 */
4229 if (zSig0) {
4230 shiftcount = countLeadingZeros64(zSig0) - 1;
4231 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4232 if (zSig1) {
4233 zSig0 |= 1;
4234 }
4235 zExp -= shiftcount;
4236 } else {
e3d142d0
PM
4237 shiftcount = countLeadingZeros64(zSig1);
4238 if (shiftcount == 0) {
4239 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4240 zExp -= 63;
4241 } else {
4242 shiftcount--;
4243 zSig0 = zSig1 << shiftcount;
4244 zExp -= (shiftcount + 64);
4245 }
369be8f6 4246 }
67d43538
PM
4247 if (flags & float_muladd_halve_result) {
4248 zExp--;
4249 }
369be8f6
PM
4250 return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4251 }
4252}
4253
158142c2
FB
4254/*----------------------------------------------------------------------------
4255| Returns the square root of the double-precision floating-point value `a'.
4256| The operation is performed according to the IEC/IEEE Standard for Binary
4257| Floating-Point Arithmetic.
4258*----------------------------------------------------------------------------*/
4259
4260float64 float64_sqrt( float64 a STATUS_PARAM )
4261{
4262 flag aSign;
94a49d86 4263 int_fast16_t aExp, zExp;
bb98fe42
AF
4264 uint64_t aSig, zSig, doubleZSig;
4265 uint64_t rem0, rem1, term0, term1;
37d18660 4266 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
4267
4268 aSig = extractFloat64Frac( a );
4269 aExp = extractFloat64Exp( a );
4270 aSign = extractFloat64Sign( a );
4271 if ( aExp == 0x7FF ) {
4272 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4273 if ( ! aSign ) return a;
4274 float_raise( float_flag_invalid STATUS_VAR);
4275 return float64_default_nan;
4276 }
4277 if ( aSign ) {
4278 if ( ( aExp | aSig ) == 0 ) return a;
4279 float_raise( float_flag_invalid STATUS_VAR);
4280 return float64_default_nan;
4281 }
4282 if ( aExp == 0 ) {
f090c9d4 4283 if ( aSig == 0 ) return float64_zero;
158142c2
FB
4284 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4285 }
4286 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4287 aSig |= LIT64( 0x0010000000000000 );
4288 zSig = estimateSqrt32( aExp, aSig>>21 );
4289 aSig <<= 9 - ( aExp & 1 );
4290 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4291 if ( ( zSig & 0x1FF ) <= 5 ) {
4292 doubleZSig = zSig<<1;
4293 mul64To128( zSig, zSig, &term0, &term1 );
4294 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4295 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4296 --zSig;
4297 doubleZSig -= 2;
4298 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4299 }
4300 zSig |= ( ( rem0 | rem1 ) != 0 );
4301 }
4302 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4303
4304}
4305
374dfc33
AJ
4306/*----------------------------------------------------------------------------
4307| Returns the binary log of the double-precision floating-point value `a'.
4308| The operation is performed according to the IEC/IEEE Standard for Binary
4309| Floating-Point Arithmetic.
4310*----------------------------------------------------------------------------*/
4311float64 float64_log2( float64 a STATUS_PARAM )
4312{
4313 flag aSign, zSign;
94a49d86 4314 int_fast16_t aExp;
bb98fe42 4315 uint64_t aSig, aSig0, aSig1, zSig, i;
37d18660 4316 a = float64_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
4317
4318 aSig = extractFloat64Frac( a );
4319 aExp = extractFloat64Exp( a );
4320 aSign = extractFloat64Sign( a );
4321
4322 if ( aExp == 0 ) {
4323 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4324 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4325 }
4326 if ( aSign ) {
4327 float_raise( float_flag_invalid STATUS_VAR);
4328 return float64_default_nan;
4329 }
4330 if ( aExp == 0x7FF ) {
4331 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4332 return a;
4333 }
4334
4335 aExp -= 0x3FF;
4336 aSig |= LIT64( 0x0010000000000000 );
4337 zSign = aExp < 0;
bb98fe42 4338 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4339 for (i = 1LL << 51; i > 0; i >>= 1) {
4340 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4341 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4342 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4343 aSig >>= 1;
4344 zSig |= i;
4345 }
4346 }
4347
4348 if ( zSign )
4349 zSig = -zSig;
4350 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4351}
4352
158142c2
FB
4353/*----------------------------------------------------------------------------
4354| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4355| corresponding value `b', and 0 otherwise. The invalid exception is raised
4356| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4357| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4358*----------------------------------------------------------------------------*/
4359
b689362d 4360int float64_eq( float64 a, float64 b STATUS_PARAM )
158142c2 4361{
bb98fe42 4362 uint64_t av, bv;
37d18660
PM
4363 a = float64_squash_input_denormal(a STATUS_VAR);
4364 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4365
4366 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4367 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4368 ) {
b689362d 4369 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
4370 return 0;
4371 }
f090c9d4 4372 av = float64_val(a);
a1b91bb4 4373 bv = float64_val(b);
bb98fe42 4374 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4375
4376}
4377
4378/*----------------------------------------------------------------------------
4379| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4380| equal to the corresponding value `b', and 0 otherwise. The invalid
4381| exception is raised if either operand is a NaN. The comparison is performed
4382| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4383*----------------------------------------------------------------------------*/
4384
750afe93 4385int float64_le( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4386{
4387 flag aSign, bSign;
bb98fe42 4388 uint64_t av, bv;
37d18660
PM
4389 a = float64_squash_input_denormal(a STATUS_VAR);
4390 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4391
4392 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4393 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4394 ) {
4395 float_raise( float_flag_invalid STATUS_VAR);
4396 return 0;
4397 }
4398 aSign = extractFloat64Sign( a );
4399 bSign = extractFloat64Sign( b );
f090c9d4 4400 av = float64_val(a);
a1b91bb4 4401 bv = float64_val(b);
bb98fe42 4402 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4403 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4404
4405}
4406
4407/*----------------------------------------------------------------------------
4408| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4409| the corresponding value `b', and 0 otherwise. The invalid exception is
4410| raised if either operand is a NaN. The comparison is performed according
4411| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4412*----------------------------------------------------------------------------*/
4413
750afe93 4414int float64_lt( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4415{
4416 flag aSign, bSign;
bb98fe42 4417 uint64_t av, bv;
158142c2 4418
37d18660
PM
4419 a = float64_squash_input_denormal(a STATUS_VAR);
4420 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4421 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4422 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4423 ) {
4424 float_raise( float_flag_invalid STATUS_VAR);
4425 return 0;
4426 }
4427 aSign = extractFloat64Sign( a );
4428 bSign = extractFloat64Sign( b );
f090c9d4 4429 av = float64_val(a);
a1b91bb4 4430 bv = float64_val(b);
bb98fe42 4431 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4432 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4433
4434}
4435
67b7861d
AJ
4436/*----------------------------------------------------------------------------
4437| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4438| be compared, and 0 otherwise. The invalid exception is raised if either
4439| operand is a NaN. The comparison is performed according to the IEC/IEEE
4440| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4441*----------------------------------------------------------------------------*/
4442
4443int float64_unordered( float64 a, float64 b STATUS_PARAM )
4444{
4445 a = float64_squash_input_denormal(a STATUS_VAR);
4446 b = float64_squash_input_denormal(b STATUS_VAR);
4447
4448 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4449 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4450 ) {
4451 float_raise( float_flag_invalid STATUS_VAR);
4452 return 1;
4453 }
4454 return 0;
4455}
4456
158142c2
FB
4457/*----------------------------------------------------------------------------
4458| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4459| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4460| exception.The comparison is performed according to the IEC/IEEE Standard
4461| for Binary Floating-Point Arithmetic.
158142c2
FB
4462*----------------------------------------------------------------------------*/
4463
b689362d 4464int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
158142c2 4465{
bb98fe42 4466 uint64_t av, bv;
37d18660
PM
4467 a = float64_squash_input_denormal(a STATUS_VAR);
4468 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4469
4470 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4471 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4472 ) {
b689362d
AJ
4473 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4474 float_raise( float_flag_invalid STATUS_VAR);
4475 }
158142c2
FB
4476 return 0;
4477 }
f090c9d4 4478 av = float64_val(a);
a1b91bb4 4479 bv = float64_val(b);
bb98fe42 4480 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4481
4482}
4483
4484/*----------------------------------------------------------------------------
4485| Returns 1 if the double-precision floating-point value `a' is less than or
4486| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4487| cause an exception. Otherwise, the comparison is performed according to the
4488| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4489*----------------------------------------------------------------------------*/
4490
750afe93 4491int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4492{
4493 flag aSign, bSign;
bb98fe42 4494 uint64_t av, bv;
37d18660
PM
4495 a = float64_squash_input_denormal(a STATUS_VAR);
4496 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4497
4498 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4499 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4500 ) {
4501 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4502 float_raise( float_flag_invalid STATUS_VAR);
4503 }
4504 return 0;
4505 }
4506 aSign = extractFloat64Sign( a );
4507 bSign = extractFloat64Sign( b );
f090c9d4 4508 av = float64_val(a);
a1b91bb4 4509 bv = float64_val(b);
bb98fe42 4510 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4511 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4512
4513}
4514
4515/*----------------------------------------------------------------------------
4516| Returns 1 if the double-precision floating-point value `a' is less than
4517| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4518| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4519| Standard for Binary Floating-Point Arithmetic.
4520*----------------------------------------------------------------------------*/
4521
750afe93 4522int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4523{
4524 flag aSign, bSign;
bb98fe42 4525 uint64_t av, bv;
37d18660
PM
4526 a = float64_squash_input_denormal(a STATUS_VAR);
4527 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4528
4529 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4530 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4531 ) {
4532 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4533 float_raise( float_flag_invalid STATUS_VAR);
4534 }
4535 return 0;
4536 }
4537 aSign = extractFloat64Sign( a );
4538 bSign = extractFloat64Sign( b );
f090c9d4 4539 av = float64_val(a);
a1b91bb4 4540 bv = float64_val(b);
bb98fe42 4541 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4542 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4543
4544}
4545
67b7861d
AJ
4546/*----------------------------------------------------------------------------
4547| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4548| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4549| comparison is performed according to the IEC/IEEE Standard for Binary
4550| Floating-Point Arithmetic.
4551*----------------------------------------------------------------------------*/
4552
4553int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4554{
4555 a = float64_squash_input_denormal(a STATUS_VAR);
4556 b = float64_squash_input_denormal(b STATUS_VAR);
4557
4558 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4559 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4560 ) {
4561 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4562 float_raise( float_flag_invalid STATUS_VAR);
4563 }
4564 return 1;
4565 }
4566 return 0;
4567}
4568
158142c2
FB
4569/*----------------------------------------------------------------------------
4570| Returns the result of converting the extended double-precision floating-
4571| point value `a' to the 32-bit two's complement integer format. The
4572| conversion is performed according to the IEC/IEEE Standard for Binary
4573| Floating-Point Arithmetic---which means in particular that the conversion
4574| is rounded according to the current rounding mode. If `a' is a NaN, the
4575| largest positive integer is returned. Otherwise, if the conversion
4576| overflows, the largest integer with the same sign as `a' is returned.
4577*----------------------------------------------------------------------------*/
4578
4579int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4580{
4581 flag aSign;
4582 int32 aExp, shiftCount;
bb98fe42 4583 uint64_t aSig;
158142c2
FB
4584
4585 aSig = extractFloatx80Frac( a );
4586 aExp = extractFloatx80Exp( a );
4587 aSign = extractFloatx80Sign( a );
bb98fe42 4588 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4589 shiftCount = 0x4037 - aExp;
4590 if ( shiftCount <= 0 ) shiftCount = 1;
4591 shift64RightJamming( aSig, shiftCount, &aSig );
4592 return roundAndPackInt32( aSign, aSig STATUS_VAR );
4593
4594}
4595
4596/*----------------------------------------------------------------------------
4597| Returns the result of converting the extended double-precision floating-
4598| point value `a' to the 32-bit two's complement integer format. The
4599| conversion is performed according to the IEC/IEEE Standard for Binary
4600| Floating-Point Arithmetic, except that the conversion is always rounded
4601| toward zero. If `a' is a NaN, the largest positive integer is returned.
4602| Otherwise, if the conversion overflows, the largest integer with the same
4603| sign as `a' is returned.
4604*----------------------------------------------------------------------------*/
4605
4606int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4607{
4608 flag aSign;
4609 int32 aExp, shiftCount;
bb98fe42 4610 uint64_t aSig, savedASig;
b3a6a2e0 4611 int32_t z;
158142c2
FB
4612
4613 aSig = extractFloatx80Frac( a );
4614 aExp = extractFloatx80Exp( a );
4615 aSign = extractFloatx80Sign( a );
4616 if ( 0x401E < aExp ) {
bb98fe42 4617 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4618 goto invalid;
4619 }
4620 else if ( aExp < 0x3FFF ) {
4621 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4622 return 0;
4623 }
4624 shiftCount = 0x403E - aExp;
4625 savedASig = aSig;
4626 aSig >>= shiftCount;
4627 z = aSig;
4628 if ( aSign ) z = - z;
4629 if ( ( z < 0 ) ^ aSign ) {
4630 invalid:
4631 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 4632 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4633 }
4634 if ( ( aSig<<shiftCount ) != savedASig ) {
4635 STATUS(float_exception_flags) |= float_flag_inexact;
4636 }
4637 return z;
4638
4639}
4640
4641/*----------------------------------------------------------------------------
4642| Returns the result of converting the extended double-precision floating-
4643| point value `a' to the 64-bit two's complement integer format. The
4644| conversion is performed according to the IEC/IEEE Standard for Binary
4645| Floating-Point Arithmetic---which means in particular that the conversion
4646| is rounded according to the current rounding mode. If `a' is a NaN,
4647| the largest positive integer is returned. Otherwise, if the conversion
4648| overflows, the largest integer with the same sign as `a' is returned.
4649*----------------------------------------------------------------------------*/
4650
4651int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4652{
4653 flag aSign;
4654 int32 aExp, shiftCount;
bb98fe42 4655 uint64_t aSig, aSigExtra;
158142c2
FB
4656
4657 aSig = extractFloatx80Frac( a );
4658 aExp = extractFloatx80Exp( a );
4659 aSign = extractFloatx80Sign( a );
4660 shiftCount = 0x403E - aExp;
4661 if ( shiftCount <= 0 ) {
4662 if ( shiftCount ) {
4663 float_raise( float_flag_invalid STATUS_VAR);
4664 if ( ! aSign
4665 || ( ( aExp == 0x7FFF )
4666 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4667 ) {
4668 return LIT64( 0x7FFFFFFFFFFFFFFF );
4669 }
bb98fe42 4670 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4671 }
4672 aSigExtra = 0;
4673 }
4674 else {
4675 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4676 }
4677 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4678
4679}
4680
4681/*----------------------------------------------------------------------------
4682| Returns the result of converting the extended double-precision floating-
4683| point value `a' to the 64-bit two's complement integer format. The
4684| conversion is performed according to the IEC/IEEE Standard for Binary
4685| Floating-Point Arithmetic, except that the conversion is always rounded
4686| toward zero. If `a' is a NaN, the largest positive integer is returned.
4687| Otherwise, if the conversion overflows, the largest integer with the same
4688| sign as `a' is returned.
4689*----------------------------------------------------------------------------*/
4690
4691int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4692{
4693 flag aSign;
4694 int32 aExp, shiftCount;
bb98fe42 4695 uint64_t aSig;
158142c2
FB
4696 int64 z;
4697
4698 aSig = extractFloatx80Frac( a );
4699 aExp = extractFloatx80Exp( a );
4700 aSign = extractFloatx80Sign( a );
4701 shiftCount = aExp - 0x403E;
4702 if ( 0 <= shiftCount ) {
4703 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4704 if ( ( a.high != 0xC03E ) || aSig ) {
4705 float_raise( float_flag_invalid STATUS_VAR);
4706 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4707 return LIT64( 0x7FFFFFFFFFFFFFFF );
4708 }
4709 }
bb98fe42 4710 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4711 }
4712 else if ( aExp < 0x3FFF ) {
4713 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4714 return 0;
4715 }
4716 z = aSig>>( - shiftCount );
bb98fe42 4717 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
4718 STATUS(float_exception_flags) |= float_flag_inexact;
4719 }
4720 if ( aSign ) z = - z;
4721 return z;
4722
4723}
4724
4725/*----------------------------------------------------------------------------
4726| Returns the result of converting the extended double-precision floating-
4727| point value `a' to the single-precision floating-point format. The
4728| conversion is performed according to the IEC/IEEE Standard for Binary
4729| Floating-Point Arithmetic.
4730*----------------------------------------------------------------------------*/
4731
4732float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4733{
4734 flag aSign;
4735 int32 aExp;
bb98fe42 4736 uint64_t aSig;
158142c2
FB
4737
4738 aSig = extractFloatx80Frac( a );
4739 aExp = extractFloatx80Exp( a );
4740 aSign = extractFloatx80Sign( a );
4741 if ( aExp == 0x7FFF ) {
bb98fe42 4742 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4743 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4744 }
4745 return packFloat32( aSign, 0xFF, 0 );
4746 }
4747 shift64RightJamming( aSig, 33, &aSig );
4748 if ( aExp || aSig ) aExp -= 0x3F81;
4749 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4750
4751}
4752
4753/*----------------------------------------------------------------------------
4754| Returns the result of converting the extended double-precision floating-
4755| point value `a' to the double-precision floating-point format. The
4756| conversion is performed according to the IEC/IEEE Standard for Binary
4757| Floating-Point Arithmetic.
4758*----------------------------------------------------------------------------*/
4759
4760float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4761{
4762 flag aSign;
4763 int32 aExp;
bb98fe42 4764 uint64_t aSig, zSig;
158142c2
FB
4765
4766 aSig = extractFloatx80Frac( a );
4767 aExp = extractFloatx80Exp( a );
4768 aSign = extractFloatx80Sign( a );
4769 if ( aExp == 0x7FFF ) {
bb98fe42 4770 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4771 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4772 }
4773 return packFloat64( aSign, 0x7FF, 0 );
4774 }
4775 shift64RightJamming( aSig, 1, &zSig );
4776 if ( aExp || aSig ) aExp -= 0x3C01;
4777 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4778
4779}
4780
158142c2
FB
4781/*----------------------------------------------------------------------------
4782| Returns the result of converting the extended double-precision floating-
4783| point value `a' to the quadruple-precision floating-point format. The
4784| conversion is performed according to the IEC/IEEE Standard for Binary
4785| Floating-Point Arithmetic.
4786*----------------------------------------------------------------------------*/
4787
4788float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4789{
4790 flag aSign;
94a49d86 4791 int_fast16_t aExp;
bb98fe42 4792 uint64_t aSig, zSig0, zSig1;
158142c2
FB
4793
4794 aSig = extractFloatx80Frac( a );
4795 aExp = extractFloatx80Exp( a );
4796 aSign = extractFloatx80Sign( a );
bb98fe42 4797 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4798 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4799 }
4800 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4801 return packFloat128( aSign, aExp, zSig0, zSig1 );
4802
4803}
4804
158142c2
FB
4805/*----------------------------------------------------------------------------
4806| Rounds the extended double-precision floating-point value `a' to an integer,
4807| and returns the result as an extended quadruple-precision floating-point
4808| value. The operation is performed according to the IEC/IEEE Standard for
4809| Binary Floating-Point Arithmetic.
4810*----------------------------------------------------------------------------*/
4811
4812floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4813{
4814 flag aSign;
4815 int32 aExp;
bb98fe42 4816 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4817 floatx80 z;
4818
4819 aExp = extractFloatx80Exp( a );
4820 if ( 0x403E <= aExp ) {
bb98fe42 4821 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
158142c2
FB
4822 return propagateFloatx80NaN( a, a STATUS_VAR );
4823 }
4824 return a;
4825 }
4826 if ( aExp < 0x3FFF ) {
4827 if ( ( aExp == 0 )
bb98fe42 4828 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4829 return a;
4830 }
4831 STATUS(float_exception_flags) |= float_flag_inexact;
4832 aSign = extractFloatx80Sign( a );
4833 switch ( STATUS(float_rounding_mode) ) {
4834 case float_round_nearest_even:
bb98fe42 4835 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4836 ) {
4837 return
4838 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4839 }
4840 break;
f9288a76
PM
4841 case float_round_ties_away:
4842 if (aExp == 0x3FFE) {
4843 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4844 }
4845 break;
158142c2
FB
4846 case float_round_down:
4847 return
4848 aSign ?
4849 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4850 : packFloatx80( 0, 0, 0 );
4851 case float_round_up:
4852 return
4853 aSign ? packFloatx80( 1, 0, 0 )
4854 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4855 }
4856 return packFloatx80( aSign, 0, 0 );
4857 }
4858 lastBitMask = 1;
4859 lastBitMask <<= 0x403E - aExp;
4860 roundBitsMask = lastBitMask - 1;
4861 z = a;
dc355b76
PM
4862 switch (STATUS(float_rounding_mode)) {
4863 case float_round_nearest_even:
158142c2 4864 z.low += lastBitMask>>1;
dc355b76
PM
4865 if ((z.low & roundBitsMask) == 0) {
4866 z.low &= ~lastBitMask;
4867 }
4868 break;
f9288a76
PM
4869 case float_round_ties_away:
4870 z.low += lastBitMask >> 1;
4871 break;
dc355b76
PM
4872 case float_round_to_zero:
4873 break;
4874 case float_round_up:
4875 if (!extractFloatx80Sign(z)) {
4876 z.low += roundBitsMask;
4877 }
4878 break;
4879 case float_round_down:
4880 if (extractFloatx80Sign(z)) {
158142c2
FB
4881 z.low += roundBitsMask;
4882 }
dc355b76
PM
4883 break;
4884 default:
4885 abort();
158142c2
FB
4886 }
4887 z.low &= ~ roundBitsMask;
4888 if ( z.low == 0 ) {
4889 ++z.high;
4890 z.low = LIT64( 0x8000000000000000 );
4891 }
4892 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4893 return z;
4894
4895}
4896
4897/*----------------------------------------------------------------------------
4898| Returns the result of adding the absolute values of the extended double-
4899| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4900| negated before being returned. `zSign' is ignored if the result is a NaN.
4901| The addition is performed according to the IEC/IEEE Standard for Binary
4902| Floating-Point Arithmetic.
4903*----------------------------------------------------------------------------*/
4904
4905static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4906{
4907 int32 aExp, bExp, zExp;
bb98fe42 4908 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4909 int32 expDiff;
4910
4911 aSig = extractFloatx80Frac( a );
4912 aExp = extractFloatx80Exp( a );
4913 bSig = extractFloatx80Frac( b );
4914 bExp = extractFloatx80Exp( b );
4915 expDiff = aExp - bExp;
4916 if ( 0 < expDiff ) {
4917 if ( aExp == 0x7FFF ) {
bb98fe42 4918 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4919 return a;
4920 }
4921 if ( bExp == 0 ) --expDiff;
4922 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4923 zExp = aExp;
4924 }
4925 else if ( expDiff < 0 ) {
4926 if ( bExp == 0x7FFF ) {
bb98fe42 4927 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4928 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4929 }
4930 if ( aExp == 0 ) ++expDiff;
4931 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4932 zExp = bExp;
4933 }
4934 else {
4935 if ( aExp == 0x7FFF ) {
bb98fe42 4936 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4937 return propagateFloatx80NaN( a, b STATUS_VAR );
4938 }
4939 return a;
4940 }
4941 zSig1 = 0;
4942 zSig0 = aSig + bSig;
4943 if ( aExp == 0 ) {
4944 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4945 goto roundAndPack;
4946 }
4947 zExp = aExp;
4948 goto shiftRight1;
4949 }
4950 zSig0 = aSig + bSig;
bb98fe42 4951 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
4952 shiftRight1:
4953 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4954 zSig0 |= LIT64( 0x8000000000000000 );
4955 ++zExp;
4956 roundAndPack:
4957 return
4958 roundAndPackFloatx80(
4959 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4960
4961}
4962
4963/*----------------------------------------------------------------------------
4964| Returns the result of subtracting the absolute values of the extended
4965| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4966| difference is negated before being returned. `zSign' is ignored if the
4967| result is a NaN. The subtraction is performed according to the IEC/IEEE
4968| Standard for Binary Floating-Point Arithmetic.
4969*----------------------------------------------------------------------------*/
4970
4971static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4972{
4973 int32 aExp, bExp, zExp;
bb98fe42 4974 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4975 int32 expDiff;
4976 floatx80 z;
4977
4978 aSig = extractFloatx80Frac( a );
4979 aExp = extractFloatx80Exp( a );
4980 bSig = extractFloatx80Frac( b );
4981 bExp = extractFloatx80Exp( b );
4982 expDiff = aExp - bExp;
4983 if ( 0 < expDiff ) goto aExpBigger;
4984 if ( expDiff < 0 ) goto bExpBigger;
4985 if ( aExp == 0x7FFF ) {
bb98fe42 4986 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4987 return propagateFloatx80NaN( a, b STATUS_VAR );
4988 }
4989 float_raise( float_flag_invalid STATUS_VAR);
4990 z.low = floatx80_default_nan_low;
4991 z.high = floatx80_default_nan_high;
4992 return z;
4993 }
4994 if ( aExp == 0 ) {
4995 aExp = 1;
4996 bExp = 1;
4997 }
4998 zSig1 = 0;
4999 if ( bSig < aSig ) goto aBigger;
5000 if ( aSig < bSig ) goto bBigger;
5001 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
5002 bExpBigger:
5003 if ( bExp == 0x7FFF ) {
bb98fe42 5004 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5005 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5006 }
5007 if ( aExp == 0 ) ++expDiff;
5008 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5009 bBigger:
5010 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5011 zExp = bExp;
5012 zSign ^= 1;
5013 goto normalizeRoundAndPack;
5014 aExpBigger:
5015 if ( aExp == 0x7FFF ) {
bb98fe42 5016 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5017 return a;
5018 }
5019 if ( bExp == 0 ) --expDiff;
5020 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5021 aBigger:
5022 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5023 zExp = aExp;
5024 normalizeRoundAndPack:
5025 return
5026 normalizeRoundAndPackFloatx80(
5027 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5028
5029}
5030
5031/*----------------------------------------------------------------------------
5032| Returns the result of adding the extended double-precision floating-point
5033| values `a' and `b'. The operation is performed according to the IEC/IEEE
5034| Standard for Binary Floating-Point Arithmetic.
5035*----------------------------------------------------------------------------*/
5036
5037floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
5038{
5039 flag aSign, bSign;
5040
5041 aSign = extractFloatx80Sign( a );
5042 bSign = extractFloatx80Sign( b );
5043 if ( aSign == bSign ) {
5044 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5045 }
5046 else {
5047 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5048 }
5049
5050}
5051
5052/*----------------------------------------------------------------------------
5053| Returns the result of subtracting the extended double-precision floating-
5054| point values `a' and `b'. The operation is performed according to the
5055| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5056*----------------------------------------------------------------------------*/
5057
5058floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
5059{
5060 flag aSign, bSign;
5061
5062 aSign = extractFloatx80Sign( a );
5063 bSign = extractFloatx80Sign( b );
5064 if ( aSign == bSign ) {
5065 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5066 }
5067 else {
5068 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5069 }
5070
5071}
5072
5073/*----------------------------------------------------------------------------
5074| Returns the result of multiplying the extended double-precision floating-
5075| point values `a' and `b'. The operation is performed according to the
5076| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5077*----------------------------------------------------------------------------*/
5078
5079floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
5080{
5081 flag aSign, bSign, zSign;
5082 int32 aExp, bExp, zExp;
bb98fe42 5083 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
5084 floatx80 z;
5085
5086 aSig = extractFloatx80Frac( a );
5087 aExp = extractFloatx80Exp( a );
5088 aSign = extractFloatx80Sign( a );
5089 bSig = extractFloatx80Frac( b );
5090 bExp = extractFloatx80Exp( b );
5091 bSign = extractFloatx80Sign( b );
5092 zSign = aSign ^ bSign;
5093 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5094 if ( (uint64_t) ( aSig<<1 )
5095 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
5096 return propagateFloatx80NaN( a, b STATUS_VAR );
5097 }
5098 if ( ( bExp | bSig ) == 0 ) goto invalid;
5099 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5100 }
5101 if ( bExp == 0x7FFF ) {
bb98fe42 5102 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5103 if ( ( aExp | aSig ) == 0 ) {
5104 invalid:
5105 float_raise( float_flag_invalid STATUS_VAR);
5106 z.low = floatx80_default_nan_low;
5107 z.high = floatx80_default_nan_high;
5108 return z;
5109 }
5110 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5111 }
5112 if ( aExp == 0 ) {
5113 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5114 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5115 }
5116 if ( bExp == 0 ) {
5117 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5118 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5119 }
5120 zExp = aExp + bExp - 0x3FFE;
5121 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 5122 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
5123 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5124 --zExp;
5125 }
5126 return
5127 roundAndPackFloatx80(
5128 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5129
5130}
5131
5132/*----------------------------------------------------------------------------
5133| Returns the result of dividing the extended double-precision floating-point
5134| value `a' by the corresponding value `b'. The operation is performed
5135| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5136*----------------------------------------------------------------------------*/
5137
5138floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5139{
5140 flag aSign, bSign, zSign;
5141 int32 aExp, bExp, zExp;
bb98fe42
AF
5142 uint64_t aSig, bSig, zSig0, zSig1;
5143 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
5144 floatx80 z;
5145
5146 aSig = extractFloatx80Frac( a );
5147 aExp = extractFloatx80Exp( a );
5148 aSign = extractFloatx80Sign( a );
5149 bSig = extractFloatx80Frac( b );
5150 bExp = extractFloatx80Exp( b );
5151 bSign = extractFloatx80Sign( b );
5152 zSign = aSign ^ bSign;
5153 if ( aExp == 0x7FFF ) {
bb98fe42 5154 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2 5155 if ( bExp == 0x7FFF ) {
bb98fe42 5156 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5157 goto invalid;
5158 }
5159 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5160 }
5161 if ( bExp == 0x7FFF ) {
bb98fe42 5162 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5163 return packFloatx80( zSign, 0, 0 );
5164 }
5165 if ( bExp == 0 ) {
5166 if ( bSig == 0 ) {
5167 if ( ( aExp | aSig ) == 0 ) {
5168 invalid:
5169 float_raise( float_flag_invalid STATUS_VAR);
5170 z.low = floatx80_default_nan_low;
5171 z.high = floatx80_default_nan_high;
5172 return z;
5173 }
5174 float_raise( float_flag_divbyzero STATUS_VAR);
5175 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5176 }
5177 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5178 }
5179 if ( aExp == 0 ) {
5180 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5181 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5182 }
5183 zExp = aExp - bExp + 0x3FFE;
5184 rem1 = 0;
5185 if ( bSig <= aSig ) {
5186 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5187 ++zExp;
5188 }
5189 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5190 mul64To128( bSig, zSig0, &term0, &term1 );
5191 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 5192 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5193 --zSig0;
5194 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5195 }
5196 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 5197 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
5198 mul64To128( bSig, zSig1, &term1, &term2 );
5199 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 5200 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5201 --zSig1;
5202 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5203 }
5204 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5205 }
5206 return
5207 roundAndPackFloatx80(
5208 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5209
5210}
5211
5212/*----------------------------------------------------------------------------
5213| Returns the remainder of the extended double-precision floating-point value
5214| `a' with respect to the corresponding value `b'. The operation is performed
5215| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5216*----------------------------------------------------------------------------*/
5217
5218floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5219{
ed086f3d 5220 flag aSign, zSign;
158142c2 5221 int32 aExp, bExp, expDiff;
bb98fe42
AF
5222 uint64_t aSig0, aSig1, bSig;
5223 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
5224 floatx80 z;
5225
5226 aSig0 = extractFloatx80Frac( a );
5227 aExp = extractFloatx80Exp( a );
5228 aSign = extractFloatx80Sign( a );
5229 bSig = extractFloatx80Frac( b );
5230 bExp = extractFloatx80Exp( b );
158142c2 5231 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5232 if ( (uint64_t) ( aSig0<<1 )
5233 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
5234 return propagateFloatx80NaN( a, b STATUS_VAR );
5235 }
5236 goto invalid;
5237 }
5238 if ( bExp == 0x7FFF ) {
bb98fe42 5239 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5240 return a;
5241 }
5242 if ( bExp == 0 ) {
5243 if ( bSig == 0 ) {
5244 invalid:
5245 float_raise( float_flag_invalid STATUS_VAR);
5246 z.low = floatx80_default_nan_low;
5247 z.high = floatx80_default_nan_high;
5248 return z;
5249 }
5250 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5251 }
5252 if ( aExp == 0 ) {
bb98fe42 5253 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5254 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5255 }
5256 bSig |= LIT64( 0x8000000000000000 );
5257 zSign = aSign;
5258 expDiff = aExp - bExp;
5259 aSig1 = 0;
5260 if ( expDiff < 0 ) {
5261 if ( expDiff < -1 ) return a;
5262 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5263 expDiff = 0;
5264 }
5265 q = ( bSig <= aSig0 );
5266 if ( q ) aSig0 -= bSig;
5267 expDiff -= 64;
5268 while ( 0 < expDiff ) {
5269 q = estimateDiv128To64( aSig0, aSig1, bSig );
5270 q = ( 2 < q ) ? q - 2 : 0;
5271 mul64To128( bSig, q, &term0, &term1 );
5272 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5273 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5274 expDiff -= 62;
5275 }
5276 expDiff += 64;
5277 if ( 0 < expDiff ) {
5278 q = estimateDiv128To64( aSig0, aSig1, bSig );
5279 q = ( 2 < q ) ? q - 2 : 0;
5280 q >>= 64 - expDiff;
5281 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5282 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5283 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5284 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5285 ++q;
5286 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5287 }
5288 }
5289 else {
5290 term1 = 0;
5291 term0 = bSig;
5292 }
5293 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5294 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5295 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5296 && ( q & 1 ) )
5297 ) {
5298 aSig0 = alternateASig0;
5299 aSig1 = alternateASig1;
5300 zSign = ! zSign;
5301 }
5302 return
5303 normalizeRoundAndPackFloatx80(
5304 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5305
5306}
5307
5308/*----------------------------------------------------------------------------
5309| Returns the square root of the extended double-precision floating-point
5310| value `a'. The operation is performed according to the IEC/IEEE Standard
5311| for Binary Floating-Point Arithmetic.
5312*----------------------------------------------------------------------------*/
5313
5314floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5315{
5316 flag aSign;
5317 int32 aExp, zExp;
bb98fe42
AF
5318 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5319 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5320 floatx80 z;
5321
5322 aSig0 = extractFloatx80Frac( a );
5323 aExp = extractFloatx80Exp( a );
5324 aSign = extractFloatx80Sign( a );
5325 if ( aExp == 0x7FFF ) {
bb98fe42 5326 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
158142c2
FB
5327 if ( ! aSign ) return a;
5328 goto invalid;
5329 }
5330 if ( aSign ) {
5331 if ( ( aExp | aSig0 ) == 0 ) return a;
5332 invalid:
5333 float_raise( float_flag_invalid STATUS_VAR);
5334 z.low = floatx80_default_nan_low;
5335 z.high = floatx80_default_nan_high;
5336 return z;
5337 }
5338 if ( aExp == 0 ) {
5339 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5340 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5341 }
5342 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5343 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5344 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5345 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5346 doubleZSig0 = zSig0<<1;
5347 mul64To128( zSig0, zSig0, &term0, &term1 );
5348 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5349 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5350 --zSig0;
5351 doubleZSig0 -= 2;
5352 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5353 }
5354 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5355 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5356 if ( zSig1 == 0 ) zSig1 = 1;
5357 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5358 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5359 mul64To128( zSig1, zSig1, &term2, &term3 );
5360 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5361 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5362 --zSig1;
5363 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5364 term3 |= 1;
5365 term2 |= doubleZSig0;
5366 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5367 }
5368 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5369 }
5370 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5371 zSig0 |= doubleZSig0;
5372 return
5373 roundAndPackFloatx80(
5374 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5375
5376}
5377
5378/*----------------------------------------------------------------------------
b689362d
AJ
5379| Returns 1 if the extended double-precision floating-point value `a' is equal
5380| to the corresponding value `b', and 0 otherwise. The invalid exception is
5381| raised if either operand is a NaN. Otherwise, the comparison is performed
5382| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5383*----------------------------------------------------------------------------*/
5384
b689362d 5385int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5386{
5387
5388 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5389 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5390 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5391 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5392 ) {
b689362d 5393 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
5394 return 0;
5395 }
5396 return
5397 ( a.low == b.low )
5398 && ( ( a.high == b.high )
5399 || ( ( a.low == 0 )
bb98fe42 5400 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5401 );
5402
5403}
5404
5405/*----------------------------------------------------------------------------
5406| Returns 1 if the extended double-precision floating-point value `a' is
5407| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5408| invalid exception is raised if either operand is a NaN. The comparison is
5409| performed according to the IEC/IEEE Standard for Binary Floating-Point
5410| Arithmetic.
158142c2
FB
5411*----------------------------------------------------------------------------*/
5412
750afe93 5413int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5414{
5415 flag aSign, bSign;
5416
5417 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5418 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5419 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5420 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5421 ) {
5422 float_raise( float_flag_invalid STATUS_VAR);
5423 return 0;
5424 }
5425 aSign = extractFloatx80Sign( a );
5426 bSign = extractFloatx80Sign( b );
5427 if ( aSign != bSign ) {
5428 return
5429 aSign
bb98fe42 5430 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5431 == 0 );
5432 }
5433 return
5434 aSign ? le128( b.high, b.low, a.high, a.low )
5435 : le128( a.high, a.low, b.high, b.low );
5436
5437}
5438
5439/*----------------------------------------------------------------------------
5440| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5441| less than the corresponding value `b', and 0 otherwise. The invalid
5442| exception is raised if either operand is a NaN. The comparison is performed
5443| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5444*----------------------------------------------------------------------------*/
5445
750afe93 5446int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5447{
5448 flag aSign, bSign;
5449
5450 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5451 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5452 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5453 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5454 ) {
5455 float_raise( float_flag_invalid STATUS_VAR);
5456 return 0;
5457 }
5458 aSign = extractFloatx80Sign( a );
5459 bSign = extractFloatx80Sign( b );
5460 if ( aSign != bSign ) {
5461 return
5462 aSign
bb98fe42 5463 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5464 != 0 );
5465 }
5466 return
5467 aSign ? lt128( b.high, b.low, a.high, a.low )
5468 : lt128( a.high, a.low, b.high, b.low );
5469
5470}
5471
67b7861d
AJ
5472/*----------------------------------------------------------------------------
5473| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5474| cannot be compared, and 0 otherwise. The invalid exception is raised if
5475| either operand is a NaN. The comparison is performed according to the
5476| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
5477*----------------------------------------------------------------------------*/
5478int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5479{
5480 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5481 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5482 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5483 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5484 ) {
5485 float_raise( float_flag_invalid STATUS_VAR);
5486 return 1;
5487 }
5488 return 0;
5489}
5490
158142c2 5491/*----------------------------------------------------------------------------
b689362d 5492| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5493| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5494| cause an exception. The comparison is performed according to the IEC/IEEE
5495| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5496*----------------------------------------------------------------------------*/
5497
b689362d 5498int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5499{
5500
5501 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5502 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5503 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5504 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5505 ) {
b689362d
AJ
5506 if ( floatx80_is_signaling_nan( a )
5507 || floatx80_is_signaling_nan( b ) ) {
5508 float_raise( float_flag_invalid STATUS_VAR);
5509 }
158142c2
FB
5510 return 0;
5511 }
5512 return
5513 ( a.low == b.low )
5514 && ( ( a.high == b.high )
5515 || ( ( a.low == 0 )
bb98fe42 5516 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5517 );
5518
5519}
5520
5521/*----------------------------------------------------------------------------
5522| Returns 1 if the extended double-precision floating-point value `a' is less
5523| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5524| do not cause an exception. Otherwise, the comparison is performed according
5525| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5526*----------------------------------------------------------------------------*/
5527
750afe93 5528int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5529{
5530 flag aSign, bSign;
5531
5532 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5533 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5534 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5535 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5536 ) {
5537 if ( floatx80_is_signaling_nan( a )
5538 || floatx80_is_signaling_nan( b ) ) {
5539 float_raise( float_flag_invalid STATUS_VAR);
5540 }
5541 return 0;
5542 }
5543 aSign = extractFloatx80Sign( a );
5544 bSign = extractFloatx80Sign( b );
5545 if ( aSign != bSign ) {
5546 return
5547 aSign
bb98fe42 5548 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5549 == 0 );
5550 }
5551 return
5552 aSign ? le128( b.high, b.low, a.high, a.low )
5553 : le128( a.high, a.low, b.high, b.low );
5554
5555}
5556
5557/*----------------------------------------------------------------------------
5558| Returns 1 if the extended double-precision floating-point value `a' is less
5559| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5560| an exception. Otherwise, the comparison is performed according to the
5561| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5562*----------------------------------------------------------------------------*/
5563
750afe93 5564int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5565{
5566 flag aSign, bSign;
5567
5568 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5569 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5570 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5571 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5572 ) {
5573 if ( floatx80_is_signaling_nan( a )
5574 || floatx80_is_signaling_nan( b ) ) {
5575 float_raise( float_flag_invalid STATUS_VAR);
5576 }
5577 return 0;
5578 }
5579 aSign = extractFloatx80Sign( a );
5580 bSign = extractFloatx80Sign( b );
5581 if ( aSign != bSign ) {
5582 return
5583 aSign
bb98fe42 5584 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5585 != 0 );
5586 }
5587 return
5588 aSign ? lt128( b.high, b.low, a.high, a.low )
5589 : lt128( a.high, a.low, b.high, b.low );
5590
5591}
5592
67b7861d
AJ
5593/*----------------------------------------------------------------------------
5594| Returns 1 if the extended double-precision floating-point values `a' and `b'
5595| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5596| The comparison is performed according to the IEC/IEEE Standard for Binary
5597| Floating-Point Arithmetic.
5598*----------------------------------------------------------------------------*/
5599int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5600{
5601 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5602 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5603 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5604 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5605 ) {
5606 if ( floatx80_is_signaling_nan( a )
5607 || floatx80_is_signaling_nan( b ) ) {
5608 float_raise( float_flag_invalid STATUS_VAR);
5609 }
5610 return 1;
5611 }
5612 return 0;
5613}
5614
158142c2
FB
5615/*----------------------------------------------------------------------------
5616| Returns the result of converting the quadruple-precision floating-point
5617| value `a' to the 32-bit two's complement integer format. The conversion
5618| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5619| Arithmetic---which means in particular that the conversion is rounded
5620| according to the current rounding mode. If `a' is a NaN, the largest
5621| positive integer is returned. Otherwise, if the conversion overflows, the
5622| largest integer with the same sign as `a' is returned.
5623*----------------------------------------------------------------------------*/
5624
5625int32 float128_to_int32( float128 a STATUS_PARAM )
5626{
5627 flag aSign;
5628 int32 aExp, shiftCount;
bb98fe42 5629 uint64_t aSig0, aSig1;
158142c2
FB
5630
5631 aSig1 = extractFloat128Frac1( a );
5632 aSig0 = extractFloat128Frac0( a );
5633 aExp = extractFloat128Exp( a );
5634 aSign = extractFloat128Sign( a );
5635 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5636 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5637 aSig0 |= ( aSig1 != 0 );
5638 shiftCount = 0x4028 - aExp;
5639 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5640 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5641
5642}
5643
5644/*----------------------------------------------------------------------------
5645| Returns the result of converting the quadruple-precision floating-point
5646| value `a' to the 32-bit two's complement integer format. The conversion
5647| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5648| Arithmetic, except that the conversion is always rounded toward zero. If
5649| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5650| conversion overflows, the largest integer with the same sign as `a' is
5651| returned.
5652*----------------------------------------------------------------------------*/
5653
5654int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5655{
5656 flag aSign;
5657 int32 aExp, shiftCount;
bb98fe42 5658 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5659 int32_t z;
158142c2
FB
5660
5661 aSig1 = extractFloat128Frac1( a );
5662 aSig0 = extractFloat128Frac0( a );
5663 aExp = extractFloat128Exp( a );
5664 aSign = extractFloat128Sign( a );
5665 aSig0 |= ( aSig1 != 0 );
5666 if ( 0x401E < aExp ) {
5667 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5668 goto invalid;
5669 }
5670 else if ( aExp < 0x3FFF ) {
5671 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5672 return 0;
5673 }
5674 aSig0 |= LIT64( 0x0001000000000000 );
5675 shiftCount = 0x402F - aExp;
5676 savedASig = aSig0;
5677 aSig0 >>= shiftCount;
5678 z = aSig0;
5679 if ( aSign ) z = - z;
5680 if ( ( z < 0 ) ^ aSign ) {
5681 invalid:
5682 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 5683 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5684 }
5685 if ( ( aSig0<<shiftCount ) != savedASig ) {
5686 STATUS(float_exception_flags) |= float_flag_inexact;
5687 }
5688 return z;
5689
5690}
5691
5692/*----------------------------------------------------------------------------
5693| Returns the result of converting the quadruple-precision floating-point
5694| value `a' to the 64-bit two's complement integer format. The conversion
5695| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5696| Arithmetic---which means in particular that the conversion is rounded
5697| according to the current rounding mode. If `a' is a NaN, the largest
5698| positive integer is returned. Otherwise, if the conversion overflows, the
5699| largest integer with the same sign as `a' is returned.
5700*----------------------------------------------------------------------------*/
5701
5702int64 float128_to_int64( float128 a STATUS_PARAM )
5703{
5704 flag aSign;
5705 int32 aExp, shiftCount;
bb98fe42 5706 uint64_t aSig0, aSig1;
158142c2
FB
5707
5708 aSig1 = extractFloat128Frac1( a );
5709 aSig0 = extractFloat128Frac0( a );
5710 aExp = extractFloat128Exp( a );
5711 aSign = extractFloat128Sign( a );
5712 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5713 shiftCount = 0x402F - aExp;
5714 if ( shiftCount <= 0 ) {
5715 if ( 0x403E < aExp ) {
5716 float_raise( float_flag_invalid STATUS_VAR);
5717 if ( ! aSign
5718 || ( ( aExp == 0x7FFF )
5719 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5720 )
5721 ) {
5722 return LIT64( 0x7FFFFFFFFFFFFFFF );
5723 }
bb98fe42 5724 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5725 }
5726 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5727 }
5728 else {
5729 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5730 }
5731 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5732
5733}
5734
5735/*----------------------------------------------------------------------------
5736| Returns the result of converting the quadruple-precision floating-point
5737| value `a' to the 64-bit two's complement integer format. The conversion
5738| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5739| Arithmetic, except that the conversion is always rounded toward zero.
5740| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5741| the conversion overflows, the largest integer with the same sign as `a' is
5742| returned.
5743*----------------------------------------------------------------------------*/
5744
5745int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5746{
5747 flag aSign;
5748 int32 aExp, shiftCount;
bb98fe42 5749 uint64_t aSig0, aSig1;
158142c2
FB
5750 int64 z;
5751
5752 aSig1 = extractFloat128Frac1( a );
5753 aSig0 = extractFloat128Frac0( a );
5754 aExp = extractFloat128Exp( a );
5755 aSign = extractFloat128Sign( a );
5756 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5757 shiftCount = aExp - 0x402F;
5758 if ( 0 < shiftCount ) {
5759 if ( 0x403E <= aExp ) {
5760 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5761 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5762 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5763 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5764 }
5765 else {
5766 float_raise( float_flag_invalid STATUS_VAR);
5767 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5768 return LIT64( 0x7FFFFFFFFFFFFFFF );
5769 }
5770 }
bb98fe42 5771 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5772 }
5773 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 5774 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
158142c2
FB
5775 STATUS(float_exception_flags) |= float_flag_inexact;
5776 }
5777 }
5778 else {
5779 if ( aExp < 0x3FFF ) {
5780 if ( aExp | aSig0 | aSig1 ) {
5781 STATUS(float_exception_flags) |= float_flag_inexact;
5782 }
5783 return 0;
5784 }
5785 z = aSig0>>( - shiftCount );
5786 if ( aSig1
bb98fe42 5787 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
158142c2
FB
5788 STATUS(float_exception_flags) |= float_flag_inexact;
5789 }
5790 }
5791 if ( aSign ) z = - z;
5792 return z;
5793
5794}
5795
5796/*----------------------------------------------------------------------------
5797| Returns the result of converting the quadruple-precision floating-point
5798| value `a' to the single-precision floating-point format. The conversion
5799| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5800| Arithmetic.
5801*----------------------------------------------------------------------------*/
5802
5803float32 float128_to_float32( float128 a STATUS_PARAM )
5804{
5805 flag aSign;
5806 int32 aExp;
bb98fe42
AF
5807 uint64_t aSig0, aSig1;
5808 uint32_t zSig;
158142c2
FB
5809
5810 aSig1 = extractFloat128Frac1( a );
5811 aSig0 = extractFloat128Frac0( a );
5812 aExp = extractFloat128Exp( a );
5813 aSign = extractFloat128Sign( a );
5814 if ( aExp == 0x7FFF ) {
5815 if ( aSig0 | aSig1 ) {
bcd4d9af 5816 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5817 }
5818 return packFloat32( aSign, 0xFF, 0 );
5819 }
5820 aSig0 |= ( aSig1 != 0 );
5821 shift64RightJamming( aSig0, 18, &aSig0 );
5822 zSig = aSig0;
5823 if ( aExp || zSig ) {
5824 zSig |= 0x40000000;
5825 aExp -= 0x3F81;
5826 }
5827 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5828
5829}
5830
5831/*----------------------------------------------------------------------------
5832| Returns the result of converting the quadruple-precision floating-point
5833| value `a' to the double-precision floating-point format. The conversion
5834| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5835| Arithmetic.
5836*----------------------------------------------------------------------------*/
5837
5838float64 float128_to_float64( float128 a STATUS_PARAM )
5839{
5840 flag aSign;
5841 int32 aExp;
bb98fe42 5842 uint64_t aSig0, aSig1;
158142c2
FB
5843
5844 aSig1 = extractFloat128Frac1( a );
5845 aSig0 = extractFloat128Frac0( a );
5846 aExp = extractFloat128Exp( a );
5847 aSign = extractFloat128Sign( a );
5848 if ( aExp == 0x7FFF ) {
5849 if ( aSig0 | aSig1 ) {
bcd4d9af 5850 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5851 }
5852 return packFloat64( aSign, 0x7FF, 0 );
5853 }
5854 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5855 aSig0 |= ( aSig1 != 0 );
5856 if ( aExp || aSig0 ) {
5857 aSig0 |= LIT64( 0x4000000000000000 );
5858 aExp -= 0x3C01;
5859 }
5860 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5861
5862}
5863
158142c2
FB
5864/*----------------------------------------------------------------------------
5865| Returns the result of converting the quadruple-precision floating-point
5866| value `a' to the extended double-precision floating-point format. The
5867| conversion is performed according to the IEC/IEEE Standard for Binary
5868| Floating-Point Arithmetic.
5869*----------------------------------------------------------------------------*/
5870
5871floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5872{
5873 flag aSign;
5874 int32 aExp;
bb98fe42 5875 uint64_t aSig0, aSig1;
158142c2
FB
5876
5877 aSig1 = extractFloat128Frac1( a );
5878 aSig0 = extractFloat128Frac0( a );
5879 aExp = extractFloat128Exp( a );
5880 aSign = extractFloat128Sign( a );
5881 if ( aExp == 0x7FFF ) {
5882 if ( aSig0 | aSig1 ) {
bcd4d9af 5883 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5884 }
5885 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5886 }
5887 if ( aExp == 0 ) {
5888 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5889 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5890 }
5891 else {
5892 aSig0 |= LIT64( 0x0001000000000000 );
5893 }
5894 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5895 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5896
5897}
5898
158142c2
FB
5899/*----------------------------------------------------------------------------
5900| Rounds the quadruple-precision floating-point value `a' to an integer, and
5901| returns the result as a quadruple-precision floating-point value. The
5902| operation is performed according to the IEC/IEEE Standard for Binary
5903| Floating-Point Arithmetic.
5904*----------------------------------------------------------------------------*/
5905
5906float128 float128_round_to_int( float128 a STATUS_PARAM )
5907{
5908 flag aSign;
5909 int32 aExp;
bb98fe42 5910 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5911 float128 z;
5912
5913 aExp = extractFloat128Exp( a );
5914 if ( 0x402F <= aExp ) {
5915 if ( 0x406F <= aExp ) {
5916 if ( ( aExp == 0x7FFF )
5917 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5918 ) {
5919 return propagateFloat128NaN( a, a STATUS_VAR );
5920 }
5921 return a;
5922 }
5923 lastBitMask = 1;
5924 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5925 roundBitsMask = lastBitMask - 1;
5926 z = a;
dc355b76
PM
5927 switch (STATUS(float_rounding_mode)) {
5928 case float_round_nearest_even:
158142c2
FB
5929 if ( lastBitMask ) {
5930 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5931 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5932 }
5933 else {
bb98fe42 5934 if ( (int64_t) z.low < 0 ) {
158142c2 5935 ++z.high;
bb98fe42 5936 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
5937 }
5938 }
dc355b76 5939 break;
f9288a76
PM
5940 case float_round_ties_away:
5941 if (lastBitMask) {
5942 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5943 } else {
5944 if ((int64_t) z.low < 0) {
5945 ++z.high;
5946 }
5947 }
5948 break;
dc355b76
PM
5949 case float_round_to_zero:
5950 break;
5951 case float_round_up:
5952 if (!extractFloat128Sign(z)) {
5953 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5954 }
5955 break;
5956 case float_round_down:
5957 if (extractFloat128Sign(z)) {
5958 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 5959 }
dc355b76
PM
5960 break;
5961 default:
5962 abort();
158142c2
FB
5963 }
5964 z.low &= ~ roundBitsMask;
5965 }
5966 else {
5967 if ( aExp < 0x3FFF ) {
bb98fe42 5968 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
158142c2
FB
5969 STATUS(float_exception_flags) |= float_flag_inexact;
5970 aSign = extractFloat128Sign( a );
5971 switch ( STATUS(float_rounding_mode) ) {
5972 case float_round_nearest_even:
5973 if ( ( aExp == 0x3FFE )
5974 && ( extractFloat128Frac0( a )
5975 | extractFloat128Frac1( a ) )
5976 ) {
5977 return packFloat128( aSign, 0x3FFF, 0, 0 );
5978 }
5979 break;
f9288a76
PM
5980 case float_round_ties_away:
5981 if (aExp == 0x3FFE) {
5982 return packFloat128(aSign, 0x3FFF, 0, 0);
5983 }
5984 break;
158142c2
FB
5985 case float_round_down:
5986 return
5987 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5988 : packFloat128( 0, 0, 0, 0 );
5989 case float_round_up:
5990 return
5991 aSign ? packFloat128( 1, 0, 0, 0 )
5992 : packFloat128( 0, 0x3FFF, 0, 0 );
5993 }
5994 return packFloat128( aSign, 0, 0, 0 );
5995 }
5996 lastBitMask = 1;
5997 lastBitMask <<= 0x402F - aExp;
5998 roundBitsMask = lastBitMask - 1;
5999 z.low = 0;
6000 z.high = a.high;
dc355b76
PM
6001 switch (STATUS(float_rounding_mode)) {
6002 case float_round_nearest_even:
158142c2
FB
6003 z.high += lastBitMask>>1;
6004 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6005 z.high &= ~ lastBitMask;
6006 }
dc355b76 6007 break;
f9288a76
PM
6008 case float_round_ties_away:
6009 z.high += lastBitMask>>1;
6010 break;
dc355b76
PM
6011 case float_round_to_zero:
6012 break;
6013 case float_round_up:
6014 if (!extractFloat128Sign(z)) {
158142c2
FB
6015 z.high |= ( a.low != 0 );
6016 z.high += roundBitsMask;
6017 }
dc355b76
PM
6018 break;
6019 case float_round_down:
6020 if (extractFloat128Sign(z)) {
6021 z.high |= (a.low != 0);
6022 z.high += roundBitsMask;
6023 }
6024 break;
6025 default:
6026 abort();
158142c2
FB
6027 }
6028 z.high &= ~ roundBitsMask;
6029 }
6030 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6031 STATUS(float_exception_flags) |= float_flag_inexact;
6032 }
6033 return z;
6034
6035}
6036
6037/*----------------------------------------------------------------------------
6038| Returns the result of adding the absolute values of the quadruple-precision
6039| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6040| before being returned. `zSign' is ignored if the result is a NaN.
6041| The addition is performed according to the IEC/IEEE Standard for Binary
6042| Floating-Point Arithmetic.
6043*----------------------------------------------------------------------------*/
6044
6045static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6046{
6047 int32 aExp, bExp, zExp;
bb98fe42 6048 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
158142c2
FB
6049 int32 expDiff;
6050
6051 aSig1 = extractFloat128Frac1( a );
6052 aSig0 = extractFloat128Frac0( a );
6053 aExp = extractFloat128Exp( a );
6054 bSig1 = extractFloat128Frac1( b );
6055 bSig0 = extractFloat128Frac0( b );
6056 bExp = extractFloat128Exp( b );
6057 expDiff = aExp - bExp;
6058 if ( 0 < expDiff ) {
6059 if ( aExp == 0x7FFF ) {
6060 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6061 return a;
6062 }
6063 if ( bExp == 0 ) {
6064 --expDiff;
6065 }
6066 else {
6067 bSig0 |= LIT64( 0x0001000000000000 );
6068 }
6069 shift128ExtraRightJamming(
6070 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6071 zExp = aExp;
6072 }
6073 else if ( expDiff < 0 ) {
6074 if ( bExp == 0x7FFF ) {
6075 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6076 return packFloat128( zSign, 0x7FFF, 0, 0 );
6077 }
6078 if ( aExp == 0 ) {
6079 ++expDiff;
6080 }
6081 else {
6082 aSig0 |= LIT64( 0x0001000000000000 );
6083 }
6084 shift128ExtraRightJamming(
6085 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6086 zExp = bExp;
6087 }
6088 else {
6089 if ( aExp == 0x7FFF ) {
6090 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6091 return propagateFloat128NaN( a, b STATUS_VAR );
6092 }
6093 return a;
6094 }
6095 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 6096 if ( aExp == 0 ) {
e6afc87f
PM
6097 if (STATUS(flush_to_zero)) {
6098 if (zSig0 | zSig1) {
6099 float_raise(float_flag_output_denormal STATUS_VAR);
6100 }
6101 return packFloat128(zSign, 0, 0, 0);
6102 }
fe76d976
PB
6103 return packFloat128( zSign, 0, zSig0, zSig1 );
6104 }
158142c2
FB
6105 zSig2 = 0;
6106 zSig0 |= LIT64( 0x0002000000000000 );
6107 zExp = aExp;
6108 goto shiftRight1;
6109 }
6110 aSig0 |= LIT64( 0x0001000000000000 );
6111 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6112 --zExp;
6113 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6114 ++zExp;
6115 shiftRight1:
6116 shift128ExtraRightJamming(
6117 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6118 roundAndPack:
6119 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6120
6121}
6122
6123/*----------------------------------------------------------------------------
6124| Returns the result of subtracting the absolute values of the quadruple-
6125| precision floating-point values `a' and `b'. If `zSign' is 1, the
6126| difference is negated before being returned. `zSign' is ignored if the
6127| result is a NaN. The subtraction is performed according to the IEC/IEEE
6128| Standard for Binary Floating-Point Arithmetic.
6129*----------------------------------------------------------------------------*/
6130
6131static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6132{
6133 int32 aExp, bExp, zExp;
bb98fe42 6134 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
158142c2
FB
6135 int32 expDiff;
6136 float128 z;
6137
6138 aSig1 = extractFloat128Frac1( a );
6139 aSig0 = extractFloat128Frac0( a );
6140 aExp = extractFloat128Exp( a );
6141 bSig1 = extractFloat128Frac1( b );
6142 bSig0 = extractFloat128Frac0( b );
6143 bExp = extractFloat128Exp( b );
6144 expDiff = aExp - bExp;
6145 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6146 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6147 if ( 0 < expDiff ) goto aExpBigger;
6148 if ( expDiff < 0 ) goto bExpBigger;
6149 if ( aExp == 0x7FFF ) {
6150 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6151 return propagateFloat128NaN( a, b STATUS_VAR );
6152 }
6153 float_raise( float_flag_invalid STATUS_VAR);
6154 z.low = float128_default_nan_low;
6155 z.high = float128_default_nan_high;
6156 return z;
6157 }
6158 if ( aExp == 0 ) {
6159 aExp = 1;
6160 bExp = 1;
6161 }
6162 if ( bSig0 < aSig0 ) goto aBigger;
6163 if ( aSig0 < bSig0 ) goto bBigger;
6164 if ( bSig1 < aSig1 ) goto aBigger;
6165 if ( aSig1 < bSig1 ) goto bBigger;
6166 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6167 bExpBigger:
6168 if ( bExp == 0x7FFF ) {
6169 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6170 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6171 }
6172 if ( aExp == 0 ) {
6173 ++expDiff;
6174 }
6175 else {
6176 aSig0 |= LIT64( 0x4000000000000000 );
6177 }
6178 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6179 bSig0 |= LIT64( 0x4000000000000000 );
6180 bBigger:
6181 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6182 zExp = bExp;
6183 zSign ^= 1;
6184 goto normalizeRoundAndPack;
6185 aExpBigger:
6186 if ( aExp == 0x7FFF ) {
6187 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6188 return a;
6189 }
6190 if ( bExp == 0 ) {
6191 --expDiff;
6192 }
6193 else {
6194 bSig0 |= LIT64( 0x4000000000000000 );
6195 }
6196 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6197 aSig0 |= LIT64( 0x4000000000000000 );
6198 aBigger:
6199 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6200 zExp = aExp;
6201 normalizeRoundAndPack:
6202 --zExp;
6203 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6204
6205}
6206
6207/*----------------------------------------------------------------------------
6208| Returns the result of adding the quadruple-precision floating-point values
6209| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6210| for Binary Floating-Point Arithmetic.
6211*----------------------------------------------------------------------------*/
6212
6213float128 float128_add( float128 a, float128 b STATUS_PARAM )
6214{
6215 flag aSign, bSign;
6216
6217 aSign = extractFloat128Sign( a );
6218 bSign = extractFloat128Sign( b );
6219 if ( aSign == bSign ) {
6220 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6221 }
6222 else {
6223 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6224 }
6225
6226}
6227
6228/*----------------------------------------------------------------------------
6229| Returns the result of subtracting the quadruple-precision floating-point
6230| values `a' and `b'. The operation is performed according to the IEC/IEEE
6231| Standard for Binary Floating-Point Arithmetic.
6232*----------------------------------------------------------------------------*/
6233
6234float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6235{
6236 flag aSign, bSign;
6237
6238 aSign = extractFloat128Sign( a );
6239 bSign = extractFloat128Sign( b );
6240 if ( aSign == bSign ) {
6241 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6242 }
6243 else {
6244 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6245 }
6246
6247}
6248
6249/*----------------------------------------------------------------------------
6250| Returns the result of multiplying the quadruple-precision floating-point
6251| values `a' and `b'. The operation is performed according to the IEC/IEEE
6252| Standard for Binary Floating-Point Arithmetic.
6253*----------------------------------------------------------------------------*/
6254
6255float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6256{
6257 flag aSign, bSign, zSign;
6258 int32 aExp, bExp, zExp;
bb98fe42 6259 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6260 float128 z;
6261
6262 aSig1 = extractFloat128Frac1( a );
6263 aSig0 = extractFloat128Frac0( a );
6264 aExp = extractFloat128Exp( a );
6265 aSign = extractFloat128Sign( a );
6266 bSig1 = extractFloat128Frac1( b );
6267 bSig0 = extractFloat128Frac0( b );
6268 bExp = extractFloat128Exp( b );
6269 bSign = extractFloat128Sign( b );
6270 zSign = aSign ^ bSign;
6271 if ( aExp == 0x7FFF ) {
6272 if ( ( aSig0 | aSig1 )
6273 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6274 return propagateFloat128NaN( a, b STATUS_VAR );
6275 }
6276 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6277 return packFloat128( zSign, 0x7FFF, 0, 0 );
6278 }
6279 if ( bExp == 0x7FFF ) {
6280 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6281 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6282 invalid:
6283 float_raise( float_flag_invalid STATUS_VAR);
6284 z.low = float128_default_nan_low;
6285 z.high = float128_default_nan_high;
6286 return z;
6287 }
6288 return packFloat128( zSign, 0x7FFF, 0, 0 );
6289 }
6290 if ( aExp == 0 ) {
6291 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6292 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6293 }
6294 if ( bExp == 0 ) {
6295 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6296 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6297 }
6298 zExp = aExp + bExp - 0x4000;
6299 aSig0 |= LIT64( 0x0001000000000000 );
6300 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6301 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6302 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6303 zSig2 |= ( zSig3 != 0 );
6304 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6305 shift128ExtraRightJamming(
6306 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6307 ++zExp;
6308 }
6309 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6310
6311}
6312
6313/*----------------------------------------------------------------------------
6314| Returns the result of dividing the quadruple-precision floating-point value
6315| `a' by the corresponding value `b'. The operation is performed according to
6316| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6317*----------------------------------------------------------------------------*/
6318
6319float128 float128_div( float128 a, float128 b STATUS_PARAM )
6320{
6321 flag aSign, bSign, zSign;
6322 int32 aExp, bExp, zExp;
bb98fe42
AF
6323 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6324 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6325 float128 z;
6326
6327 aSig1 = extractFloat128Frac1( a );
6328 aSig0 = extractFloat128Frac0( a );
6329 aExp = extractFloat128Exp( a );
6330 aSign = extractFloat128Sign( a );
6331 bSig1 = extractFloat128Frac1( b );
6332 bSig0 = extractFloat128Frac0( b );
6333 bExp = extractFloat128Exp( b );
6334 bSign = extractFloat128Sign( b );
6335 zSign = aSign ^ bSign;
6336 if ( aExp == 0x7FFF ) {
6337 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6338 if ( bExp == 0x7FFF ) {
6339 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6340 goto invalid;
6341 }
6342 return packFloat128( zSign, 0x7FFF, 0, 0 );
6343 }
6344 if ( bExp == 0x7FFF ) {
6345 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6346 return packFloat128( zSign, 0, 0, 0 );
6347 }
6348 if ( bExp == 0 ) {
6349 if ( ( bSig0 | bSig1 ) == 0 ) {
6350 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6351 invalid:
6352 float_raise( float_flag_invalid STATUS_VAR);
6353 z.low = float128_default_nan_low;
6354 z.high = float128_default_nan_high;
6355 return z;
6356 }
6357 float_raise( float_flag_divbyzero STATUS_VAR);
6358 return packFloat128( zSign, 0x7FFF, 0, 0 );
6359 }
6360 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6361 }
6362 if ( aExp == 0 ) {
6363 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6364 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6365 }
6366 zExp = aExp - bExp + 0x3FFD;
6367 shortShift128Left(
6368 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6369 shortShift128Left(
6370 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6371 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6372 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6373 ++zExp;
6374 }
6375 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6376 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6377 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6378 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6379 --zSig0;
6380 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6381 }
6382 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6383 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6384 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6385 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6386 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6387 --zSig1;
6388 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6389 }
6390 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6391 }
6392 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6393 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6394
6395}
6396
6397/*----------------------------------------------------------------------------
6398| Returns the remainder of the quadruple-precision floating-point value `a'
6399| with respect to the corresponding value `b'. The operation is performed
6400| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6401*----------------------------------------------------------------------------*/
6402
6403float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6404{
ed086f3d 6405 flag aSign, zSign;
158142c2 6406 int32 aExp, bExp, expDiff;
bb98fe42
AF
6407 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6408 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6409 int64_t sigMean0;
158142c2
FB
6410 float128 z;
6411
6412 aSig1 = extractFloat128Frac1( a );
6413 aSig0 = extractFloat128Frac0( a );
6414 aExp = extractFloat128Exp( a );
6415 aSign = extractFloat128Sign( a );
6416 bSig1 = extractFloat128Frac1( b );
6417 bSig0 = extractFloat128Frac0( b );
6418 bExp = extractFloat128Exp( b );
158142c2
FB
6419 if ( aExp == 0x7FFF ) {
6420 if ( ( aSig0 | aSig1 )
6421 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6422 return propagateFloat128NaN( a, b STATUS_VAR );
6423 }
6424 goto invalid;
6425 }
6426 if ( bExp == 0x7FFF ) {
6427 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6428 return a;
6429 }
6430 if ( bExp == 0 ) {
6431 if ( ( bSig0 | bSig1 ) == 0 ) {
6432 invalid:
6433 float_raise( float_flag_invalid STATUS_VAR);
6434 z.low = float128_default_nan_low;
6435 z.high = float128_default_nan_high;
6436 return z;
6437 }
6438 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6439 }
6440 if ( aExp == 0 ) {
6441 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6442 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6443 }
6444 expDiff = aExp - bExp;
6445 if ( expDiff < -1 ) return a;
6446 shortShift128Left(
6447 aSig0 | LIT64( 0x0001000000000000 ),
6448 aSig1,
6449 15 - ( expDiff < 0 ),
6450 &aSig0,
6451 &aSig1
6452 );
6453 shortShift128Left(
6454 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6455 q = le128( bSig0, bSig1, aSig0, aSig1 );
6456 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6457 expDiff -= 64;
6458 while ( 0 < expDiff ) {
6459 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6460 q = ( 4 < q ) ? q - 4 : 0;
6461 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6462 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6463 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6464 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6465 expDiff -= 61;
6466 }
6467 if ( -64 < expDiff ) {
6468 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6469 q = ( 4 < q ) ? q - 4 : 0;
6470 q >>= - expDiff;
6471 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6472 expDiff += 52;
6473 if ( expDiff < 0 ) {
6474 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6475 }
6476 else {
6477 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6478 }
6479 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6480 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6481 }
6482 else {
6483 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6484 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6485 }
6486 do {
6487 alternateASig0 = aSig0;
6488 alternateASig1 = aSig1;
6489 ++q;
6490 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6491 } while ( 0 <= (int64_t) aSig0 );
158142c2 6492 add128(
bb98fe42 6493 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6494 if ( ( sigMean0 < 0 )
6495 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6496 aSig0 = alternateASig0;
6497 aSig1 = alternateASig1;
6498 }
bb98fe42 6499 zSign = ( (int64_t) aSig0 < 0 );
158142c2
FB
6500 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6501 return
6502 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6503
6504}
6505
6506/*----------------------------------------------------------------------------
6507| Returns the square root of the quadruple-precision floating-point value `a'.
6508| The operation is performed according to the IEC/IEEE Standard for Binary
6509| Floating-Point Arithmetic.
6510*----------------------------------------------------------------------------*/
6511
6512float128 float128_sqrt( float128 a STATUS_PARAM )
6513{
6514 flag aSign;
6515 int32 aExp, zExp;
bb98fe42
AF
6516 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6517 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6518 float128 z;
6519
6520 aSig1 = extractFloat128Frac1( a );
6521 aSig0 = extractFloat128Frac0( a );
6522 aExp = extractFloat128Exp( a );
6523 aSign = extractFloat128Sign( a );
6524 if ( aExp == 0x7FFF ) {
6525 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6526 if ( ! aSign ) return a;
6527 goto invalid;
6528 }
6529 if ( aSign ) {
6530 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6531 invalid:
6532 float_raise( float_flag_invalid STATUS_VAR);
6533 z.low = float128_default_nan_low;
6534 z.high = float128_default_nan_high;
6535 return z;
6536 }
6537 if ( aExp == 0 ) {
6538 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6539 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6540 }
6541 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6542 aSig0 |= LIT64( 0x0001000000000000 );
6543 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6544 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6545 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6546 doubleZSig0 = zSig0<<1;
6547 mul64To128( zSig0, zSig0, &term0, &term1 );
6548 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6549 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6550 --zSig0;
6551 doubleZSig0 -= 2;
6552 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6553 }
6554 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6555 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6556 if ( zSig1 == 0 ) zSig1 = 1;
6557 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6558 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6559 mul64To128( zSig1, zSig1, &term2, &term3 );
6560 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6561 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6562 --zSig1;
6563 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6564 term3 |= 1;
6565 term2 |= doubleZSig0;
6566 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6567 }
6568 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6569 }
6570 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6571 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6572
6573}
6574
6575/*----------------------------------------------------------------------------
6576| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6577| the corresponding value `b', and 0 otherwise. The invalid exception is
6578| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6579| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6580*----------------------------------------------------------------------------*/
6581
b689362d 6582int float128_eq( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6583{
6584
6585 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6586 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6587 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6588 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6589 ) {
b689362d 6590 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
6591 return 0;
6592 }
6593 return
6594 ( a.low == b.low )
6595 && ( ( a.high == b.high )
6596 || ( ( a.low == 0 )
bb98fe42 6597 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6598 );
6599
6600}
6601
6602/*----------------------------------------------------------------------------
6603| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6604| or equal to the corresponding value `b', and 0 otherwise. The invalid
6605| exception is raised if either operand is a NaN. The comparison is performed
6606| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6607*----------------------------------------------------------------------------*/
6608
750afe93 6609int float128_le( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6610{
6611 flag aSign, bSign;
6612
6613 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6614 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6615 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6616 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6617 ) {
6618 float_raise( float_flag_invalid STATUS_VAR);
6619 return 0;
6620 }
6621 aSign = extractFloat128Sign( a );
6622 bSign = extractFloat128Sign( b );
6623 if ( aSign != bSign ) {
6624 return
6625 aSign
bb98fe42 6626 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6627 == 0 );
6628 }
6629 return
6630 aSign ? le128( b.high, b.low, a.high, a.low )
6631 : le128( a.high, a.low, b.high, b.low );
6632
6633}
6634
6635/*----------------------------------------------------------------------------
6636| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6637| the corresponding value `b', and 0 otherwise. The invalid exception is
6638| raised if either operand is a NaN. The comparison is performed according
6639| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6640*----------------------------------------------------------------------------*/
6641
750afe93 6642int float128_lt( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6643{
6644 flag aSign, bSign;
6645
6646 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6647 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6648 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6649 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6650 ) {
6651 float_raise( float_flag_invalid STATUS_VAR);
6652 return 0;
6653 }
6654 aSign = extractFloat128Sign( a );
6655 bSign = extractFloat128Sign( b );
6656 if ( aSign != bSign ) {
6657 return
6658 aSign
bb98fe42 6659 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6660 != 0 );
6661 }
6662 return
6663 aSign ? lt128( b.high, b.low, a.high, a.low )
6664 : lt128( a.high, a.low, b.high, b.low );
6665
6666}
6667
67b7861d
AJ
6668/*----------------------------------------------------------------------------
6669| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6670| be compared, and 0 otherwise. The invalid exception is raised if either
6671| operand is a NaN. The comparison is performed according to the IEC/IEEE
6672| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6673*----------------------------------------------------------------------------*/
6674
6675int float128_unordered( float128 a, float128 b STATUS_PARAM )
6676{
6677 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6678 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6679 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6680 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6681 ) {
6682 float_raise( float_flag_invalid STATUS_VAR);
6683 return 1;
6684 }
6685 return 0;
6686}
6687
158142c2
FB
6688/*----------------------------------------------------------------------------
6689| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6690| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6691| exception. The comparison is performed according to the IEC/IEEE Standard
6692| for Binary Floating-Point Arithmetic.
158142c2
FB
6693*----------------------------------------------------------------------------*/
6694
b689362d 6695int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6696{
6697
6698 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6699 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6700 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6701 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6702 ) {
b689362d
AJ
6703 if ( float128_is_signaling_nan( a )
6704 || float128_is_signaling_nan( b ) ) {
6705 float_raise( float_flag_invalid STATUS_VAR);
6706 }
158142c2
FB
6707 return 0;
6708 }
6709 return
6710 ( a.low == b.low )
6711 && ( ( a.high == b.high )
6712 || ( ( a.low == 0 )
bb98fe42 6713 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6714 );
6715
6716}
6717
6718/*----------------------------------------------------------------------------
6719| Returns 1 if the quadruple-precision floating-point value `a' is less than
6720| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6721| cause an exception. Otherwise, the comparison is performed according to the
6722| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6723*----------------------------------------------------------------------------*/
6724
750afe93 6725int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6726{
6727 flag aSign, bSign;
6728
6729 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6730 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6731 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6732 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6733 ) {
6734 if ( float128_is_signaling_nan( a )
6735 || float128_is_signaling_nan( b ) ) {
6736 float_raise( float_flag_invalid STATUS_VAR);
6737 }
6738 return 0;
6739 }
6740 aSign = extractFloat128Sign( a );
6741 bSign = extractFloat128Sign( b );
6742 if ( aSign != bSign ) {
6743 return
6744 aSign
bb98fe42 6745 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6746 == 0 );
6747 }
6748 return
6749 aSign ? le128( b.high, b.low, a.high, a.low )
6750 : le128( a.high, a.low, b.high, b.low );
6751
6752}
6753
6754/*----------------------------------------------------------------------------
6755| Returns 1 if the quadruple-precision floating-point value `a' is less than
6756| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6757| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6758| Standard for Binary Floating-Point Arithmetic.
6759*----------------------------------------------------------------------------*/
6760
750afe93 6761int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6762{
6763 flag aSign, bSign;
6764
6765 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6766 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6767 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6768 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6769 ) {
6770 if ( float128_is_signaling_nan( a )
6771 || float128_is_signaling_nan( b ) ) {
6772 float_raise( float_flag_invalid STATUS_VAR);
6773 }
6774 return 0;
6775 }
6776 aSign = extractFloat128Sign( a );
6777 bSign = extractFloat128Sign( b );
6778 if ( aSign != bSign ) {
6779 return
6780 aSign
bb98fe42 6781 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6782 != 0 );
6783 }
6784 return
6785 aSign ? lt128( b.high, b.low, a.high, a.low )
6786 : lt128( a.high, a.low, b.high, b.low );
6787
6788}
6789
67b7861d
AJ
6790/*----------------------------------------------------------------------------
6791| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6792| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6793| comparison is performed according to the IEC/IEEE Standard for Binary
6794| Floating-Point Arithmetic.
6795*----------------------------------------------------------------------------*/
6796
6797int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6798{
6799 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6800 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6801 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6802 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6803 ) {
6804 if ( float128_is_signaling_nan( a )
6805 || float128_is_signaling_nan( b ) ) {
6806 float_raise( float_flag_invalid STATUS_VAR);
6807 }
6808 return 1;
6809 }
6810 return 0;
6811}
6812
1d6bda35 6813/* misc functions */
c4850f9e 6814float32 uint32_to_float32(uint32_t a STATUS_PARAM)
1d6bda35
FB
6815{
6816 return int64_to_float32(a STATUS_VAR);
6817}
6818
c4850f9e 6819float64 uint32_to_float64(uint32_t a STATUS_PARAM)
1d6bda35
FB
6820{
6821 return int64_to_float64(a STATUS_VAR);
6822}
6823
9f8d2a09 6824uint32 float32_to_uint32( float32 a STATUS_PARAM )
1d6bda35
FB
6825{
6826 int64_t v;
9f8d2a09 6827 uint32 res;
34e1c27b 6828 int old_exc_flags = get_float_exception_flags(status);
1d6bda35
FB
6829
6830 v = float32_to_int64(a STATUS_VAR);
6831 if (v < 0) {
6832 res = 0;
1d6bda35
FB
6833 } else if (v > 0xffffffff) {
6834 res = 0xffffffff;
1d6bda35 6835 } else {
34e1c27b 6836 return v;
1d6bda35 6837 }
34e1c27b
PM
6838 set_float_exception_flags(old_exc_flags, status);
6839 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6840 return res;
6841}
6842
9f8d2a09 6843uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
1d6bda35
FB
6844{
6845 int64_t v;
9f8d2a09 6846 uint32 res;
34e1c27b 6847 int old_exc_flags = get_float_exception_flags(status);
1d6bda35
FB
6848
6849 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6850 if (v < 0) {
6851 res = 0;
1d6bda35
FB
6852 } else if (v > 0xffffffff) {
6853 res = 0xffffffff;
1d6bda35 6854 } else {
34e1c27b 6855 return v;
1d6bda35 6856 }
34e1c27b
PM
6857 set_float_exception_flags(old_exc_flags, status);
6858 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6859 return res;
6860}
6861
f581bf54
WN
6862int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6863{
6864 int32_t v;
6865 int_fast16_t res;
6866 int old_exc_flags = get_float_exception_flags(status);
6867
6868 v = float32_to_int32(a STATUS_VAR);
6869 if (v < -0x8000) {
6870 res = -0x8000;
6871 } else if (v > 0x7fff) {
6872 res = 0x7fff;
6873 } else {
6874 return v;
6875 }
6876
6877 set_float_exception_flags(old_exc_flags, status);
6878 float_raise(float_flag_invalid STATUS_VAR);
6879 return res;
6880}
6881
6882uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6883{
6884 int32_t v;
6885 uint_fast16_t res;
6886 int old_exc_flags = get_float_exception_flags(status);
6887
6888 v = float32_to_int32(a STATUS_VAR);
6889 if (v < 0) {
6890 res = 0;
6891 } else if (v > 0xffff) {
6892 res = 0xffff;
6893 } else {
6894 return v;
6895 }
6896
6897 set_float_exception_flags(old_exc_flags, status);
6898 float_raise(float_flag_invalid STATUS_VAR);
6899 return res;
6900}
6901
5aea4c58 6902uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
6903{
6904 int64_t v;
5aea4c58 6905 uint_fast16_t res;
34e1c27b 6906 int old_exc_flags = get_float_exception_flags(status);
cbcef455
PM
6907
6908 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6909 if (v < 0) {
6910 res = 0;
cbcef455
PM
6911 } else if (v > 0xffff) {
6912 res = 0xffff;
cbcef455 6913 } else {
34e1c27b 6914 return v;
cbcef455 6915 }
34e1c27b
PM
6916 set_float_exception_flags(old_exc_flags, status);
6917 float_raise(float_flag_invalid STATUS_VAR);
cbcef455
PM
6918 return res;
6919}
6920
9f8d2a09 6921uint32 float64_to_uint32( float64 a STATUS_PARAM )
1d6bda35 6922{
5e7f654f 6923 uint64_t v;
9f8d2a09 6924 uint32 res;
5e7f654f 6925 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 6926
5e7f654f
TM
6927 v = float64_to_uint64(a STATUS_VAR);
6928 if (v > 0xffffffff) {
1d6bda35 6929 res = 0xffffffff;
1d6bda35 6930 } else {
5e7f654f 6931 return v;
1d6bda35 6932 }
5e7f654f
TM
6933 set_float_exception_flags(old_exc_flags, status);
6934 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6935 return res;
6936}
6937
9f8d2a09 6938uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
1d6bda35 6939{
fd728f2f 6940 uint64_t v;
9f8d2a09 6941 uint32 res;
fd728f2f 6942 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 6943
fd728f2f
TM
6944 v = float64_to_uint64_round_to_zero(a STATUS_VAR);
6945 if (v > 0xffffffff) {
1d6bda35 6946 res = 0xffffffff;
1d6bda35 6947 } else {
fd728f2f 6948 return v;
1d6bda35 6949 }
fd728f2f
TM
6950 set_float_exception_flags(old_exc_flags, status);
6951 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6952 return res;
6953}
6954
f581bf54
WN
6955int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6956{
6957 int64_t v;
6958 int_fast16_t res;
6959 int old_exc_flags = get_float_exception_flags(status);
6960
6961 v = float64_to_int32(a STATUS_VAR);
6962 if (v < -0x8000) {
6963 res = -0x8000;
6964 } else if (v > 0x7fff) {
6965 res = 0x7fff;
6966 } else {
6967 return v;
6968 }
6969
6970 set_float_exception_flags(old_exc_flags, status);
6971 float_raise(float_flag_invalid STATUS_VAR);
6972 return res;
6973}
6974
6975uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6976{
6977 int64_t v;
6978 uint_fast16_t res;
6979 int old_exc_flags = get_float_exception_flags(status);
6980
6981 v = float64_to_int32(a STATUS_VAR);
6982 if (v < 0) {
6983 res = 0;
6984 } else if (v > 0xffff) {
6985 res = 0xffff;
6986 } else {
6987 return v;
6988 }
6989
6990 set_float_exception_flags(old_exc_flags, status);
6991 float_raise(float_flag_invalid STATUS_VAR);
6992 return res;
6993}
6994
5aea4c58 6995uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
6996{
6997 int64_t v;
5aea4c58 6998 uint_fast16_t res;
34e1c27b 6999 int old_exc_flags = get_float_exception_flags(status);
cbcef455
PM
7000
7001 v = float64_to_int64_round_to_zero(a STATUS_VAR);
7002 if (v < 0) {
7003 res = 0;
cbcef455
PM
7004 } else if (v > 0xffff) {
7005 res = 0xffff;
cbcef455 7006 } else {
34e1c27b 7007 return v;
cbcef455 7008 }
34e1c27b
PM
7009 set_float_exception_flags(old_exc_flags, status);
7010 float_raise(float_flag_invalid STATUS_VAR);
cbcef455
PM
7011 return res;
7012}
7013
fb3ea83a
TM
7014/*----------------------------------------------------------------------------
7015| Returns the result of converting the double-precision floating-point value
7016| `a' to the 64-bit unsigned integer format. The conversion is
7017| performed according to the IEC/IEEE Standard for Binary Floating-Point
7018| Arithmetic---which means in particular that the conversion is rounded
7019| according to the current rounding mode. If `a' is a NaN, the largest
7020| positive integer is returned. If the conversion overflows, the
7021| largest unsigned integer is returned. If 'a' is negative, the value is
7022| rounded and zero is returned; negative values that do not round to zero
7023| will raise the inexact exception.
7024*----------------------------------------------------------------------------*/
75d62a58 7025
fb3ea83a
TM
7026uint64_t float64_to_uint64(float64 a STATUS_PARAM)
7027{
7028 flag aSign;
7029 int_fast16_t aExp, shiftCount;
7030 uint64_t aSig, aSigExtra;
7031 a = float64_squash_input_denormal(a STATUS_VAR);
75d62a58 7032
fb3ea83a
TM
7033 aSig = extractFloat64Frac(a);
7034 aExp = extractFloat64Exp(a);
7035 aSign = extractFloat64Sign(a);
7036 if (aSign && (aExp > 1022)) {
7037 float_raise(float_flag_invalid STATUS_VAR);
7038 if (float64_is_any_nan(a)) {
7039 return LIT64(0xFFFFFFFFFFFFFFFF);
7040 } else {
7041 return 0;
7042 }
7043 }
7044 if (aExp) {
7045 aSig |= LIT64(0x0010000000000000);
7046 }
7047 shiftCount = 0x433 - aExp;
7048 if (shiftCount <= 0) {
7049 if (0x43E < aExp) {
7050 float_raise(float_flag_invalid STATUS_VAR);
7051 return LIT64(0xFFFFFFFFFFFFFFFF);
7052 }
7053 aSigExtra = 0;
7054 aSig <<= -shiftCount;
7055 } else {
7056 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7057 }
7058 return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
75d62a58
JM
7059}
7060
7061uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
7062{
0a87a310
TM
7063 signed char current_rounding_mode = STATUS(float_rounding_mode);
7064 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
7065 int64_t v = float64_to_uint64(a STATUS_VAR);
7066 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
7067 return v;
75d62a58
JM
7068}
7069
1d6bda35 7070#define COMPARE(s, nan_exp) \
750afe93 7071INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
1d6bda35
FB
7072 int is_quiet STATUS_PARAM ) \
7073{ \
7074 flag aSign, bSign; \
bb98fe42 7075 uint ## s ## _t av, bv; \
37d18660
PM
7076 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7077 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
1d6bda35
FB
7078 \
7079 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7080 extractFloat ## s ## Frac( a ) ) || \
7081 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7082 extractFloat ## s ## Frac( b ) )) { \
7083 if (!is_quiet || \
7084 float ## s ## _is_signaling_nan( a ) || \
7085 float ## s ## _is_signaling_nan( b ) ) { \
7086 float_raise( float_flag_invalid STATUS_VAR); \
7087 } \
7088 return float_relation_unordered; \
7089 } \
7090 aSign = extractFloat ## s ## Sign( a ); \
7091 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 7092 av = float ## s ## _val(a); \
cd8a2533 7093 bv = float ## s ## _val(b); \
1d6bda35 7094 if ( aSign != bSign ) { \
bb98fe42 7095 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
7096 /* zero case */ \
7097 return float_relation_equal; \
7098 } else { \
7099 return 1 - (2 * aSign); \
7100 } \
7101 } else { \
f090c9d4 7102 if (av == bv) { \
1d6bda35
FB
7103 return float_relation_equal; \
7104 } else { \
f090c9d4 7105 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
7106 } \
7107 } \
7108} \
7109 \
750afe93 7110int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
7111{ \
7112 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
7113} \
7114 \
750afe93 7115int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
7116{ \
7117 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
7118}
7119
7120COMPARE(32, 0xff)
7121COMPARE(64, 0x7ff)
9ee6e8bb 7122
f6714d36
AJ
7123INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
7124 int is_quiet STATUS_PARAM )
7125{
7126 flag aSign, bSign;
7127
7128 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7129 ( extractFloatx80Frac( a )<<1 ) ) ||
7130 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7131 ( extractFloatx80Frac( b )<<1 ) )) {
7132 if (!is_quiet ||
7133 floatx80_is_signaling_nan( a ) ||
7134 floatx80_is_signaling_nan( b ) ) {
7135 float_raise( float_flag_invalid STATUS_VAR);
7136 }
7137 return float_relation_unordered;
7138 }
7139 aSign = extractFloatx80Sign( a );
7140 bSign = extractFloatx80Sign( b );
7141 if ( aSign != bSign ) {
7142
7143 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7144 ( ( a.low | b.low ) == 0 ) ) {
7145 /* zero case */
7146 return float_relation_equal;
7147 } else {
7148 return 1 - (2 * aSign);
7149 }
7150 } else {
7151 if (a.low == b.low && a.high == b.high) {
7152 return float_relation_equal;
7153 } else {
7154 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7155 }
7156 }
7157}
7158
7159int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
7160{
7161 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
7162}
7163
7164int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
7165{
7166 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
7167}
7168
1f587329
BS
7169INLINE int float128_compare_internal( float128 a, float128 b,
7170 int is_quiet STATUS_PARAM )
7171{
7172 flag aSign, bSign;
7173
7174 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7175 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7176 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7177 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7178 if (!is_quiet ||
7179 float128_is_signaling_nan( a ) ||
7180 float128_is_signaling_nan( b ) ) {
7181 float_raise( float_flag_invalid STATUS_VAR);
7182 }
7183 return float_relation_unordered;
7184 }
7185 aSign = extractFloat128Sign( a );
7186 bSign = extractFloat128Sign( b );
7187 if ( aSign != bSign ) {
7188 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7189 /* zero case */
7190 return float_relation_equal;
7191 } else {
7192 return 1 - (2 * aSign);
7193 }
7194 } else {
7195 if (a.low == b.low && a.high == b.high) {
7196 return float_relation_equal;
7197 } else {
7198 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7199 }
7200 }
7201}
7202
7203int float128_compare( float128 a, float128 b STATUS_PARAM )
7204{
7205 return float128_compare_internal(a, b, 0 STATUS_VAR);
7206}
7207
7208int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
7209{
7210 return float128_compare_internal(a, b, 1 STATUS_VAR);
7211}
7212
274f1b04
PM
7213/* min() and max() functions. These can't be implemented as
7214 * 'compare and pick one input' because that would mishandle
7215 * NaNs and +0 vs -0.
e17ab310
WN
7216 *
7217 * minnum() and maxnum() functions. These are similar to the min()
7218 * and max() functions but if one of the arguments is a QNaN and
7219 * the other is numerical then the numerical argument is returned.
7220 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7221 * and maxNum() operations. min() and max() are the typical min/max
7222 * semantics provided by many CPUs which predate that specification.
274f1b04 7223 */
e70614ea 7224#define MINMAX(s) \
274f1b04 7225INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
e17ab310 7226 int ismin, int isieee STATUS_PARAM) \
274f1b04
PM
7227{ \
7228 flag aSign, bSign; \
7229 uint ## s ## _t av, bv; \
7230 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7231 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7232 if (float ## s ## _is_any_nan(a) || \
7233 float ## s ## _is_any_nan(b)) { \
e17ab310
WN
7234 if (isieee) { \
7235 if (float ## s ## _is_quiet_nan(a) && \
7236 !float ## s ##_is_any_nan(b)) { \
7237 return b; \
7238 } else if (float ## s ## _is_quiet_nan(b) && \
7239 !float ## s ## _is_any_nan(a)) { \
7240 return a; \
7241 } \
7242 } \
274f1b04
PM
7243 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
7244 } \
7245 aSign = extractFloat ## s ## Sign(a); \
7246 bSign = extractFloat ## s ## Sign(b); \
7247 av = float ## s ## _val(a); \
7248 bv = float ## s ## _val(b); \
7249 if (aSign != bSign) { \
7250 if (ismin) { \
7251 return aSign ? a : b; \
7252 } else { \
7253 return aSign ? b : a; \
7254 } \
7255 } else { \
7256 if (ismin) { \
7257 return (aSign ^ (av < bv)) ? a : b; \
7258 } else { \
7259 return (aSign ^ (av < bv)) ? b : a; \
7260 } \
7261 } \
7262} \
7263 \
7264float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
7265{ \
e17ab310 7266 return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
274f1b04
PM
7267} \
7268 \
7269float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7270{ \
e17ab310
WN
7271 return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
7272} \
7273 \
7274float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7275{ \
7276 return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
7277} \
7278 \
7279float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7280{ \
7281 return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
274f1b04
PM
7282}
7283
e70614ea
WN
7284MINMAX(32)
7285MINMAX(64)
274f1b04
PM
7286
7287
9ee6e8bb
PB
7288/* Multiply A by 2 raised to the power N. */
7289float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7290{
7291 flag aSign;
326b9e98 7292 int16_t aExp;
bb98fe42 7293 uint32_t aSig;
9ee6e8bb 7294
37d18660 7295 a = float32_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
7296 aSig = extractFloat32Frac( a );
7297 aExp = extractFloat32Exp( a );
7298 aSign = extractFloat32Sign( a );
7299
7300 if ( aExp == 0xFF ) {
326b9e98
AJ
7301 if ( aSig ) {
7302 return propagateFloat32NaN( a, a STATUS_VAR );
7303 }
9ee6e8bb
PB
7304 return a;
7305 }
3c85c37f 7306 if (aExp != 0) {
69397542 7307 aSig |= 0x00800000;
3c85c37f 7308 } else if (aSig == 0) {
69397542 7309 return a;
3c85c37f
PM
7310 } else {
7311 aExp++;
7312 }
69397542 7313
326b9e98
AJ
7314 if (n > 0x200) {
7315 n = 0x200;
7316 } else if (n < -0x200) {
7317 n = -0x200;
7318 }
7319
69397542
PB
7320 aExp += n - 1;
7321 aSig <<= 7;
7322 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
7323}
7324
7325float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7326{
7327 flag aSign;
326b9e98 7328 int16_t aExp;
bb98fe42 7329 uint64_t aSig;
9ee6e8bb 7330
37d18660 7331 a = float64_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
7332 aSig = extractFloat64Frac( a );
7333 aExp = extractFloat64Exp( a );
7334 aSign = extractFloat64Sign( a );
7335
7336 if ( aExp == 0x7FF ) {
326b9e98
AJ
7337 if ( aSig ) {
7338 return propagateFloat64NaN( a, a STATUS_VAR );
7339 }
9ee6e8bb
PB
7340 return a;
7341 }
3c85c37f 7342 if (aExp != 0) {
69397542 7343 aSig |= LIT64( 0x0010000000000000 );
3c85c37f 7344 } else if (aSig == 0) {
69397542 7345 return a;
3c85c37f
PM
7346 } else {
7347 aExp++;
7348 }
69397542 7349
326b9e98
AJ
7350 if (n > 0x1000) {
7351 n = 0x1000;
7352 } else if (n < -0x1000) {
7353 n = -0x1000;
7354 }
7355
69397542
PB
7356 aExp += n - 1;
7357 aSig <<= 10;
7358 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
7359}
7360
9ee6e8bb
PB
7361floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7362{
7363 flag aSign;
326b9e98 7364 int32_t aExp;
bb98fe42 7365 uint64_t aSig;
9ee6e8bb
PB
7366
7367 aSig = extractFloatx80Frac( a );
7368 aExp = extractFloatx80Exp( a );
7369 aSign = extractFloatx80Sign( a );
7370
326b9e98
AJ
7371 if ( aExp == 0x7FFF ) {
7372 if ( aSig<<1 ) {
7373 return propagateFloatx80NaN( a, a STATUS_VAR );
7374 }
9ee6e8bb
PB
7375 return a;
7376 }
326b9e98 7377
3c85c37f
PM
7378 if (aExp == 0) {
7379 if (aSig == 0) {
7380 return a;
7381 }
7382 aExp++;
7383 }
69397542 7384
326b9e98
AJ
7385 if (n > 0x10000) {
7386 n = 0x10000;
7387 } else if (n < -0x10000) {
7388 n = -0x10000;
7389 }
7390
9ee6e8bb 7391 aExp += n;
69397542
PB
7392 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7393 aSign, aExp, aSig, 0 STATUS_VAR );
9ee6e8bb 7394}
9ee6e8bb 7395
9ee6e8bb
PB
7396float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7397{
7398 flag aSign;
326b9e98 7399 int32_t aExp;
bb98fe42 7400 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7401
7402 aSig1 = extractFloat128Frac1( a );
7403 aSig0 = extractFloat128Frac0( a );
7404 aExp = extractFloat128Exp( a );
7405 aSign = extractFloat128Sign( a );
7406 if ( aExp == 0x7FFF ) {
326b9e98
AJ
7407 if ( aSig0 | aSig1 ) {
7408 return propagateFloat128NaN( a, a STATUS_VAR );
7409 }
9ee6e8bb
PB
7410 return a;
7411 }
3c85c37f 7412 if (aExp != 0) {
69397542 7413 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7414 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7415 return a;
3c85c37f
PM
7416 } else {
7417 aExp++;
7418 }
69397542 7419
326b9e98
AJ
7420 if (n > 0x10000) {
7421 n = 0x10000;
7422 } else if (n < -0x10000) {
7423 n = -0x10000;
7424 }
7425
69397542
PB
7426 aExp += n - 1;
7427 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7428 STATUS_VAR );
9ee6e8bb
PB
7429
7430}