]> git.proxmox.com Git - qemu.git/blame - fpu/softfloat.c
softfloat: rename float*_eq_signaling() into float*_eq()
[qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
158142c2
FB
6
7/*============================================================================
8
9This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10Package, Release 2b.
11
12Written by John R. Hauser. This work was made possible in part by the
13International Computer Science Institute, located at Suite 600, 1947 Center
14Street, Berkeley, California 94704. Funding was partially provided by the
15National Science Foundation under grant MIP-9311980. The original version
16of this code was written as part of a project to build a fixed-point vector
17processor in collaboration with the University of California at Berkeley,
18overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20arithmetic/SoftFloat.html'.
21
22THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31Derivative works are acceptable, even for commercial purposes, so long as
32(1) the source code for the derivative work includes prominent notice that
33the work is derivative, and (2) the source code includes prominent notice with
34these four paragraphs for those parts of this code that are retained.
35
36=============================================================================*/
37
38#include "softfloat.h"
39
40/*----------------------------------------------------------------------------
41| Primitive arithmetic functions, including multi-word arithmetic, and
42| division and square root approximations. (Can be specialized to target if
43| desired.)
44*----------------------------------------------------------------------------*/
45#include "softfloat-macros.h"
46
47/*----------------------------------------------------------------------------
48| Functions and definitions to determine: (1) whether tininess for underflow
49| is detected before or after rounding by default, (2) what (if anything)
50| happens when exceptions are raised, (3) how signaling NaNs are distinguished
51| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52| are propagated from function inputs to output. These details are target-
53| specific.
54*----------------------------------------------------------------------------*/
55#include "softfloat-specialize.h"
56
57void set_float_rounding_mode(int val STATUS_PARAM)
58{
59 STATUS(float_rounding_mode) = val;
60}
61
1d6bda35
FB
62void set_float_exception_flags(int val STATUS_PARAM)
63{
64 STATUS(float_exception_flags) = val;
65}
66
158142c2
FB
67#ifdef FLOATX80
68void set_floatx80_rounding_precision(int val STATUS_PARAM)
69{
70 STATUS(floatx80_rounding_precision) = val;
71}
72#endif
73
bb4d4bb3
PM
74/*----------------------------------------------------------------------------
75| Returns the fraction bits of the half-precision floating-point value `a'.
76*----------------------------------------------------------------------------*/
77
78INLINE uint32_t extractFloat16Frac(float16 a)
79{
80 return float16_val(a) & 0x3ff;
81}
82
83/*----------------------------------------------------------------------------
84| Returns the exponent bits of the half-precision floating-point value `a'.
85*----------------------------------------------------------------------------*/
86
87INLINE int16 extractFloat16Exp(float16 a)
88{
89 return (float16_val(a) >> 10) & 0x1f;
90}
91
92/*----------------------------------------------------------------------------
93| Returns the sign bit of the single-precision floating-point value `a'.
94*----------------------------------------------------------------------------*/
95
96INLINE flag extractFloat16Sign(float16 a)
97{
98 return float16_val(a)>>15;
99}
100
158142c2
FB
101/*----------------------------------------------------------------------------
102| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
103| and 7, and returns the properly rounded 32-bit integer corresponding to the
104| input. If `zSign' is 1, the input is negated before being converted to an
105| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
106| is simply rounded to an integer, with the inexact exception raised if the
107| input cannot be represented exactly as an integer. However, if the fixed-
108| point input is too large, the invalid exception is raised and the largest
109| positive or negative integer is returned.
110*----------------------------------------------------------------------------*/
111
bb98fe42 112static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
158142c2
FB
113{
114 int8 roundingMode;
115 flag roundNearestEven;
116 int8 roundIncrement, roundBits;
117 int32 z;
118
119 roundingMode = STATUS(float_rounding_mode);
120 roundNearestEven = ( roundingMode == float_round_nearest_even );
121 roundIncrement = 0x40;
122 if ( ! roundNearestEven ) {
123 if ( roundingMode == float_round_to_zero ) {
124 roundIncrement = 0;
125 }
126 else {
127 roundIncrement = 0x7F;
128 if ( zSign ) {
129 if ( roundingMode == float_round_up ) roundIncrement = 0;
130 }
131 else {
132 if ( roundingMode == float_round_down ) roundIncrement = 0;
133 }
134 }
135 }
136 roundBits = absZ & 0x7F;
137 absZ = ( absZ + roundIncrement )>>7;
138 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
139 z = absZ;
140 if ( zSign ) z = - z;
141 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
142 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 143 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
144 }
145 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
146 return z;
147
148}
149
150/*----------------------------------------------------------------------------
151| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152| `absZ1', with binary point between bits 63 and 64 (between the input words),
153| and returns the properly rounded 64-bit integer corresponding to the input.
154| If `zSign' is 1, the input is negated before being converted to an integer.
155| Ordinarily, the fixed-point input is simply rounded to an integer, with
156| the inexact exception raised if the input cannot be represented exactly as
157| an integer. However, if the fixed-point input is too large, the invalid
158| exception is raised and the largest positive or negative integer is
159| returned.
160*----------------------------------------------------------------------------*/
161
bb98fe42 162static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
158142c2
FB
163{
164 int8 roundingMode;
165 flag roundNearestEven, increment;
166 int64 z;
167
168 roundingMode = STATUS(float_rounding_mode);
169 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 170 increment = ( (int64_t) absZ1 < 0 );
158142c2
FB
171 if ( ! roundNearestEven ) {
172 if ( roundingMode == float_round_to_zero ) {
173 increment = 0;
174 }
175 else {
176 if ( zSign ) {
177 increment = ( roundingMode == float_round_down ) && absZ1;
178 }
179 else {
180 increment = ( roundingMode == float_round_up ) && absZ1;
181 }
182 }
183 }
184 if ( increment ) {
185 ++absZ0;
186 if ( absZ0 == 0 ) goto overflow;
bb98fe42 187 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
188 }
189 z = absZ0;
190 if ( zSign ) z = - z;
191 if ( z && ( ( z < 0 ) ^ zSign ) ) {
192 overflow:
193 float_raise( float_flag_invalid STATUS_VAR);
194 return
bb98fe42 195 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
196 : LIT64( 0x7FFFFFFFFFFFFFFF );
197 }
198 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
199 return z;
200
201}
202
203/*----------------------------------------------------------------------------
204| Returns the fraction bits of the single-precision floating-point value `a'.
205*----------------------------------------------------------------------------*/
206
bb98fe42 207INLINE uint32_t extractFloat32Frac( float32 a )
158142c2
FB
208{
209
f090c9d4 210 return float32_val(a) & 0x007FFFFF;
158142c2
FB
211
212}
213
214/*----------------------------------------------------------------------------
215| Returns the exponent bits of the single-precision floating-point value `a'.
216*----------------------------------------------------------------------------*/
217
218INLINE int16 extractFloat32Exp( float32 a )
219{
220
f090c9d4 221 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
222
223}
224
225/*----------------------------------------------------------------------------
226| Returns the sign bit of the single-precision floating-point value `a'.
227*----------------------------------------------------------------------------*/
228
229INLINE flag extractFloat32Sign( float32 a )
230{
231
f090c9d4 232 return float32_val(a)>>31;
158142c2
FB
233
234}
235
37d18660
PM
236/*----------------------------------------------------------------------------
237| If `a' is denormal and we are in flush-to-zero mode then set the
238| input-denormal exception and return zero. Otherwise just return the value.
239*----------------------------------------------------------------------------*/
240static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
241{
242 if (STATUS(flush_inputs_to_zero)) {
243 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
244 float_raise(float_flag_input_denormal STATUS_VAR);
245 return make_float32(float32_val(a) & 0x80000000);
246 }
247 }
248 return a;
249}
250
158142c2
FB
251/*----------------------------------------------------------------------------
252| Normalizes the subnormal single-precision floating-point value represented
253| by the denormalized significand `aSig'. The normalized exponent and
254| significand are stored at the locations pointed to by `zExpPtr' and
255| `zSigPtr', respectively.
256*----------------------------------------------------------------------------*/
257
258static void
bb98fe42 259 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
158142c2
FB
260{
261 int8 shiftCount;
262
263 shiftCount = countLeadingZeros32( aSig ) - 8;
264 *zSigPtr = aSig<<shiftCount;
265 *zExpPtr = 1 - shiftCount;
266
267}
268
269/*----------------------------------------------------------------------------
270| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271| single-precision floating-point value, returning the result. After being
272| shifted into the proper positions, the three fields are simply added
273| together to form the result. This means that any integer portion of `zSig'
274| will be added into the exponent. Since a properly normalized significand
275| will have an integer portion equal to 1, the `zExp' input should be 1 less
276| than the desired result exponent whenever `zSig' is a complete, normalized
277| significand.
278*----------------------------------------------------------------------------*/
279
bb98fe42 280INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
158142c2
FB
281{
282
f090c9d4 283 return make_float32(
bb98fe42 284 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
285
286}
287
288/*----------------------------------------------------------------------------
289| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290| and significand `zSig', and returns the proper single-precision floating-
291| point value corresponding to the abstract input. Ordinarily, the abstract
292| value is simply rounded and packed into the single-precision format, with
293| the inexact exception raised if the abstract input cannot be represented
294| exactly. However, if the abstract value is too large, the overflow and
295| inexact exceptions are raised and an infinity or maximal finite value is
296| returned. If the abstract value is too small, the input value is rounded to
297| a subnormal number, and the underflow and inexact exceptions are raised if
298| the abstract input cannot be represented exactly as a subnormal single-
299| precision floating-point number.
300| The input significand `zSig' has its binary point between bits 30
301| and 29, which is 7 bits to the left of the usual location. This shifted
302| significand must be normalized or smaller. If `zSig' is not normalized,
303| `zExp' must be 0; in that case, the result returned is a subnormal number,
304| and it must not require rounding. In the usual case that `zSig' is
305| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306| The handling of underflow and overflow follows the IEC/IEEE Standard for
307| Binary Floating-Point Arithmetic.
308*----------------------------------------------------------------------------*/
309
bb98fe42 310static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
311{
312 int8 roundingMode;
313 flag roundNearestEven;
314 int8 roundIncrement, roundBits;
315 flag isTiny;
316
317 roundingMode = STATUS(float_rounding_mode);
318 roundNearestEven = ( roundingMode == float_round_nearest_even );
319 roundIncrement = 0x40;
320 if ( ! roundNearestEven ) {
321 if ( roundingMode == float_round_to_zero ) {
322 roundIncrement = 0;
323 }
324 else {
325 roundIncrement = 0x7F;
326 if ( zSign ) {
327 if ( roundingMode == float_round_up ) roundIncrement = 0;
328 }
329 else {
330 if ( roundingMode == float_round_down ) roundIncrement = 0;
331 }
332 }
333 }
334 roundBits = zSig & 0x7F;
bb98fe42 335 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
336 if ( ( 0xFD < zExp )
337 || ( ( zExp == 0xFD )
bb98fe42 338 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
339 ) {
340 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 341 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
342 }
343 if ( zExp < 0 ) {
fe76d976 344 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
158142c2
FB
345 isTiny =
346 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
347 || ( zExp < -1 )
348 || ( zSig + roundIncrement < 0x80000000 );
349 shift32RightJamming( zSig, - zExp, &zSig );
350 zExp = 0;
351 roundBits = zSig & 0x7F;
352 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
353 }
354 }
355 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
356 zSig = ( zSig + roundIncrement )>>7;
357 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
358 if ( zSig == 0 ) zExp = 0;
359 return packFloat32( zSign, zExp, zSig );
360
361}
362
363/*----------------------------------------------------------------------------
364| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
365| and significand `zSig', and returns the proper single-precision floating-
366| point value corresponding to the abstract input. This routine is just like
367| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
368| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
369| floating-point exponent.
370*----------------------------------------------------------------------------*/
371
372static float32
bb98fe42 373 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
374{
375 int8 shiftCount;
376
377 shiftCount = countLeadingZeros32( zSig ) - 1;
378 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
379
380}
381
382/*----------------------------------------------------------------------------
383| Returns the fraction bits of the double-precision floating-point value `a'.
384*----------------------------------------------------------------------------*/
385
bb98fe42 386INLINE uint64_t extractFloat64Frac( float64 a )
158142c2
FB
387{
388
f090c9d4 389 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
390
391}
392
393/*----------------------------------------------------------------------------
394| Returns the exponent bits of the double-precision floating-point value `a'.
395*----------------------------------------------------------------------------*/
396
397INLINE int16 extractFloat64Exp( float64 a )
398{
399
f090c9d4 400 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
401
402}
403
404/*----------------------------------------------------------------------------
405| Returns the sign bit of the double-precision floating-point value `a'.
406*----------------------------------------------------------------------------*/
407
408INLINE flag extractFloat64Sign( float64 a )
409{
410
f090c9d4 411 return float64_val(a)>>63;
158142c2
FB
412
413}
414
37d18660
PM
415/*----------------------------------------------------------------------------
416| If `a' is denormal and we are in flush-to-zero mode then set the
417| input-denormal exception and return zero. Otherwise just return the value.
418*----------------------------------------------------------------------------*/
419static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
420{
421 if (STATUS(flush_inputs_to_zero)) {
422 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
423 float_raise(float_flag_input_denormal STATUS_VAR);
424 return make_float64(float64_val(a) & (1ULL << 63));
425 }
426 }
427 return a;
428}
429
158142c2
FB
430/*----------------------------------------------------------------------------
431| Normalizes the subnormal double-precision floating-point value represented
432| by the denormalized significand `aSig'. The normalized exponent and
433| significand are stored at the locations pointed to by `zExpPtr' and
434| `zSigPtr', respectively.
435*----------------------------------------------------------------------------*/
436
437static void
bb98fe42 438 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
439{
440 int8 shiftCount;
441
442 shiftCount = countLeadingZeros64( aSig ) - 11;
443 *zSigPtr = aSig<<shiftCount;
444 *zExpPtr = 1 - shiftCount;
445
446}
447
448/*----------------------------------------------------------------------------
449| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
450| double-precision floating-point value, returning the result. After being
451| shifted into the proper positions, the three fields are simply added
452| together to form the result. This means that any integer portion of `zSig'
453| will be added into the exponent. Since a properly normalized significand
454| will have an integer portion equal to 1, the `zExp' input should be 1 less
455| than the desired result exponent whenever `zSig' is a complete, normalized
456| significand.
457*----------------------------------------------------------------------------*/
458
bb98fe42 459INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
158142c2
FB
460{
461
f090c9d4 462 return make_float64(
bb98fe42 463 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
464
465}
466
467/*----------------------------------------------------------------------------
468| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
469| and significand `zSig', and returns the proper double-precision floating-
470| point value corresponding to the abstract input. Ordinarily, the abstract
471| value is simply rounded and packed into the double-precision format, with
472| the inexact exception raised if the abstract input cannot be represented
473| exactly. However, if the abstract value is too large, the overflow and
474| inexact exceptions are raised and an infinity or maximal finite value is
475| returned. If the abstract value is too small, the input value is rounded
476| to a subnormal number, and the underflow and inexact exceptions are raised
477| if the abstract input cannot be represented exactly as a subnormal double-
478| precision floating-point number.
479| The input significand `zSig' has its binary point between bits 62
480| and 61, which is 10 bits to the left of the usual location. This shifted
481| significand must be normalized or smaller. If `zSig' is not normalized,
482| `zExp' must be 0; in that case, the result returned is a subnormal number,
483| and it must not require rounding. In the usual case that `zSig' is
484| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
485| The handling of underflow and overflow follows the IEC/IEEE Standard for
486| Binary Floating-Point Arithmetic.
487*----------------------------------------------------------------------------*/
488
bb98fe42 489static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
490{
491 int8 roundingMode;
492 flag roundNearestEven;
493 int16 roundIncrement, roundBits;
494 flag isTiny;
495
496 roundingMode = STATUS(float_rounding_mode);
497 roundNearestEven = ( roundingMode == float_round_nearest_even );
498 roundIncrement = 0x200;
499 if ( ! roundNearestEven ) {
500 if ( roundingMode == float_round_to_zero ) {
501 roundIncrement = 0;
502 }
503 else {
504 roundIncrement = 0x3FF;
505 if ( zSign ) {
506 if ( roundingMode == float_round_up ) roundIncrement = 0;
507 }
508 else {
509 if ( roundingMode == float_round_down ) roundIncrement = 0;
510 }
511 }
512 }
513 roundBits = zSig & 0x3FF;
bb98fe42 514 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
515 if ( ( 0x7FD < zExp )
516 || ( ( zExp == 0x7FD )
bb98fe42 517 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
518 ) {
519 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 520 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
521 }
522 if ( zExp < 0 ) {
fe76d976 523 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
158142c2
FB
524 isTiny =
525 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
526 || ( zExp < -1 )
527 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
528 shift64RightJamming( zSig, - zExp, &zSig );
529 zExp = 0;
530 roundBits = zSig & 0x3FF;
531 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
532 }
533 }
534 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
535 zSig = ( zSig + roundIncrement )>>10;
536 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
537 if ( zSig == 0 ) zExp = 0;
538 return packFloat64( zSign, zExp, zSig );
539
540}
541
542/*----------------------------------------------------------------------------
543| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
544| and significand `zSig', and returns the proper double-precision floating-
545| point value corresponding to the abstract input. This routine is just like
546| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
547| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
548| floating-point exponent.
549*----------------------------------------------------------------------------*/
550
551static float64
bb98fe42 552 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
553{
554 int8 shiftCount;
555
556 shiftCount = countLeadingZeros64( zSig ) - 1;
557 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
558
559}
560
561#ifdef FLOATX80
562
563/*----------------------------------------------------------------------------
564| Returns the fraction bits of the extended double-precision floating-point
565| value `a'.
566*----------------------------------------------------------------------------*/
567
bb98fe42 568INLINE uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
569{
570
571 return a.low;
572
573}
574
575/*----------------------------------------------------------------------------
576| Returns the exponent bits of the extended double-precision floating-point
577| value `a'.
578*----------------------------------------------------------------------------*/
579
580INLINE int32 extractFloatx80Exp( floatx80 a )
581{
582
583 return a.high & 0x7FFF;
584
585}
586
587/*----------------------------------------------------------------------------
588| Returns the sign bit of the extended double-precision floating-point value
589| `a'.
590*----------------------------------------------------------------------------*/
591
592INLINE flag extractFloatx80Sign( floatx80 a )
593{
594
595 return a.high>>15;
596
597}
598
599/*----------------------------------------------------------------------------
600| Normalizes the subnormal extended double-precision floating-point value
601| represented by the denormalized significand `aSig'. The normalized exponent
602| and significand are stored at the locations pointed to by `zExpPtr' and
603| `zSigPtr', respectively.
604*----------------------------------------------------------------------------*/
605
606static void
bb98fe42 607 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
608{
609 int8 shiftCount;
610
611 shiftCount = countLeadingZeros64( aSig );
612 *zSigPtr = aSig<<shiftCount;
613 *zExpPtr = 1 - shiftCount;
614
615}
616
617/*----------------------------------------------------------------------------
618| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
619| extended double-precision floating-point value, returning the result.
620*----------------------------------------------------------------------------*/
621
bb98fe42 622INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
158142c2
FB
623{
624 floatx80 z;
625
626 z.low = zSig;
bb98fe42 627 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
628 return z;
629
630}
631
632/*----------------------------------------------------------------------------
633| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
634| and extended significand formed by the concatenation of `zSig0' and `zSig1',
635| and returns the proper extended double-precision floating-point value
636| corresponding to the abstract input. Ordinarily, the abstract value is
637| rounded and packed into the extended double-precision format, with the
638| inexact exception raised if the abstract input cannot be represented
639| exactly. However, if the abstract value is too large, the overflow and
640| inexact exceptions are raised and an infinity or maximal finite value is
641| returned. If the abstract value is too small, the input value is rounded to
642| a subnormal number, and the underflow and inexact exceptions are raised if
643| the abstract input cannot be represented exactly as a subnormal extended
644| double-precision floating-point number.
645| If `roundingPrecision' is 32 or 64, the result is rounded to the same
646| number of bits as single or double precision, respectively. Otherwise, the
647| result is rounded to the full precision of the extended double-precision
648| format.
649| The input significand must be normalized or smaller. If the input
650| significand is not normalized, `zExp' must be 0; in that case, the result
651| returned is a subnormal number, and it must not require rounding. The
652| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
653| Floating-Point Arithmetic.
654*----------------------------------------------------------------------------*/
655
656static floatx80
657 roundAndPackFloatx80(
bb98fe42 658 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
659 STATUS_PARAM)
660{
661 int8 roundingMode;
662 flag roundNearestEven, increment, isTiny;
663 int64 roundIncrement, roundMask, roundBits;
664
665 roundingMode = STATUS(float_rounding_mode);
666 roundNearestEven = ( roundingMode == float_round_nearest_even );
667 if ( roundingPrecision == 80 ) goto precision80;
668 if ( roundingPrecision == 64 ) {
669 roundIncrement = LIT64( 0x0000000000000400 );
670 roundMask = LIT64( 0x00000000000007FF );
671 }
672 else if ( roundingPrecision == 32 ) {
673 roundIncrement = LIT64( 0x0000008000000000 );
674 roundMask = LIT64( 0x000000FFFFFFFFFF );
675 }
676 else {
677 goto precision80;
678 }
679 zSig0 |= ( zSig1 != 0 );
680 if ( ! roundNearestEven ) {
681 if ( roundingMode == float_round_to_zero ) {
682 roundIncrement = 0;
683 }
684 else {
685 roundIncrement = roundMask;
686 if ( zSign ) {
687 if ( roundingMode == float_round_up ) roundIncrement = 0;
688 }
689 else {
690 if ( roundingMode == float_round_down ) roundIncrement = 0;
691 }
692 }
693 }
694 roundBits = zSig0 & roundMask;
bb98fe42 695 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
696 if ( ( 0x7FFE < zExp )
697 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
698 ) {
699 goto overflow;
700 }
701 if ( zExp <= 0 ) {
fe76d976 702 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
158142c2
FB
703 isTiny =
704 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
705 || ( zExp < 0 )
706 || ( zSig0 <= zSig0 + roundIncrement );
707 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
708 zExp = 0;
709 roundBits = zSig0 & roundMask;
710 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
711 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
712 zSig0 += roundIncrement;
bb98fe42 713 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
714 roundIncrement = roundMask + 1;
715 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
716 roundMask |= roundIncrement;
717 }
718 zSig0 &= ~ roundMask;
719 return packFloatx80( zSign, zExp, zSig0 );
720 }
721 }
722 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
723 zSig0 += roundIncrement;
724 if ( zSig0 < roundIncrement ) {
725 ++zExp;
726 zSig0 = LIT64( 0x8000000000000000 );
727 }
728 roundIncrement = roundMask + 1;
729 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
730 roundMask |= roundIncrement;
731 }
732 zSig0 &= ~ roundMask;
733 if ( zSig0 == 0 ) zExp = 0;
734 return packFloatx80( zSign, zExp, zSig0 );
735 precision80:
bb98fe42 736 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
737 if ( ! roundNearestEven ) {
738 if ( roundingMode == float_round_to_zero ) {
739 increment = 0;
740 }
741 else {
742 if ( zSign ) {
743 increment = ( roundingMode == float_round_down ) && zSig1;
744 }
745 else {
746 increment = ( roundingMode == float_round_up ) && zSig1;
747 }
748 }
749 }
bb98fe42 750 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
751 if ( ( 0x7FFE < zExp )
752 || ( ( zExp == 0x7FFE )
753 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
754 && increment
755 )
756 ) {
757 roundMask = 0;
758 overflow:
759 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
760 if ( ( roundingMode == float_round_to_zero )
761 || ( zSign && ( roundingMode == float_round_up ) )
762 || ( ! zSign && ( roundingMode == float_round_down ) )
763 ) {
764 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
765 }
766 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
767 }
768 if ( zExp <= 0 ) {
769 isTiny =
770 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
771 || ( zExp < 0 )
772 || ! increment
773 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
774 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
775 zExp = 0;
776 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
777 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
778 if ( roundNearestEven ) {
bb98fe42 779 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
780 }
781 else {
782 if ( zSign ) {
783 increment = ( roundingMode == float_round_down ) && zSig1;
784 }
785 else {
786 increment = ( roundingMode == float_round_up ) && zSig1;
787 }
788 }
789 if ( increment ) {
790 ++zSig0;
791 zSig0 &=
bb98fe42
AF
792 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
793 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
794 }
795 return packFloatx80( zSign, zExp, zSig0 );
796 }
797 }
798 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
799 if ( increment ) {
800 ++zSig0;
801 if ( zSig0 == 0 ) {
802 ++zExp;
803 zSig0 = LIT64( 0x8000000000000000 );
804 }
805 else {
bb98fe42 806 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
807 }
808 }
809 else {
810 if ( zSig0 == 0 ) zExp = 0;
811 }
812 return packFloatx80( zSign, zExp, zSig0 );
813
814}
815
816/*----------------------------------------------------------------------------
817| Takes an abstract floating-point value having sign `zSign', exponent
818| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
819| and returns the proper extended double-precision floating-point value
820| corresponding to the abstract input. This routine is just like
821| `roundAndPackFloatx80' except that the input significand does not have to be
822| normalized.
823*----------------------------------------------------------------------------*/
824
825static floatx80
826 normalizeRoundAndPackFloatx80(
bb98fe42 827 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
828 STATUS_PARAM)
829{
830 int8 shiftCount;
831
832 if ( zSig0 == 0 ) {
833 zSig0 = zSig1;
834 zSig1 = 0;
835 zExp -= 64;
836 }
837 shiftCount = countLeadingZeros64( zSig0 );
838 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
839 zExp -= shiftCount;
840 return
841 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
842
843}
844
845#endif
846
847#ifdef FLOAT128
848
849/*----------------------------------------------------------------------------
850| Returns the least-significant 64 fraction bits of the quadruple-precision
851| floating-point value `a'.
852*----------------------------------------------------------------------------*/
853
bb98fe42 854INLINE uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
855{
856
857 return a.low;
858
859}
860
861/*----------------------------------------------------------------------------
862| Returns the most-significant 48 fraction bits of the quadruple-precision
863| floating-point value `a'.
864*----------------------------------------------------------------------------*/
865
bb98fe42 866INLINE uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
867{
868
869 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
870
871}
872
873/*----------------------------------------------------------------------------
874| Returns the exponent bits of the quadruple-precision floating-point value
875| `a'.
876*----------------------------------------------------------------------------*/
877
878INLINE int32 extractFloat128Exp( float128 a )
879{
880
881 return ( a.high>>48 ) & 0x7FFF;
882
883}
884
885/*----------------------------------------------------------------------------
886| Returns the sign bit of the quadruple-precision floating-point value `a'.
887*----------------------------------------------------------------------------*/
888
889INLINE flag extractFloat128Sign( float128 a )
890{
891
892 return a.high>>63;
893
894}
895
896/*----------------------------------------------------------------------------
897| Normalizes the subnormal quadruple-precision floating-point value
898| represented by the denormalized significand formed by the concatenation of
899| `aSig0' and `aSig1'. The normalized exponent is stored at the location
900| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
901| significand are stored at the location pointed to by `zSig0Ptr', and the
902| least significant 64 bits of the normalized significand are stored at the
903| location pointed to by `zSig1Ptr'.
904*----------------------------------------------------------------------------*/
905
906static void
907 normalizeFloat128Subnormal(
bb98fe42
AF
908 uint64_t aSig0,
909 uint64_t aSig1,
158142c2 910 int32 *zExpPtr,
bb98fe42
AF
911 uint64_t *zSig0Ptr,
912 uint64_t *zSig1Ptr
158142c2
FB
913 )
914{
915 int8 shiftCount;
916
917 if ( aSig0 == 0 ) {
918 shiftCount = countLeadingZeros64( aSig1 ) - 15;
919 if ( shiftCount < 0 ) {
920 *zSig0Ptr = aSig1>>( - shiftCount );
921 *zSig1Ptr = aSig1<<( shiftCount & 63 );
922 }
923 else {
924 *zSig0Ptr = aSig1<<shiftCount;
925 *zSig1Ptr = 0;
926 }
927 *zExpPtr = - shiftCount - 63;
928 }
929 else {
930 shiftCount = countLeadingZeros64( aSig0 ) - 15;
931 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932 *zExpPtr = 1 - shiftCount;
933 }
934
935}
936
937/*----------------------------------------------------------------------------
938| Packs the sign `zSign', the exponent `zExp', and the significand formed
939| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
940| floating-point value, returning the result. After being shifted into the
941| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
942| added together to form the most significant 32 bits of the result. This
943| means that any integer portion of `zSig0' will be added into the exponent.
944| Since a properly normalized significand will have an integer portion equal
945| to 1, the `zExp' input should be 1 less than the desired result exponent
946| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
947| significand.
948*----------------------------------------------------------------------------*/
949
950INLINE float128
bb98fe42 951 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
952{
953 float128 z;
954
955 z.low = zSig1;
bb98fe42 956 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
957 return z;
958
959}
960
961/*----------------------------------------------------------------------------
962| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963| and extended significand formed by the concatenation of `zSig0', `zSig1',
964| and `zSig2', and returns the proper quadruple-precision floating-point value
965| corresponding to the abstract input. Ordinarily, the abstract value is
966| simply rounded and packed into the quadruple-precision format, with the
967| inexact exception raised if the abstract input cannot be represented
968| exactly. However, if the abstract value is too large, the overflow and
969| inexact exceptions are raised and an infinity or maximal finite value is
970| returned. If the abstract value is too small, the input value is rounded to
971| a subnormal number, and the underflow and inexact exceptions are raised if
972| the abstract input cannot be represented exactly as a subnormal quadruple-
973| precision floating-point number.
974| The input significand must be normalized or smaller. If the input
975| significand is not normalized, `zExp' must be 0; in that case, the result
976| returned is a subnormal number, and it must not require rounding. In the
977| usual case that the input significand is normalized, `zExp' must be 1 less
978| than the ``true'' floating-point exponent. The handling of underflow and
979| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980*----------------------------------------------------------------------------*/
981
982static float128
983 roundAndPackFloat128(
bb98fe42 984 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
158142c2
FB
985{
986 int8 roundingMode;
987 flag roundNearestEven, increment, isTiny;
988
989 roundingMode = STATUS(float_rounding_mode);
990 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 991 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
992 if ( ! roundNearestEven ) {
993 if ( roundingMode == float_round_to_zero ) {
994 increment = 0;
995 }
996 else {
997 if ( zSign ) {
998 increment = ( roundingMode == float_round_down ) && zSig2;
999 }
1000 else {
1001 increment = ( roundingMode == float_round_up ) && zSig2;
1002 }
1003 }
1004 }
bb98fe42 1005 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1006 if ( ( 0x7FFD < zExp )
1007 || ( ( zExp == 0x7FFD )
1008 && eq128(
1009 LIT64( 0x0001FFFFFFFFFFFF ),
1010 LIT64( 0xFFFFFFFFFFFFFFFF ),
1011 zSig0,
1012 zSig1
1013 )
1014 && increment
1015 )
1016 ) {
1017 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1018 if ( ( roundingMode == float_round_to_zero )
1019 || ( zSign && ( roundingMode == float_round_up ) )
1020 || ( ! zSign && ( roundingMode == float_round_down ) )
1021 ) {
1022 return
1023 packFloat128(
1024 zSign,
1025 0x7FFE,
1026 LIT64( 0x0000FFFFFFFFFFFF ),
1027 LIT64( 0xFFFFFFFFFFFFFFFF )
1028 );
1029 }
1030 return packFloat128( zSign, 0x7FFF, 0, 0 );
1031 }
1032 if ( zExp < 0 ) {
fe76d976 1033 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
158142c2
FB
1034 isTiny =
1035 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1036 || ( zExp < -1 )
1037 || ! increment
1038 || lt128(
1039 zSig0,
1040 zSig1,
1041 LIT64( 0x0001FFFFFFFFFFFF ),
1042 LIT64( 0xFFFFFFFFFFFFFFFF )
1043 );
1044 shift128ExtraRightJamming(
1045 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046 zExp = 0;
1047 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1048 if ( roundNearestEven ) {
bb98fe42 1049 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
1050 }
1051 else {
1052 if ( zSign ) {
1053 increment = ( roundingMode == float_round_down ) && zSig2;
1054 }
1055 else {
1056 increment = ( roundingMode == float_round_up ) && zSig2;
1057 }
1058 }
1059 }
1060 }
1061 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1062 if ( increment ) {
1063 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1064 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1065 }
1066 else {
1067 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1068 }
1069 return packFloat128( zSign, zExp, zSig0, zSig1 );
1070
1071}
1072
1073/*----------------------------------------------------------------------------
1074| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075| and significand formed by the concatenation of `zSig0' and `zSig1', and
1076| returns the proper quadruple-precision floating-point value corresponding
1077| to the abstract input. This routine is just like `roundAndPackFloat128'
1078| except that the input significand has fewer bits and does not have to be
1079| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1080| point exponent.
1081*----------------------------------------------------------------------------*/
1082
1083static float128
1084 normalizeRoundAndPackFloat128(
bb98fe42 1085 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
158142c2
FB
1086{
1087 int8 shiftCount;
bb98fe42 1088 uint64_t zSig2;
158142c2
FB
1089
1090 if ( zSig0 == 0 ) {
1091 zSig0 = zSig1;
1092 zSig1 = 0;
1093 zExp -= 64;
1094 }
1095 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096 if ( 0 <= shiftCount ) {
1097 zSig2 = 0;
1098 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099 }
1100 else {
1101 shift128ExtraRightJamming(
1102 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103 }
1104 zExp -= shiftCount;
1105 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1106
1107}
1108
1109#endif
1110
1111/*----------------------------------------------------------------------------
1112| Returns the result of converting the 32-bit two's complement integer `a'
1113| to the single-precision floating-point format. The conversion is performed
1114| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115*----------------------------------------------------------------------------*/
1116
1117float32 int32_to_float32( int32 a STATUS_PARAM )
1118{
1119 flag zSign;
1120
f090c9d4 1121 if ( a == 0 ) return float32_zero;
bb98fe42 1122 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2
FB
1123 zSign = ( a < 0 );
1124 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1125
1126}
1127
1128/*----------------------------------------------------------------------------
1129| Returns the result of converting the 32-bit two's complement integer `a'
1130| to the double-precision floating-point format. The conversion is performed
1131| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132*----------------------------------------------------------------------------*/
1133
1134float64 int32_to_float64( int32 a STATUS_PARAM )
1135{
1136 flag zSign;
1137 uint32 absA;
1138 int8 shiftCount;
bb98fe42 1139 uint64_t zSig;
158142c2 1140
f090c9d4 1141 if ( a == 0 ) return float64_zero;
158142c2
FB
1142 zSign = ( a < 0 );
1143 absA = zSign ? - a : a;
1144 shiftCount = countLeadingZeros32( absA ) + 21;
1145 zSig = absA;
1146 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1147
1148}
1149
1150#ifdef FLOATX80
1151
1152/*----------------------------------------------------------------------------
1153| Returns the result of converting the 32-bit two's complement integer `a'
1154| to the extended double-precision floating-point format. The conversion
1155| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156| Arithmetic.
1157*----------------------------------------------------------------------------*/
1158
1159floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160{
1161 flag zSign;
1162 uint32 absA;
1163 int8 shiftCount;
bb98fe42 1164 uint64_t zSig;
158142c2
FB
1165
1166 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167 zSign = ( a < 0 );
1168 absA = zSign ? - a : a;
1169 shiftCount = countLeadingZeros32( absA ) + 32;
1170 zSig = absA;
1171 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172
1173}
1174
1175#endif
1176
1177#ifdef FLOAT128
1178
1179/*----------------------------------------------------------------------------
1180| Returns the result of converting the 32-bit two's complement integer `a' to
1181| the quadruple-precision floating-point format. The conversion is performed
1182| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1183*----------------------------------------------------------------------------*/
1184
1185float128 int32_to_float128( int32 a STATUS_PARAM )
1186{
1187 flag zSign;
1188 uint32 absA;
1189 int8 shiftCount;
bb98fe42 1190 uint64_t zSig0;
158142c2
FB
1191
1192 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1193 zSign = ( a < 0 );
1194 absA = zSign ? - a : a;
1195 shiftCount = countLeadingZeros32( absA ) + 17;
1196 zSig0 = absA;
1197 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1198
1199}
1200
1201#endif
1202
1203/*----------------------------------------------------------------------------
1204| Returns the result of converting the 64-bit two's complement integer `a'
1205| to the single-precision floating-point format. The conversion is performed
1206| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1207*----------------------------------------------------------------------------*/
1208
1209float32 int64_to_float32( int64 a STATUS_PARAM )
1210{
1211 flag zSign;
1212 uint64 absA;
1213 int8 shiftCount;
1214
f090c9d4 1215 if ( a == 0 ) return float32_zero;
158142c2
FB
1216 zSign = ( a < 0 );
1217 absA = zSign ? - a : a;
1218 shiftCount = countLeadingZeros64( absA ) - 40;
1219 if ( 0 <= shiftCount ) {
1220 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1221 }
1222 else {
1223 shiftCount += 7;
1224 if ( shiftCount < 0 ) {
1225 shift64RightJamming( absA, - shiftCount, &absA );
1226 }
1227 else {
1228 absA <<= shiftCount;
1229 }
1230 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1231 }
1232
1233}
1234
3430b0be 1235float32 uint64_to_float32( uint64 a STATUS_PARAM )
75d62a58
JM
1236{
1237 int8 shiftCount;
1238
f090c9d4 1239 if ( a == 0 ) return float32_zero;
75d62a58
JM
1240 shiftCount = countLeadingZeros64( a ) - 40;
1241 if ( 0 <= shiftCount ) {
1242 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1243 }
1244 else {
1245 shiftCount += 7;
1246 if ( shiftCount < 0 ) {
1247 shift64RightJamming( a, - shiftCount, &a );
1248 }
1249 else {
1250 a <<= shiftCount;
1251 }
1252 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1253 }
1254}
1255
158142c2
FB
1256/*----------------------------------------------------------------------------
1257| Returns the result of converting the 64-bit two's complement integer `a'
1258| to the double-precision floating-point format. The conversion is performed
1259| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260*----------------------------------------------------------------------------*/
1261
1262float64 int64_to_float64( int64 a STATUS_PARAM )
1263{
1264 flag zSign;
1265
f090c9d4 1266 if ( a == 0 ) return float64_zero;
bb98fe42 1267 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1268 return packFloat64( 1, 0x43E, 0 );
1269 }
1270 zSign = ( a < 0 );
1271 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1272
1273}
1274
75d62a58
JM
1275float64 uint64_to_float64( uint64 a STATUS_PARAM )
1276{
f090c9d4 1277 if ( a == 0 ) return float64_zero;
75d62a58
JM
1278 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1279
1280}
1281
158142c2
FB
1282#ifdef FLOATX80
1283
1284/*----------------------------------------------------------------------------
1285| Returns the result of converting the 64-bit two's complement integer `a'
1286| to the extended double-precision floating-point format. The conversion
1287| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1288| Arithmetic.
1289*----------------------------------------------------------------------------*/
1290
1291floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1292{
1293 flag zSign;
1294 uint64 absA;
1295 int8 shiftCount;
1296
1297 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1298 zSign = ( a < 0 );
1299 absA = zSign ? - a : a;
1300 shiftCount = countLeadingZeros64( absA );
1301 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1302
1303}
1304
1305#endif
1306
1307#ifdef FLOAT128
1308
1309/*----------------------------------------------------------------------------
1310| Returns the result of converting the 64-bit two's complement integer `a' to
1311| the quadruple-precision floating-point format. The conversion is performed
1312| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313*----------------------------------------------------------------------------*/
1314
1315float128 int64_to_float128( int64 a STATUS_PARAM )
1316{
1317 flag zSign;
1318 uint64 absA;
1319 int8 shiftCount;
1320 int32 zExp;
bb98fe42 1321 uint64_t zSig0, zSig1;
158142c2
FB
1322
1323 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324 zSign = ( a < 0 );
1325 absA = zSign ? - a : a;
1326 shiftCount = countLeadingZeros64( absA ) + 49;
1327 zExp = 0x406E - shiftCount;
1328 if ( 64 <= shiftCount ) {
1329 zSig1 = 0;
1330 zSig0 = absA;
1331 shiftCount -= 64;
1332 }
1333 else {
1334 zSig1 = absA;
1335 zSig0 = 0;
1336 }
1337 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338 return packFloat128( zSign, zExp, zSig0, zSig1 );
1339
1340}
1341
1342#endif
1343
1344/*----------------------------------------------------------------------------
1345| Returns the result of converting the single-precision floating-point value
1346| `a' to the 32-bit two's complement integer format. The conversion is
1347| performed according to the IEC/IEEE Standard for Binary Floating-Point
1348| Arithmetic---which means in particular that the conversion is rounded
1349| according to the current rounding mode. If `a' is a NaN, the largest
1350| positive integer is returned. Otherwise, if the conversion overflows, the
1351| largest integer with the same sign as `a' is returned.
1352*----------------------------------------------------------------------------*/
1353
1354int32 float32_to_int32( float32 a STATUS_PARAM )
1355{
1356 flag aSign;
1357 int16 aExp, shiftCount;
bb98fe42
AF
1358 uint32_t aSig;
1359 uint64_t aSig64;
158142c2 1360
37d18660 1361 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1362 aSig = extractFloat32Frac( a );
1363 aExp = extractFloat32Exp( a );
1364 aSign = extractFloat32Sign( a );
1365 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1366 if ( aExp ) aSig |= 0x00800000;
1367 shiftCount = 0xAF - aExp;
1368 aSig64 = aSig;
1369 aSig64 <<= 32;
1370 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1371 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1372
1373}
1374
1375/*----------------------------------------------------------------------------
1376| Returns the result of converting the single-precision floating-point value
1377| `a' to the 32-bit two's complement integer format. The conversion is
1378| performed according to the IEC/IEEE Standard for Binary Floating-Point
1379| Arithmetic, except that the conversion is always rounded toward zero.
1380| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1381| the conversion overflows, the largest integer with the same sign as `a' is
1382| returned.
1383*----------------------------------------------------------------------------*/
1384
1385int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1386{
1387 flag aSign;
1388 int16 aExp, shiftCount;
bb98fe42 1389 uint32_t aSig;
158142c2 1390 int32 z;
37d18660 1391 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1392
1393 aSig = extractFloat32Frac( a );
1394 aExp = extractFloat32Exp( a );
1395 aSign = extractFloat32Sign( a );
1396 shiftCount = aExp - 0x9E;
1397 if ( 0 <= shiftCount ) {
f090c9d4 1398 if ( float32_val(a) != 0xCF000000 ) {
158142c2
FB
1399 float_raise( float_flag_invalid STATUS_VAR);
1400 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1401 }
bb98fe42 1402 return (int32_t) 0x80000000;
158142c2
FB
1403 }
1404 else if ( aExp <= 0x7E ) {
1405 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1406 return 0;
1407 }
1408 aSig = ( aSig | 0x00800000 )<<8;
1409 z = aSig>>( - shiftCount );
bb98fe42 1410 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
158142c2
FB
1411 STATUS(float_exception_flags) |= float_flag_inexact;
1412 }
1413 if ( aSign ) z = - z;
1414 return z;
1415
1416}
1417
cbcef455
PM
1418/*----------------------------------------------------------------------------
1419| Returns the result of converting the single-precision floating-point value
1420| `a' to the 16-bit two's complement integer format. The conversion is
1421| performed according to the IEC/IEEE Standard for Binary Floating-Point
1422| Arithmetic, except that the conversion is always rounded toward zero.
1423| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1424| the conversion overflows, the largest integer with the same sign as `a' is
1425| returned.
1426*----------------------------------------------------------------------------*/
1427
1428int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1429{
1430 flag aSign;
1431 int16 aExp, shiftCount;
bb98fe42 1432 uint32_t aSig;
cbcef455
PM
1433 int32 z;
1434
1435 aSig = extractFloat32Frac( a );
1436 aExp = extractFloat32Exp( a );
1437 aSign = extractFloat32Sign( a );
1438 shiftCount = aExp - 0x8E;
1439 if ( 0 <= shiftCount ) {
1440 if ( float32_val(a) != 0xC7000000 ) {
1441 float_raise( float_flag_invalid STATUS_VAR);
1442 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1443 return 0x7FFF;
1444 }
1445 }
bb98fe42 1446 return (int32_t) 0xffff8000;
cbcef455
PM
1447 }
1448 else if ( aExp <= 0x7E ) {
1449 if ( aExp | aSig ) {
1450 STATUS(float_exception_flags) |= float_flag_inexact;
1451 }
1452 return 0;
1453 }
1454 shiftCount -= 0x10;
1455 aSig = ( aSig | 0x00800000 )<<8;
1456 z = aSig>>( - shiftCount );
bb98fe42 1457 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
cbcef455
PM
1458 STATUS(float_exception_flags) |= float_flag_inexact;
1459 }
1460 if ( aSign ) {
1461 z = - z;
1462 }
1463 return z;
1464
1465}
1466
158142c2
FB
1467/*----------------------------------------------------------------------------
1468| Returns the result of converting the single-precision floating-point value
1469| `a' to the 64-bit two's complement integer format. The conversion is
1470| performed according to the IEC/IEEE Standard for Binary Floating-Point
1471| Arithmetic---which means in particular that the conversion is rounded
1472| according to the current rounding mode. If `a' is a NaN, the largest
1473| positive integer is returned. Otherwise, if the conversion overflows, the
1474| largest integer with the same sign as `a' is returned.
1475*----------------------------------------------------------------------------*/
1476
1477int64 float32_to_int64( float32 a STATUS_PARAM )
1478{
1479 flag aSign;
1480 int16 aExp, shiftCount;
bb98fe42
AF
1481 uint32_t aSig;
1482 uint64_t aSig64, aSigExtra;
37d18660 1483 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1484
1485 aSig = extractFloat32Frac( a );
1486 aExp = extractFloat32Exp( a );
1487 aSign = extractFloat32Sign( a );
1488 shiftCount = 0xBE - aExp;
1489 if ( shiftCount < 0 ) {
1490 float_raise( float_flag_invalid STATUS_VAR);
1491 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1492 return LIT64( 0x7FFFFFFFFFFFFFFF );
1493 }
bb98fe42 1494 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1495 }
1496 if ( aExp ) aSig |= 0x00800000;
1497 aSig64 = aSig;
1498 aSig64 <<= 40;
1499 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1500 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1501
1502}
1503
1504/*----------------------------------------------------------------------------
1505| Returns the result of converting the single-precision floating-point value
1506| `a' to the 64-bit two's complement integer format. The conversion is
1507| performed according to the IEC/IEEE Standard for Binary Floating-Point
1508| Arithmetic, except that the conversion is always rounded toward zero. If
1509| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1510| conversion overflows, the largest integer with the same sign as `a' is
1511| returned.
1512*----------------------------------------------------------------------------*/
1513
1514int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1515{
1516 flag aSign;
1517 int16 aExp, shiftCount;
bb98fe42
AF
1518 uint32_t aSig;
1519 uint64_t aSig64;
158142c2 1520 int64 z;
37d18660 1521 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1522
1523 aSig = extractFloat32Frac( a );
1524 aExp = extractFloat32Exp( a );
1525 aSign = extractFloat32Sign( a );
1526 shiftCount = aExp - 0xBE;
1527 if ( 0 <= shiftCount ) {
f090c9d4 1528 if ( float32_val(a) != 0xDF000000 ) {
158142c2
FB
1529 float_raise( float_flag_invalid STATUS_VAR);
1530 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1531 return LIT64( 0x7FFFFFFFFFFFFFFF );
1532 }
1533 }
bb98fe42 1534 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1535 }
1536 else if ( aExp <= 0x7E ) {
1537 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1538 return 0;
1539 }
1540 aSig64 = aSig | 0x00800000;
1541 aSig64 <<= 40;
1542 z = aSig64>>( - shiftCount );
bb98fe42 1543 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
158142c2
FB
1544 STATUS(float_exception_flags) |= float_flag_inexact;
1545 }
1546 if ( aSign ) z = - z;
1547 return z;
1548
1549}
1550
1551/*----------------------------------------------------------------------------
1552| Returns the result of converting the single-precision floating-point value
1553| `a' to the double-precision floating-point format. The conversion is
1554| performed according to the IEC/IEEE Standard for Binary Floating-Point
1555| Arithmetic.
1556*----------------------------------------------------------------------------*/
1557
1558float64 float32_to_float64( float32 a STATUS_PARAM )
1559{
1560 flag aSign;
1561 int16 aExp;
bb98fe42 1562 uint32_t aSig;
37d18660 1563 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1564
1565 aSig = extractFloat32Frac( a );
1566 aExp = extractFloat32Exp( a );
1567 aSign = extractFloat32Sign( a );
1568 if ( aExp == 0xFF ) {
bcd4d9af 1569 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1570 return packFloat64( aSign, 0x7FF, 0 );
1571 }
1572 if ( aExp == 0 ) {
1573 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1574 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575 --aExp;
1576 }
bb98fe42 1577 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1578
1579}
1580
1581#ifdef FLOATX80
1582
1583/*----------------------------------------------------------------------------
1584| Returns the result of converting the single-precision floating-point value
1585| `a' to the extended double-precision floating-point format. The conversion
1586| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1587| Arithmetic.
1588*----------------------------------------------------------------------------*/
1589
1590floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1591{
1592 flag aSign;
1593 int16 aExp;
bb98fe42 1594 uint32_t aSig;
158142c2 1595
37d18660 1596 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1597 aSig = extractFloat32Frac( a );
1598 aExp = extractFloat32Exp( a );
1599 aSign = extractFloat32Sign( a );
1600 if ( aExp == 0xFF ) {
bcd4d9af 1601 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1602 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1603 }
1604 if ( aExp == 0 ) {
1605 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1606 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607 }
1608 aSig |= 0x00800000;
bb98fe42 1609 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1610
1611}
1612
1613#endif
1614
1615#ifdef FLOAT128
1616
1617/*----------------------------------------------------------------------------
1618| Returns the result of converting the single-precision floating-point value
1619| `a' to the double-precision floating-point format. The conversion is
1620| performed according to the IEC/IEEE Standard for Binary Floating-Point
1621| Arithmetic.
1622*----------------------------------------------------------------------------*/
1623
1624float128 float32_to_float128( float32 a STATUS_PARAM )
1625{
1626 flag aSign;
1627 int16 aExp;
bb98fe42 1628 uint32_t aSig;
158142c2 1629
37d18660 1630 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1631 aSig = extractFloat32Frac( a );
1632 aExp = extractFloat32Exp( a );
1633 aSign = extractFloat32Sign( a );
1634 if ( aExp == 0xFF ) {
bcd4d9af 1635 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1636 return packFloat128( aSign, 0x7FFF, 0, 0 );
1637 }
1638 if ( aExp == 0 ) {
1639 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641 --aExp;
1642 }
bb98fe42 1643 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1644
1645}
1646
1647#endif
1648
1649/*----------------------------------------------------------------------------
1650| Rounds the single-precision floating-point value `a' to an integer, and
1651| returns the result as a single-precision floating-point value. The
1652| operation is performed according to the IEC/IEEE Standard for Binary
1653| Floating-Point Arithmetic.
1654*----------------------------------------------------------------------------*/
1655
1656float32 float32_round_to_int( float32 a STATUS_PARAM)
1657{
1658 flag aSign;
1659 int16 aExp;
bb98fe42 1660 uint32_t lastBitMask, roundBitsMask;
158142c2 1661 int8 roundingMode;
bb98fe42 1662 uint32_t z;
37d18660 1663 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1664
1665 aExp = extractFloat32Exp( a );
1666 if ( 0x96 <= aExp ) {
1667 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1668 return propagateFloat32NaN( a, a STATUS_VAR );
1669 }
1670 return a;
1671 }
1672 if ( aExp <= 0x7E ) {
bb98fe42 1673 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
158142c2
FB
1674 STATUS(float_exception_flags) |= float_flag_inexact;
1675 aSign = extractFloat32Sign( a );
1676 switch ( STATUS(float_rounding_mode) ) {
1677 case float_round_nearest_even:
1678 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1679 return packFloat32( aSign, 0x7F, 0 );
1680 }
1681 break;
1682 case float_round_down:
f090c9d4 1683 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1684 case float_round_up:
f090c9d4 1685 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1686 }
1687 return packFloat32( aSign, 0, 0 );
1688 }
1689 lastBitMask = 1;
1690 lastBitMask <<= 0x96 - aExp;
1691 roundBitsMask = lastBitMask - 1;
f090c9d4 1692 z = float32_val(a);
158142c2
FB
1693 roundingMode = STATUS(float_rounding_mode);
1694 if ( roundingMode == float_round_nearest_even ) {
1695 z += lastBitMask>>1;
1696 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1697 }
1698 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 1699 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
1700 z += roundBitsMask;
1701 }
1702 }
1703 z &= ~ roundBitsMask;
f090c9d4
PB
1704 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1705 return make_float32(z);
158142c2
FB
1706
1707}
1708
1709/*----------------------------------------------------------------------------
1710| Returns the result of adding the absolute values of the single-precision
1711| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1712| before being returned. `zSign' is ignored if the result is a NaN.
1713| The addition is performed according to the IEC/IEEE Standard for Binary
1714| Floating-Point Arithmetic.
1715*----------------------------------------------------------------------------*/
1716
1717static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1718{
1719 int16 aExp, bExp, zExp;
bb98fe42 1720 uint32_t aSig, bSig, zSig;
158142c2
FB
1721 int16 expDiff;
1722
1723 aSig = extractFloat32Frac( a );
1724 aExp = extractFloat32Exp( a );
1725 bSig = extractFloat32Frac( b );
1726 bExp = extractFloat32Exp( b );
1727 expDiff = aExp - bExp;
1728 aSig <<= 6;
1729 bSig <<= 6;
1730 if ( 0 < expDiff ) {
1731 if ( aExp == 0xFF ) {
1732 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1733 return a;
1734 }
1735 if ( bExp == 0 ) {
1736 --expDiff;
1737 }
1738 else {
1739 bSig |= 0x20000000;
1740 }
1741 shift32RightJamming( bSig, expDiff, &bSig );
1742 zExp = aExp;
1743 }
1744 else if ( expDiff < 0 ) {
1745 if ( bExp == 0xFF ) {
1746 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1747 return packFloat32( zSign, 0xFF, 0 );
1748 }
1749 if ( aExp == 0 ) {
1750 ++expDiff;
1751 }
1752 else {
1753 aSig |= 0x20000000;
1754 }
1755 shift32RightJamming( aSig, - expDiff, &aSig );
1756 zExp = bExp;
1757 }
1758 else {
1759 if ( aExp == 0xFF ) {
1760 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1761 return a;
1762 }
fe76d976
PB
1763 if ( aExp == 0 ) {
1764 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1765 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1766 }
158142c2
FB
1767 zSig = 0x40000000 + aSig + bSig;
1768 zExp = aExp;
1769 goto roundAndPack;
1770 }
1771 aSig |= 0x20000000;
1772 zSig = ( aSig + bSig )<<1;
1773 --zExp;
bb98fe42 1774 if ( (int32_t) zSig < 0 ) {
158142c2
FB
1775 zSig = aSig + bSig;
1776 ++zExp;
1777 }
1778 roundAndPack:
1779 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1780
1781}
1782
1783/*----------------------------------------------------------------------------
1784| Returns the result of subtracting the absolute values of the single-
1785| precision floating-point values `a' and `b'. If `zSign' is 1, the
1786| difference is negated before being returned. `zSign' is ignored if the
1787| result is a NaN. The subtraction is performed according to the IEC/IEEE
1788| Standard for Binary Floating-Point Arithmetic.
1789*----------------------------------------------------------------------------*/
1790
1791static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1792{
1793 int16 aExp, bExp, zExp;
bb98fe42 1794 uint32_t aSig, bSig, zSig;
158142c2
FB
1795 int16 expDiff;
1796
1797 aSig = extractFloat32Frac( a );
1798 aExp = extractFloat32Exp( a );
1799 bSig = extractFloat32Frac( b );
1800 bExp = extractFloat32Exp( b );
1801 expDiff = aExp - bExp;
1802 aSig <<= 7;
1803 bSig <<= 7;
1804 if ( 0 < expDiff ) goto aExpBigger;
1805 if ( expDiff < 0 ) goto bExpBigger;
1806 if ( aExp == 0xFF ) {
1807 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1808 float_raise( float_flag_invalid STATUS_VAR);
1809 return float32_default_nan;
1810 }
1811 if ( aExp == 0 ) {
1812 aExp = 1;
1813 bExp = 1;
1814 }
1815 if ( bSig < aSig ) goto aBigger;
1816 if ( aSig < bSig ) goto bBigger;
1817 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1818 bExpBigger:
1819 if ( bExp == 0xFF ) {
1820 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1821 return packFloat32( zSign ^ 1, 0xFF, 0 );
1822 }
1823 if ( aExp == 0 ) {
1824 ++expDiff;
1825 }
1826 else {
1827 aSig |= 0x40000000;
1828 }
1829 shift32RightJamming( aSig, - expDiff, &aSig );
1830 bSig |= 0x40000000;
1831 bBigger:
1832 zSig = bSig - aSig;
1833 zExp = bExp;
1834 zSign ^= 1;
1835 goto normalizeRoundAndPack;
1836 aExpBigger:
1837 if ( aExp == 0xFF ) {
1838 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1839 return a;
1840 }
1841 if ( bExp == 0 ) {
1842 --expDiff;
1843 }
1844 else {
1845 bSig |= 0x40000000;
1846 }
1847 shift32RightJamming( bSig, expDiff, &bSig );
1848 aSig |= 0x40000000;
1849 aBigger:
1850 zSig = aSig - bSig;
1851 zExp = aExp;
1852 normalizeRoundAndPack:
1853 --zExp;
1854 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1855
1856}
1857
1858/*----------------------------------------------------------------------------
1859| Returns the result of adding the single-precision floating-point values `a'
1860| and `b'. The operation is performed according to the IEC/IEEE Standard for
1861| Binary Floating-Point Arithmetic.
1862*----------------------------------------------------------------------------*/
1863
1864float32 float32_add( float32 a, float32 b STATUS_PARAM )
1865{
1866 flag aSign, bSign;
37d18660
PM
1867 a = float32_squash_input_denormal(a STATUS_VAR);
1868 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1869
1870 aSign = extractFloat32Sign( a );
1871 bSign = extractFloat32Sign( b );
1872 if ( aSign == bSign ) {
1873 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1874 }
1875 else {
1876 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1877 }
1878
1879}
1880
1881/*----------------------------------------------------------------------------
1882| Returns the result of subtracting the single-precision floating-point values
1883| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1884| for Binary Floating-Point Arithmetic.
1885*----------------------------------------------------------------------------*/
1886
1887float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1888{
1889 flag aSign, bSign;
37d18660
PM
1890 a = float32_squash_input_denormal(a STATUS_VAR);
1891 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1892
1893 aSign = extractFloat32Sign( a );
1894 bSign = extractFloat32Sign( b );
1895 if ( aSign == bSign ) {
1896 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1897 }
1898 else {
1899 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1900 }
1901
1902}
1903
1904/*----------------------------------------------------------------------------
1905| Returns the result of multiplying the single-precision floating-point values
1906| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1907| for Binary Floating-Point Arithmetic.
1908*----------------------------------------------------------------------------*/
1909
1910float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1911{
1912 flag aSign, bSign, zSign;
1913 int16 aExp, bExp, zExp;
bb98fe42
AF
1914 uint32_t aSig, bSig;
1915 uint64_t zSig64;
1916 uint32_t zSig;
158142c2 1917
37d18660
PM
1918 a = float32_squash_input_denormal(a STATUS_VAR);
1919 b = float32_squash_input_denormal(b STATUS_VAR);
1920
158142c2
FB
1921 aSig = extractFloat32Frac( a );
1922 aExp = extractFloat32Exp( a );
1923 aSign = extractFloat32Sign( a );
1924 bSig = extractFloat32Frac( b );
1925 bExp = extractFloat32Exp( b );
1926 bSign = extractFloat32Sign( b );
1927 zSign = aSign ^ bSign;
1928 if ( aExp == 0xFF ) {
1929 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1930 return propagateFloat32NaN( a, b STATUS_VAR );
1931 }
1932 if ( ( bExp | bSig ) == 0 ) {
1933 float_raise( float_flag_invalid STATUS_VAR);
1934 return float32_default_nan;
1935 }
1936 return packFloat32( zSign, 0xFF, 0 );
1937 }
1938 if ( bExp == 0xFF ) {
1939 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1940 if ( ( aExp | aSig ) == 0 ) {
1941 float_raise( float_flag_invalid STATUS_VAR);
1942 return float32_default_nan;
1943 }
1944 return packFloat32( zSign, 0xFF, 0 );
1945 }
1946 if ( aExp == 0 ) {
1947 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1948 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1949 }
1950 if ( bExp == 0 ) {
1951 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1952 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1953 }
1954 zExp = aExp + bExp - 0x7F;
1955 aSig = ( aSig | 0x00800000 )<<7;
1956 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 1957 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 1958 zSig = zSig64;
bb98fe42 1959 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
1960 zSig <<= 1;
1961 --zExp;
1962 }
1963 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1964
1965}
1966
1967/*----------------------------------------------------------------------------
1968| Returns the result of dividing the single-precision floating-point value `a'
1969| by the corresponding value `b'. The operation is performed according to the
1970| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971*----------------------------------------------------------------------------*/
1972
1973float32 float32_div( float32 a, float32 b STATUS_PARAM )
1974{
1975 flag aSign, bSign, zSign;
1976 int16 aExp, bExp, zExp;
bb98fe42 1977 uint32_t aSig, bSig, zSig;
37d18660
PM
1978 a = float32_squash_input_denormal(a STATUS_VAR);
1979 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1980
1981 aSig = extractFloat32Frac( a );
1982 aExp = extractFloat32Exp( a );
1983 aSign = extractFloat32Sign( a );
1984 bSig = extractFloat32Frac( b );
1985 bExp = extractFloat32Exp( b );
1986 bSign = extractFloat32Sign( b );
1987 zSign = aSign ^ bSign;
1988 if ( aExp == 0xFF ) {
1989 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1990 if ( bExp == 0xFF ) {
1991 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1992 float_raise( float_flag_invalid STATUS_VAR);
1993 return float32_default_nan;
1994 }
1995 return packFloat32( zSign, 0xFF, 0 );
1996 }
1997 if ( bExp == 0xFF ) {
1998 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1999 return packFloat32( zSign, 0, 0 );
2000 }
2001 if ( bExp == 0 ) {
2002 if ( bSig == 0 ) {
2003 if ( ( aExp | aSig ) == 0 ) {
2004 float_raise( float_flag_invalid STATUS_VAR);
2005 return float32_default_nan;
2006 }
2007 float_raise( float_flag_divbyzero STATUS_VAR);
2008 return packFloat32( zSign, 0xFF, 0 );
2009 }
2010 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2011 }
2012 if ( aExp == 0 ) {
2013 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2014 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2015 }
2016 zExp = aExp - bExp + 0x7D;
2017 aSig = ( aSig | 0x00800000 )<<7;
2018 bSig = ( bSig | 0x00800000 )<<8;
2019 if ( bSig <= ( aSig + aSig ) ) {
2020 aSig >>= 1;
2021 ++zExp;
2022 }
bb98fe42 2023 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2024 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2025 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2
FB
2026 }
2027 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2028
2029}
2030
2031/*----------------------------------------------------------------------------
2032| Returns the remainder of the single-precision floating-point value `a'
2033| with respect to the corresponding value `b'. The operation is performed
2034| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2035*----------------------------------------------------------------------------*/
2036
2037float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2038{
ed086f3d 2039 flag aSign, zSign;
158142c2 2040 int16 aExp, bExp, expDiff;
bb98fe42
AF
2041 uint32_t aSig, bSig;
2042 uint32_t q;
2043 uint64_t aSig64, bSig64, q64;
2044 uint32_t alternateASig;
2045 int32_t sigMean;
37d18660
PM
2046 a = float32_squash_input_denormal(a STATUS_VAR);
2047 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2048
2049 aSig = extractFloat32Frac( a );
2050 aExp = extractFloat32Exp( a );
2051 aSign = extractFloat32Sign( a );
2052 bSig = extractFloat32Frac( b );
2053 bExp = extractFloat32Exp( b );
158142c2
FB
2054 if ( aExp == 0xFF ) {
2055 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2056 return propagateFloat32NaN( a, b STATUS_VAR );
2057 }
2058 float_raise( float_flag_invalid STATUS_VAR);
2059 return float32_default_nan;
2060 }
2061 if ( bExp == 0xFF ) {
2062 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063 return a;
2064 }
2065 if ( bExp == 0 ) {
2066 if ( bSig == 0 ) {
2067 float_raise( float_flag_invalid STATUS_VAR);
2068 return float32_default_nan;
2069 }
2070 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2071 }
2072 if ( aExp == 0 ) {
2073 if ( aSig == 0 ) return a;
2074 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2075 }
2076 expDiff = aExp - bExp;
2077 aSig |= 0x00800000;
2078 bSig |= 0x00800000;
2079 if ( expDiff < 32 ) {
2080 aSig <<= 8;
2081 bSig <<= 8;
2082 if ( expDiff < 0 ) {
2083 if ( expDiff < -1 ) return a;
2084 aSig >>= 1;
2085 }
2086 q = ( bSig <= aSig );
2087 if ( q ) aSig -= bSig;
2088 if ( 0 < expDiff ) {
bb98fe42 2089 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2090 q >>= 32 - expDiff;
2091 bSig >>= 2;
2092 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2093 }
2094 else {
2095 aSig >>= 2;
2096 bSig >>= 2;
2097 }
2098 }
2099 else {
2100 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2101 aSig64 = ( (uint64_t) aSig )<<40;
2102 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2103 expDiff -= 64;
2104 while ( 0 < expDiff ) {
2105 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2106 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2107 aSig64 = - ( ( bSig * q64 )<<38 );
2108 expDiff -= 62;
2109 }
2110 expDiff += 64;
2111 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2112 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2113 q = q64>>( 64 - expDiff );
2114 bSig <<= 6;
2115 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2116 }
2117 do {
2118 alternateASig = aSig;
2119 ++q;
2120 aSig -= bSig;
bb98fe42 2121 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2122 sigMean = aSig + alternateASig;
2123 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2124 aSig = alternateASig;
2125 }
bb98fe42 2126 zSign = ( (int32_t) aSig < 0 );
158142c2
FB
2127 if ( zSign ) aSig = - aSig;
2128 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2129
2130}
2131
2132/*----------------------------------------------------------------------------
2133| Returns the square root of the single-precision floating-point value `a'.
2134| The operation is performed according to the IEC/IEEE Standard for Binary
2135| Floating-Point Arithmetic.
2136*----------------------------------------------------------------------------*/
2137
2138float32 float32_sqrt( float32 a STATUS_PARAM )
2139{
2140 flag aSign;
2141 int16 aExp, zExp;
bb98fe42
AF
2142 uint32_t aSig, zSig;
2143 uint64_t rem, term;
37d18660 2144 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2145
2146 aSig = extractFloat32Frac( a );
2147 aExp = extractFloat32Exp( a );
2148 aSign = extractFloat32Sign( a );
2149 if ( aExp == 0xFF ) {
f090c9d4 2150 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
158142c2
FB
2151 if ( ! aSign ) return a;
2152 float_raise( float_flag_invalid STATUS_VAR);
2153 return float32_default_nan;
2154 }
2155 if ( aSign ) {
2156 if ( ( aExp | aSig ) == 0 ) return a;
2157 float_raise( float_flag_invalid STATUS_VAR);
2158 return float32_default_nan;
2159 }
2160 if ( aExp == 0 ) {
f090c9d4 2161 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2162 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2163 }
2164 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2165 aSig = ( aSig | 0x00800000 )<<8;
2166 zSig = estimateSqrt32( aExp, aSig ) + 2;
2167 if ( ( zSig & 0x7F ) <= 5 ) {
2168 if ( zSig < 2 ) {
2169 zSig = 0x7FFFFFFF;
2170 goto roundAndPack;
2171 }
2172 aSig >>= aExp & 1;
bb98fe42
AF
2173 term = ( (uint64_t) zSig ) * zSig;
2174 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2175 while ( (int64_t) rem < 0 ) {
158142c2 2176 --zSig;
bb98fe42 2177 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2178 }
2179 zSig |= ( rem != 0 );
2180 }
2181 shift32RightJamming( zSig, 1, &zSig );
2182 roundAndPack:
2183 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2184
2185}
2186
8229c991
AJ
2187/*----------------------------------------------------------------------------
2188| Returns the binary exponential of the single-precision floating-point value
2189| `a'. The operation is performed according to the IEC/IEEE Standard for
2190| Binary Floating-Point Arithmetic.
2191|
2192| Uses the following identities:
2193|
2194| 1. -------------------------------------------------------------------------
2195| x x*ln(2)
2196| 2 = e
2197|
2198| 2. -------------------------------------------------------------------------
2199| 2 3 4 5 n
2200| x x x x x x x
2201| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2202| 1! 2! 3! 4! 5! n!
2203*----------------------------------------------------------------------------*/
2204
2205static const float64 float32_exp2_coefficients[15] =
2206{
d5138cf4
PM
2207 const_float64( 0x3ff0000000000000ll ), /* 1 */
2208 const_float64( 0x3fe0000000000000ll ), /* 2 */
2209 const_float64( 0x3fc5555555555555ll ), /* 3 */
2210 const_float64( 0x3fa5555555555555ll ), /* 4 */
2211 const_float64( 0x3f81111111111111ll ), /* 5 */
2212 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2213 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2214 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2215 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2216 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2217 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2218 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2219 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2220 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2221 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2222};
2223
2224float32 float32_exp2( float32 a STATUS_PARAM )
2225{
2226 flag aSign;
2227 int16 aExp;
bb98fe42 2228 uint32_t aSig;
8229c991
AJ
2229 float64 r, x, xn;
2230 int i;
37d18660 2231 a = float32_squash_input_denormal(a STATUS_VAR);
8229c991
AJ
2232
2233 aSig = extractFloat32Frac( a );
2234 aExp = extractFloat32Exp( a );
2235 aSign = extractFloat32Sign( a );
2236
2237 if ( aExp == 0xFF) {
2238 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2239 return (aSign) ? float32_zero : a;
2240 }
2241 if (aExp == 0) {
2242 if (aSig == 0) return float32_one;
2243 }
2244
2245 float_raise( float_flag_inexact STATUS_VAR);
2246
2247 /* ******************************* */
2248 /* using float64 for approximation */
2249 /* ******************************* */
2250 x = float32_to_float64(a STATUS_VAR);
2251 x = float64_mul(x, float64_ln2 STATUS_VAR);
2252
2253 xn = x;
2254 r = float64_one;
2255 for (i = 0 ; i < 15 ; i++) {
2256 float64 f;
2257
2258 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2259 r = float64_add(r, f STATUS_VAR);
2260
2261 xn = float64_mul(xn, x STATUS_VAR);
2262 }
2263
2264 return float64_to_float32(r, status);
2265}
2266
374dfc33
AJ
2267/*----------------------------------------------------------------------------
2268| Returns the binary log of the single-precision floating-point value `a'.
2269| The operation is performed according to the IEC/IEEE Standard for Binary
2270| Floating-Point Arithmetic.
2271*----------------------------------------------------------------------------*/
2272float32 float32_log2( float32 a STATUS_PARAM )
2273{
2274 flag aSign, zSign;
2275 int16 aExp;
bb98fe42 2276 uint32_t aSig, zSig, i;
374dfc33 2277
37d18660 2278 a = float32_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
2279 aSig = extractFloat32Frac( a );
2280 aExp = extractFloat32Exp( a );
2281 aSign = extractFloat32Sign( a );
2282
2283 if ( aExp == 0 ) {
2284 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2285 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2286 }
2287 if ( aSign ) {
2288 float_raise( float_flag_invalid STATUS_VAR);
2289 return float32_default_nan;
2290 }
2291 if ( aExp == 0xFF ) {
2292 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2293 return a;
2294 }
2295
2296 aExp -= 0x7F;
2297 aSig |= 0x00800000;
2298 zSign = aExp < 0;
2299 zSig = aExp << 23;
2300
2301 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2302 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2303 if ( aSig & 0x01000000 ) {
2304 aSig >>= 1;
2305 zSig |= i;
2306 }
2307 }
2308
2309 if ( zSign )
2310 zSig = -zSig;
2311
2312 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2313}
2314
158142c2
FB
2315/*----------------------------------------------------------------------------
2316| Returns 1 if the single-precision floating-point value `a' is equal to
2317| the corresponding value `b', and 0 otherwise. The comparison is performed
2318| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2319*----------------------------------------------------------------------------*/
2320
211315fb 2321int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
158142c2 2322{
37d18660
PM
2323 a = float32_squash_input_denormal(a STATUS_VAR);
2324 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2325
2326 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2327 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2328 ) {
2329 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2330 float_raise( float_flag_invalid STATUS_VAR);
2331 }
2332 return 0;
2333 }
f090c9d4 2334 return ( float32_val(a) == float32_val(b) ) ||
bb98fe42 2335 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2336
2337}
2338
2339/*----------------------------------------------------------------------------
2340| Returns 1 if the single-precision floating-point value `a' is less than
2341| or equal to the corresponding value `b', and 0 otherwise. The comparison
2342| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2343| Arithmetic.
2344*----------------------------------------------------------------------------*/
2345
750afe93 2346int float32_le( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2347{
2348 flag aSign, bSign;
bb98fe42 2349 uint32_t av, bv;
37d18660
PM
2350 a = float32_squash_input_denormal(a STATUS_VAR);
2351 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2352
2353 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2354 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2355 ) {
2356 float_raise( float_flag_invalid STATUS_VAR);
2357 return 0;
2358 }
2359 aSign = extractFloat32Sign( a );
2360 bSign = extractFloat32Sign( b );
f090c9d4
PB
2361 av = float32_val(a);
2362 bv = float32_val(b);
bb98fe42 2363 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2364 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2365
2366}
2367
2368/*----------------------------------------------------------------------------
2369| Returns 1 if the single-precision floating-point value `a' is less than
2370| the corresponding value `b', and 0 otherwise. The comparison is performed
2371| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2372*----------------------------------------------------------------------------*/
2373
750afe93 2374int float32_lt( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2375{
2376 flag aSign, bSign;
bb98fe42 2377 uint32_t av, bv;
37d18660
PM
2378 a = float32_squash_input_denormal(a STATUS_VAR);
2379 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2380
2381 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2382 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2383 ) {
2384 float_raise( float_flag_invalid STATUS_VAR);
2385 return 0;
2386 }
2387 aSign = extractFloat32Sign( a );
2388 bSign = extractFloat32Sign( b );
f090c9d4
PB
2389 av = float32_val(a);
2390 bv = float32_val(b);
bb98fe42 2391 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2392 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2393
2394}
2395
67b7861d
AJ
2396/*----------------------------------------------------------------------------
2397| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2398| be compared, and 0 otherwise. The comparison is performed according to the
2399| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2400*----------------------------------------------------------------------------*/
2401
2402int float32_unordered( float32 a, float32 b STATUS_PARAM )
2403{
2404 a = float32_squash_input_denormal(a STATUS_VAR);
2405 b = float32_squash_input_denormal(b STATUS_VAR);
2406
2407 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2408 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2409 ) {
2410 float_raise( float_flag_invalid STATUS_VAR);
2411 return 1;
2412 }
2413 return 0;
2414}
158142c2
FB
2415/*----------------------------------------------------------------------------
2416| Returns 1 if the single-precision floating-point value `a' is equal to
2417| the corresponding value `b', and 0 otherwise. The invalid exception is
2418| raised if either operand is a NaN. Otherwise, the comparison is performed
2419| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2420*----------------------------------------------------------------------------*/
2421
2657d0ff 2422int float32_eq( float32 a, float32 b STATUS_PARAM )
158142c2 2423{
bb98fe42 2424 uint32_t av, bv;
37d18660
PM
2425 a = float32_squash_input_denormal(a STATUS_VAR);
2426 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2427
2428 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2429 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2430 ) {
2431 float_raise( float_flag_invalid STATUS_VAR);
2432 return 0;
2433 }
f090c9d4
PB
2434 av = float32_val(a);
2435 bv = float32_val(b);
bb98fe42 2436 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2437
2438}
2439
2440/*----------------------------------------------------------------------------
2441| Returns 1 if the single-precision floating-point value `a' is less than or
2442| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2443| cause an exception. Otherwise, the comparison is performed according to the
2444| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2445*----------------------------------------------------------------------------*/
2446
750afe93 2447int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2448{
2449 flag aSign, bSign;
bb98fe42 2450 uint32_t av, bv;
37d18660
PM
2451 a = float32_squash_input_denormal(a STATUS_VAR);
2452 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2453
2454 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2455 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2456 ) {
2457 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2458 float_raise( float_flag_invalid STATUS_VAR);
2459 }
2460 return 0;
2461 }
2462 aSign = extractFloat32Sign( a );
2463 bSign = extractFloat32Sign( b );
f090c9d4
PB
2464 av = float32_val(a);
2465 bv = float32_val(b);
bb98fe42 2466 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2467 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2468
2469}
2470
2471/*----------------------------------------------------------------------------
2472| Returns 1 if the single-precision floating-point value `a' is less than
2473| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2474| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2475| Standard for Binary Floating-Point Arithmetic.
2476*----------------------------------------------------------------------------*/
2477
750afe93 2478int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2479{
2480 flag aSign, bSign;
bb98fe42 2481 uint32_t av, bv;
37d18660
PM
2482 a = float32_squash_input_denormal(a STATUS_VAR);
2483 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2484
2485 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2486 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2487 ) {
2488 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2489 float_raise( float_flag_invalid STATUS_VAR);
2490 }
2491 return 0;
2492 }
2493 aSign = extractFloat32Sign( a );
2494 bSign = extractFloat32Sign( b );
f090c9d4
PB
2495 av = float32_val(a);
2496 bv = float32_val(b);
bb98fe42 2497 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2498 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2499
2500}
2501
67b7861d
AJ
2502/*----------------------------------------------------------------------------
2503| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2504| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2505| comparison is performed according to the IEC/IEEE Standard for Binary
2506| Floating-Point Arithmetic.
2507*----------------------------------------------------------------------------*/
2508
2509int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2510{
2511 a = float32_squash_input_denormal(a STATUS_VAR);
2512 b = float32_squash_input_denormal(b STATUS_VAR);
2513
2514 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2515 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2516 ) {
2517 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2518 float_raise( float_flag_invalid STATUS_VAR);
2519 }
2520 return 1;
2521 }
2522 return 0;
2523}
2524
158142c2
FB
2525/*----------------------------------------------------------------------------
2526| Returns the result of converting the double-precision floating-point value
2527| `a' to the 32-bit two's complement integer format. The conversion is
2528| performed according to the IEC/IEEE Standard for Binary Floating-Point
2529| Arithmetic---which means in particular that the conversion is rounded
2530| according to the current rounding mode. If `a' is a NaN, the largest
2531| positive integer is returned. Otherwise, if the conversion overflows, the
2532| largest integer with the same sign as `a' is returned.
2533*----------------------------------------------------------------------------*/
2534
2535int32 float64_to_int32( float64 a STATUS_PARAM )
2536{
2537 flag aSign;
2538 int16 aExp, shiftCount;
bb98fe42 2539 uint64_t aSig;
37d18660 2540 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2541
2542 aSig = extractFloat64Frac( a );
2543 aExp = extractFloat64Exp( a );
2544 aSign = extractFloat64Sign( a );
2545 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2546 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2547 shiftCount = 0x42C - aExp;
2548 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2549 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2550
2551}
2552
2553/*----------------------------------------------------------------------------
2554| Returns the result of converting the double-precision floating-point value
2555| `a' to the 32-bit two's complement integer format. The conversion is
2556| performed according to the IEC/IEEE Standard for Binary Floating-Point
2557| Arithmetic, except that the conversion is always rounded toward zero.
2558| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2559| the conversion overflows, the largest integer with the same sign as `a' is
2560| returned.
2561*----------------------------------------------------------------------------*/
2562
2563int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2564{
2565 flag aSign;
2566 int16 aExp, shiftCount;
bb98fe42 2567 uint64_t aSig, savedASig;
158142c2 2568 int32 z;
37d18660 2569 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2570
2571 aSig = extractFloat64Frac( a );
2572 aExp = extractFloat64Exp( a );
2573 aSign = extractFloat64Sign( a );
2574 if ( 0x41E < aExp ) {
2575 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2576 goto invalid;
2577 }
2578 else if ( aExp < 0x3FF ) {
2579 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2580 return 0;
2581 }
2582 aSig |= LIT64( 0x0010000000000000 );
2583 shiftCount = 0x433 - aExp;
2584 savedASig = aSig;
2585 aSig >>= shiftCount;
2586 z = aSig;
2587 if ( aSign ) z = - z;
2588 if ( ( z < 0 ) ^ aSign ) {
2589 invalid:
2590 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2591 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
2592 }
2593 if ( ( aSig<<shiftCount ) != savedASig ) {
2594 STATUS(float_exception_flags) |= float_flag_inexact;
2595 }
2596 return z;
2597
2598}
2599
cbcef455
PM
2600/*----------------------------------------------------------------------------
2601| Returns the result of converting the double-precision floating-point value
2602| `a' to the 16-bit two's complement integer format. The conversion is
2603| performed according to the IEC/IEEE Standard for Binary Floating-Point
2604| Arithmetic, except that the conversion is always rounded toward zero.
2605| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2606| the conversion overflows, the largest integer with the same sign as `a' is
2607| returned.
2608*----------------------------------------------------------------------------*/
2609
2610int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2611{
2612 flag aSign;
2613 int16 aExp, shiftCount;
bb98fe42 2614 uint64_t aSig, savedASig;
cbcef455
PM
2615 int32 z;
2616
2617 aSig = extractFloat64Frac( a );
2618 aExp = extractFloat64Exp( a );
2619 aSign = extractFloat64Sign( a );
2620 if ( 0x40E < aExp ) {
2621 if ( ( aExp == 0x7FF ) && aSig ) {
2622 aSign = 0;
2623 }
2624 goto invalid;
2625 }
2626 else if ( aExp < 0x3FF ) {
2627 if ( aExp || aSig ) {
2628 STATUS(float_exception_flags) |= float_flag_inexact;
2629 }
2630 return 0;
2631 }
2632 aSig |= LIT64( 0x0010000000000000 );
2633 shiftCount = 0x433 - aExp;
2634 savedASig = aSig;
2635 aSig >>= shiftCount;
2636 z = aSig;
2637 if ( aSign ) {
2638 z = - z;
2639 }
2640 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2641 invalid:
2642 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2643 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
2644 }
2645 if ( ( aSig<<shiftCount ) != savedASig ) {
2646 STATUS(float_exception_flags) |= float_flag_inexact;
2647 }
2648 return z;
2649}
2650
158142c2
FB
2651/*----------------------------------------------------------------------------
2652| Returns the result of converting the double-precision floating-point value
2653| `a' to the 64-bit two's complement integer format. The conversion is
2654| performed according to the IEC/IEEE Standard for Binary Floating-Point
2655| Arithmetic---which means in particular that the conversion is rounded
2656| according to the current rounding mode. If `a' is a NaN, the largest
2657| positive integer is returned. Otherwise, if the conversion overflows, the
2658| largest integer with the same sign as `a' is returned.
2659*----------------------------------------------------------------------------*/
2660
2661int64 float64_to_int64( float64 a STATUS_PARAM )
2662{
2663 flag aSign;
2664 int16 aExp, shiftCount;
bb98fe42 2665 uint64_t aSig, aSigExtra;
37d18660 2666 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2667
2668 aSig = extractFloat64Frac( a );
2669 aExp = extractFloat64Exp( a );
2670 aSign = extractFloat64Sign( a );
2671 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2672 shiftCount = 0x433 - aExp;
2673 if ( shiftCount <= 0 ) {
2674 if ( 0x43E < aExp ) {
2675 float_raise( float_flag_invalid STATUS_VAR);
2676 if ( ! aSign
2677 || ( ( aExp == 0x7FF )
2678 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2679 ) {
2680 return LIT64( 0x7FFFFFFFFFFFFFFF );
2681 }
bb98fe42 2682 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2683 }
2684 aSigExtra = 0;
2685 aSig <<= - shiftCount;
2686 }
2687 else {
2688 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2689 }
2690 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2691
2692}
2693
2694/*----------------------------------------------------------------------------
2695| Returns the result of converting the double-precision floating-point value
2696| `a' to the 64-bit two's complement integer format. The conversion is
2697| performed according to the IEC/IEEE Standard for Binary Floating-Point
2698| Arithmetic, except that the conversion is always rounded toward zero.
2699| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2700| the conversion overflows, the largest integer with the same sign as `a' is
2701| returned.
2702*----------------------------------------------------------------------------*/
2703
2704int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2705{
2706 flag aSign;
2707 int16 aExp, shiftCount;
bb98fe42 2708 uint64_t aSig;
158142c2 2709 int64 z;
37d18660 2710 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2711
2712 aSig = extractFloat64Frac( a );
2713 aExp = extractFloat64Exp( a );
2714 aSign = extractFloat64Sign( a );
2715 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2716 shiftCount = aExp - 0x433;
2717 if ( 0 <= shiftCount ) {
2718 if ( 0x43E <= aExp ) {
f090c9d4 2719 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
158142c2
FB
2720 float_raise( float_flag_invalid STATUS_VAR);
2721 if ( ! aSign
2722 || ( ( aExp == 0x7FF )
2723 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2724 ) {
2725 return LIT64( 0x7FFFFFFFFFFFFFFF );
2726 }
2727 }
bb98fe42 2728 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2729 }
2730 z = aSig<<shiftCount;
2731 }
2732 else {
2733 if ( aExp < 0x3FE ) {
2734 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2735 return 0;
2736 }
2737 z = aSig>>( - shiftCount );
bb98fe42 2738 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
2739 STATUS(float_exception_flags) |= float_flag_inexact;
2740 }
2741 }
2742 if ( aSign ) z = - z;
2743 return z;
2744
2745}
2746
2747/*----------------------------------------------------------------------------
2748| Returns the result of converting the double-precision floating-point value
2749| `a' to the single-precision floating-point format. The conversion is
2750| performed according to the IEC/IEEE Standard for Binary Floating-Point
2751| Arithmetic.
2752*----------------------------------------------------------------------------*/
2753
2754float32 float64_to_float32( float64 a STATUS_PARAM )
2755{
2756 flag aSign;
2757 int16 aExp;
bb98fe42
AF
2758 uint64_t aSig;
2759 uint32_t zSig;
37d18660 2760 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2761
2762 aSig = extractFloat64Frac( a );
2763 aExp = extractFloat64Exp( a );
2764 aSign = extractFloat64Sign( a );
2765 if ( aExp == 0x7FF ) {
bcd4d9af 2766 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
2767 return packFloat32( aSign, 0xFF, 0 );
2768 }
2769 shift64RightJamming( aSig, 22, &aSig );
2770 zSig = aSig;
2771 if ( aExp || zSig ) {
2772 zSig |= 0x40000000;
2773 aExp -= 0x381;
2774 }
2775 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2776
2777}
2778
60011498
PB
2779
2780/*----------------------------------------------------------------------------
2781| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2782| half-precision floating-point value, returning the result. After being
2783| shifted into the proper positions, the three fields are simply added
2784| together to form the result. This means that any integer portion of `zSig'
2785| will be added into the exponent. Since a properly normalized significand
2786| will have an integer portion equal to 1, the `zExp' input should be 1 less
2787| than the desired result exponent whenever `zSig' is a complete, normalized
2788| significand.
2789*----------------------------------------------------------------------------*/
bb98fe42 2790static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
60011498 2791{
bb4d4bb3 2792 return make_float16(
bb98fe42 2793 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
2794}
2795
2796/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2797 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3
PM
2798
2799float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
60011498
PB
2800{
2801 flag aSign;
2802 int16 aExp;
bb98fe42 2803 uint32_t aSig;
60011498 2804
bb4d4bb3
PM
2805 aSign = extractFloat16Sign(a);
2806 aExp = extractFloat16Exp(a);
2807 aSig = extractFloat16Frac(a);
60011498
PB
2808
2809 if (aExp == 0x1f && ieee) {
2810 if (aSig) {
f591e1be 2811 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498
PB
2812 }
2813 return packFloat32(aSign, 0xff, aSig << 13);
2814 }
2815 if (aExp == 0) {
2816 int8 shiftCount;
2817
2818 if (aSig == 0) {
2819 return packFloat32(aSign, 0, 0);
2820 }
2821
2822 shiftCount = countLeadingZeros32( aSig ) - 21;
2823 aSig = aSig << shiftCount;
2824 aExp = -shiftCount;
2825 }
2826 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2827}
2828
bb4d4bb3 2829float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
60011498
PB
2830{
2831 flag aSign;
2832 int16 aExp;
bb98fe42
AF
2833 uint32_t aSig;
2834 uint32_t mask;
2835 uint32_t increment;
60011498 2836 int8 roundingMode;
37d18660 2837 a = float32_squash_input_denormal(a STATUS_VAR);
60011498
PB
2838
2839 aSig = extractFloat32Frac( a );
2840 aExp = extractFloat32Exp( a );
2841 aSign = extractFloat32Sign( a );
2842 if ( aExp == 0xFF ) {
2843 if (aSig) {
600e30d2
PM
2844 /* Input is a NaN */
2845 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2846 if (!ieee) {
2847 return packFloat16(aSign, 0, 0);
2848 }
2849 return r;
60011498 2850 }
600e30d2
PM
2851 /* Infinity */
2852 if (!ieee) {
2853 float_raise(float_flag_invalid STATUS_VAR);
2854 return packFloat16(aSign, 0x1f, 0x3ff);
2855 }
2856 return packFloat16(aSign, 0x1f, 0);
60011498 2857 }
600e30d2 2858 if (aExp == 0 && aSig == 0) {
60011498
PB
2859 return packFloat16(aSign, 0, 0);
2860 }
2861 /* Decimal point between bits 22 and 23. */
2862 aSig |= 0x00800000;
2863 aExp -= 0x7f;
2864 if (aExp < -14) {
600e30d2
PM
2865 mask = 0x00ffffff;
2866 if (aExp >= -24) {
2867 mask >>= 25 + aExp;
60011498
PB
2868 }
2869 } else {
2870 mask = 0x00001fff;
2871 }
2872 if (aSig & mask) {
2873 float_raise( float_flag_underflow STATUS_VAR );
2874 roundingMode = STATUS(float_rounding_mode);
2875 switch (roundingMode) {
2876 case float_round_nearest_even:
2877 increment = (mask + 1) >> 1;
2878 if ((aSig & mask) == increment) {
2879 increment = aSig & (increment << 1);
2880 }
2881 break;
2882 case float_round_up:
2883 increment = aSign ? 0 : mask;
2884 break;
2885 case float_round_down:
2886 increment = aSign ? mask : 0;
2887 break;
2888 default: /* round_to_zero */
2889 increment = 0;
2890 break;
2891 }
2892 aSig += increment;
2893 if (aSig >= 0x01000000) {
2894 aSig >>= 1;
2895 aExp++;
2896 }
2897 } else if (aExp < -14
2898 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2899 float_raise( float_flag_underflow STATUS_VAR);
2900 }
2901
2902 if (ieee) {
2903 if (aExp > 15) {
2904 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2905 return packFloat16(aSign, 0x1f, 0);
2906 }
2907 } else {
2908 if (aExp > 16) {
600e30d2 2909 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
60011498
PB
2910 return packFloat16(aSign, 0x1f, 0x3ff);
2911 }
2912 }
2913 if (aExp < -24) {
2914 return packFloat16(aSign, 0, 0);
2915 }
2916 if (aExp < -14) {
2917 aSig >>= -14 - aExp;
2918 aExp = -14;
2919 }
2920 return packFloat16(aSign, aExp + 14, aSig >> 13);
2921}
2922
158142c2
FB
2923#ifdef FLOATX80
2924
2925/*----------------------------------------------------------------------------
2926| Returns the result of converting the double-precision floating-point value
2927| `a' to the extended double-precision floating-point format. The conversion
2928| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2929| Arithmetic.
2930*----------------------------------------------------------------------------*/
2931
2932floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2933{
2934 flag aSign;
2935 int16 aExp;
bb98fe42 2936 uint64_t aSig;
158142c2 2937
37d18660 2938 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2939 aSig = extractFloat64Frac( a );
2940 aExp = extractFloat64Exp( a );
2941 aSign = extractFloat64Sign( a );
2942 if ( aExp == 0x7FF ) {
bcd4d9af 2943 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
2944 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2945 }
2946 if ( aExp == 0 ) {
2947 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2948 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2949 }
2950 return
2951 packFloatx80(
2952 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2953
2954}
2955
2956#endif
2957
2958#ifdef FLOAT128
2959
2960/*----------------------------------------------------------------------------
2961| Returns the result of converting the double-precision floating-point value
2962| `a' to the quadruple-precision floating-point format. The conversion is
2963| performed according to the IEC/IEEE Standard for Binary Floating-Point
2964| Arithmetic.
2965*----------------------------------------------------------------------------*/
2966
2967float128 float64_to_float128( float64 a STATUS_PARAM )
2968{
2969 flag aSign;
2970 int16 aExp;
bb98fe42 2971 uint64_t aSig, zSig0, zSig1;
158142c2 2972
37d18660 2973 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2974 aSig = extractFloat64Frac( a );
2975 aExp = extractFloat64Exp( a );
2976 aSign = extractFloat64Sign( a );
2977 if ( aExp == 0x7FF ) {
bcd4d9af 2978 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
2979 return packFloat128( aSign, 0x7FFF, 0, 0 );
2980 }
2981 if ( aExp == 0 ) {
2982 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2983 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2984 --aExp;
2985 }
2986 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2987 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2988
2989}
2990
2991#endif
2992
2993/*----------------------------------------------------------------------------
2994| Rounds the double-precision floating-point value `a' to an integer, and
2995| returns the result as a double-precision floating-point value. The
2996| operation is performed according to the IEC/IEEE Standard for Binary
2997| Floating-Point Arithmetic.
2998*----------------------------------------------------------------------------*/
2999
3000float64 float64_round_to_int( float64 a STATUS_PARAM )
3001{
3002 flag aSign;
3003 int16 aExp;
bb98fe42 3004 uint64_t lastBitMask, roundBitsMask;
158142c2 3005 int8 roundingMode;
bb98fe42 3006 uint64_t z;
37d18660 3007 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3008
3009 aExp = extractFloat64Exp( a );
3010 if ( 0x433 <= aExp ) {
3011 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3012 return propagateFloat64NaN( a, a STATUS_VAR );
3013 }
3014 return a;
3015 }
3016 if ( aExp < 0x3FF ) {
bb98fe42 3017 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
158142c2
FB
3018 STATUS(float_exception_flags) |= float_flag_inexact;
3019 aSign = extractFloat64Sign( a );
3020 switch ( STATUS(float_rounding_mode) ) {
3021 case float_round_nearest_even:
3022 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3023 return packFloat64( aSign, 0x3FF, 0 );
3024 }
3025 break;
3026 case float_round_down:
f090c9d4 3027 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3028 case float_round_up:
f090c9d4
PB
3029 return make_float64(
3030 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3031 }
3032 return packFloat64( aSign, 0, 0 );
3033 }
3034 lastBitMask = 1;
3035 lastBitMask <<= 0x433 - aExp;
3036 roundBitsMask = lastBitMask - 1;
f090c9d4 3037 z = float64_val(a);
158142c2
FB
3038 roundingMode = STATUS(float_rounding_mode);
3039 if ( roundingMode == float_round_nearest_even ) {
3040 z += lastBitMask>>1;
3041 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3042 }
3043 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 3044 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
3045 z += roundBitsMask;
3046 }
3047 }
3048 z &= ~ roundBitsMask;
f090c9d4
PB
3049 if ( z != float64_val(a) )
3050 STATUS(float_exception_flags) |= float_flag_inexact;
3051 return make_float64(z);
158142c2
FB
3052
3053}
3054
e6e5906b
PB
3055float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3056{
3057 int oldmode;
3058 float64 res;
3059 oldmode = STATUS(float_rounding_mode);
3060 STATUS(float_rounding_mode) = float_round_to_zero;
3061 res = float64_round_to_int(a STATUS_VAR);
3062 STATUS(float_rounding_mode) = oldmode;
3063 return res;
3064}
3065
158142c2
FB
3066/*----------------------------------------------------------------------------
3067| Returns the result of adding the absolute values of the double-precision
3068| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3069| before being returned. `zSign' is ignored if the result is a NaN.
3070| The addition is performed according to the IEC/IEEE Standard for Binary
3071| Floating-Point Arithmetic.
3072*----------------------------------------------------------------------------*/
3073
3074static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3075{
3076 int16 aExp, bExp, zExp;
bb98fe42 3077 uint64_t aSig, bSig, zSig;
158142c2
FB
3078 int16 expDiff;
3079
3080 aSig = extractFloat64Frac( a );
3081 aExp = extractFloat64Exp( a );
3082 bSig = extractFloat64Frac( b );
3083 bExp = extractFloat64Exp( b );
3084 expDiff = aExp - bExp;
3085 aSig <<= 9;
3086 bSig <<= 9;
3087 if ( 0 < expDiff ) {
3088 if ( aExp == 0x7FF ) {
3089 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3090 return a;
3091 }
3092 if ( bExp == 0 ) {
3093 --expDiff;
3094 }
3095 else {
3096 bSig |= LIT64( 0x2000000000000000 );
3097 }
3098 shift64RightJamming( bSig, expDiff, &bSig );
3099 zExp = aExp;
3100 }
3101 else if ( expDiff < 0 ) {
3102 if ( bExp == 0x7FF ) {
3103 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3104 return packFloat64( zSign, 0x7FF, 0 );
3105 }
3106 if ( aExp == 0 ) {
3107 ++expDiff;
3108 }
3109 else {
3110 aSig |= LIT64( 0x2000000000000000 );
3111 }
3112 shift64RightJamming( aSig, - expDiff, &aSig );
3113 zExp = bExp;
3114 }
3115 else {
3116 if ( aExp == 0x7FF ) {
3117 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3118 return a;
3119 }
fe76d976
PB
3120 if ( aExp == 0 ) {
3121 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3122 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3123 }
158142c2
FB
3124 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3125 zExp = aExp;
3126 goto roundAndPack;
3127 }
3128 aSig |= LIT64( 0x2000000000000000 );
3129 zSig = ( aSig + bSig )<<1;
3130 --zExp;
bb98fe42 3131 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3132 zSig = aSig + bSig;
3133 ++zExp;
3134 }
3135 roundAndPack:
3136 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3137
3138}
3139
3140/*----------------------------------------------------------------------------
3141| Returns the result of subtracting the absolute values of the double-
3142| precision floating-point values `a' and `b'. If `zSign' is 1, the
3143| difference is negated before being returned. `zSign' is ignored if the
3144| result is a NaN. The subtraction is performed according to the IEC/IEEE
3145| Standard for Binary Floating-Point Arithmetic.
3146*----------------------------------------------------------------------------*/
3147
3148static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3149{
3150 int16 aExp, bExp, zExp;
bb98fe42 3151 uint64_t aSig, bSig, zSig;
158142c2
FB
3152 int16 expDiff;
3153
3154 aSig = extractFloat64Frac( a );
3155 aExp = extractFloat64Exp( a );
3156 bSig = extractFloat64Frac( b );
3157 bExp = extractFloat64Exp( b );
3158 expDiff = aExp - bExp;
3159 aSig <<= 10;
3160 bSig <<= 10;
3161 if ( 0 < expDiff ) goto aExpBigger;
3162 if ( expDiff < 0 ) goto bExpBigger;
3163 if ( aExp == 0x7FF ) {
3164 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3165 float_raise( float_flag_invalid STATUS_VAR);
3166 return float64_default_nan;
3167 }
3168 if ( aExp == 0 ) {
3169 aExp = 1;
3170 bExp = 1;
3171 }
3172 if ( bSig < aSig ) goto aBigger;
3173 if ( aSig < bSig ) goto bBigger;
3174 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3175 bExpBigger:
3176 if ( bExp == 0x7FF ) {
3177 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3178 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3179 }
3180 if ( aExp == 0 ) {
3181 ++expDiff;
3182 }
3183 else {
3184 aSig |= LIT64( 0x4000000000000000 );
3185 }
3186 shift64RightJamming( aSig, - expDiff, &aSig );
3187 bSig |= LIT64( 0x4000000000000000 );
3188 bBigger:
3189 zSig = bSig - aSig;
3190 zExp = bExp;
3191 zSign ^= 1;
3192 goto normalizeRoundAndPack;
3193 aExpBigger:
3194 if ( aExp == 0x7FF ) {
3195 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3196 return a;
3197 }
3198 if ( bExp == 0 ) {
3199 --expDiff;
3200 }
3201 else {
3202 bSig |= LIT64( 0x4000000000000000 );
3203 }
3204 shift64RightJamming( bSig, expDiff, &bSig );
3205 aSig |= LIT64( 0x4000000000000000 );
3206 aBigger:
3207 zSig = aSig - bSig;
3208 zExp = aExp;
3209 normalizeRoundAndPack:
3210 --zExp;
3211 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3212
3213}
3214
3215/*----------------------------------------------------------------------------
3216| Returns the result of adding the double-precision floating-point values `a'
3217| and `b'. The operation is performed according to the IEC/IEEE Standard for
3218| Binary Floating-Point Arithmetic.
3219*----------------------------------------------------------------------------*/
3220
3221float64 float64_add( float64 a, float64 b STATUS_PARAM )
3222{
3223 flag aSign, bSign;
37d18660
PM
3224 a = float64_squash_input_denormal(a STATUS_VAR);
3225 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3226
3227 aSign = extractFloat64Sign( a );
3228 bSign = extractFloat64Sign( b );
3229 if ( aSign == bSign ) {
3230 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3231 }
3232 else {
3233 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3234 }
3235
3236}
3237
3238/*----------------------------------------------------------------------------
3239| Returns the result of subtracting the double-precision floating-point values
3240| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3241| for Binary Floating-Point Arithmetic.
3242*----------------------------------------------------------------------------*/
3243
3244float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3245{
3246 flag aSign, bSign;
37d18660
PM
3247 a = float64_squash_input_denormal(a STATUS_VAR);
3248 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3249
3250 aSign = extractFloat64Sign( a );
3251 bSign = extractFloat64Sign( b );
3252 if ( aSign == bSign ) {
3253 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3254 }
3255 else {
3256 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3257 }
3258
3259}
3260
3261/*----------------------------------------------------------------------------
3262| Returns the result of multiplying the double-precision floating-point values
3263| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3264| for Binary Floating-Point Arithmetic.
3265*----------------------------------------------------------------------------*/
3266
3267float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3268{
3269 flag aSign, bSign, zSign;
3270 int16 aExp, bExp, zExp;
bb98fe42 3271 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3272
37d18660
PM
3273 a = float64_squash_input_denormal(a STATUS_VAR);
3274 b = float64_squash_input_denormal(b STATUS_VAR);
3275
158142c2
FB
3276 aSig = extractFloat64Frac( a );
3277 aExp = extractFloat64Exp( a );
3278 aSign = extractFloat64Sign( a );
3279 bSig = extractFloat64Frac( b );
3280 bExp = extractFloat64Exp( b );
3281 bSign = extractFloat64Sign( b );
3282 zSign = aSign ^ bSign;
3283 if ( aExp == 0x7FF ) {
3284 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3285 return propagateFloat64NaN( a, b STATUS_VAR );
3286 }
3287 if ( ( bExp | bSig ) == 0 ) {
3288 float_raise( float_flag_invalid STATUS_VAR);
3289 return float64_default_nan;
3290 }
3291 return packFloat64( zSign, 0x7FF, 0 );
3292 }
3293 if ( bExp == 0x7FF ) {
3294 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3295 if ( ( aExp | aSig ) == 0 ) {
3296 float_raise( float_flag_invalid STATUS_VAR);
3297 return float64_default_nan;
3298 }
3299 return packFloat64( zSign, 0x7FF, 0 );
3300 }
3301 if ( aExp == 0 ) {
3302 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3303 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3304 }
3305 if ( bExp == 0 ) {
3306 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3307 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3308 }
3309 zExp = aExp + bExp - 0x3FF;
3310 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3311 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3312 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3313 zSig0 |= ( zSig1 != 0 );
bb98fe42 3314 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
3315 zSig0 <<= 1;
3316 --zExp;
3317 }
3318 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3319
3320}
3321
3322/*----------------------------------------------------------------------------
3323| Returns the result of dividing the double-precision floating-point value `a'
3324| by the corresponding value `b'. The operation is performed according to
3325| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3326*----------------------------------------------------------------------------*/
3327
3328float64 float64_div( float64 a, float64 b STATUS_PARAM )
3329{
3330 flag aSign, bSign, zSign;
3331 int16 aExp, bExp, zExp;
bb98fe42
AF
3332 uint64_t aSig, bSig, zSig;
3333 uint64_t rem0, rem1;
3334 uint64_t term0, term1;
37d18660
PM
3335 a = float64_squash_input_denormal(a STATUS_VAR);
3336 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3337
3338 aSig = extractFloat64Frac( a );
3339 aExp = extractFloat64Exp( a );
3340 aSign = extractFloat64Sign( a );
3341 bSig = extractFloat64Frac( b );
3342 bExp = extractFloat64Exp( b );
3343 bSign = extractFloat64Sign( b );
3344 zSign = aSign ^ bSign;
3345 if ( aExp == 0x7FF ) {
3346 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3347 if ( bExp == 0x7FF ) {
3348 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3349 float_raise( float_flag_invalid STATUS_VAR);
3350 return float64_default_nan;
3351 }
3352 return packFloat64( zSign, 0x7FF, 0 );
3353 }
3354 if ( bExp == 0x7FF ) {
3355 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3356 return packFloat64( zSign, 0, 0 );
3357 }
3358 if ( bExp == 0 ) {
3359 if ( bSig == 0 ) {
3360 if ( ( aExp | aSig ) == 0 ) {
3361 float_raise( float_flag_invalid STATUS_VAR);
3362 return float64_default_nan;
3363 }
3364 float_raise( float_flag_divbyzero STATUS_VAR);
3365 return packFloat64( zSign, 0x7FF, 0 );
3366 }
3367 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3368 }
3369 if ( aExp == 0 ) {
3370 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3371 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3372 }
3373 zExp = aExp - bExp + 0x3FD;
3374 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3375 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3376 if ( bSig <= ( aSig + aSig ) ) {
3377 aSig >>= 1;
3378 ++zExp;
3379 }
3380 zSig = estimateDiv128To64( aSig, 0, bSig );
3381 if ( ( zSig & 0x1FF ) <= 2 ) {
3382 mul64To128( bSig, zSig, &term0, &term1 );
3383 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3384 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3385 --zSig;
3386 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3387 }
3388 zSig |= ( rem1 != 0 );
3389 }
3390 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3391
3392}
3393
3394/*----------------------------------------------------------------------------
3395| Returns the remainder of the double-precision floating-point value `a'
3396| with respect to the corresponding value `b'. The operation is performed
3397| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3398*----------------------------------------------------------------------------*/
3399
3400float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3401{
ed086f3d 3402 flag aSign, zSign;
158142c2 3403 int16 aExp, bExp, expDiff;
bb98fe42
AF
3404 uint64_t aSig, bSig;
3405 uint64_t q, alternateASig;
3406 int64_t sigMean;
158142c2 3407
37d18660
PM
3408 a = float64_squash_input_denormal(a STATUS_VAR);
3409 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3410 aSig = extractFloat64Frac( a );
3411 aExp = extractFloat64Exp( a );
3412 aSign = extractFloat64Sign( a );
3413 bSig = extractFloat64Frac( b );
3414 bExp = extractFloat64Exp( b );
158142c2
FB
3415 if ( aExp == 0x7FF ) {
3416 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3417 return propagateFloat64NaN( a, b STATUS_VAR );
3418 }
3419 float_raise( float_flag_invalid STATUS_VAR);
3420 return float64_default_nan;
3421 }
3422 if ( bExp == 0x7FF ) {
3423 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3424 return a;
3425 }
3426 if ( bExp == 0 ) {
3427 if ( bSig == 0 ) {
3428 float_raise( float_flag_invalid STATUS_VAR);
3429 return float64_default_nan;
3430 }
3431 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3432 }
3433 if ( aExp == 0 ) {
3434 if ( aSig == 0 ) return a;
3435 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3436 }
3437 expDiff = aExp - bExp;
3438 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3439 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3440 if ( expDiff < 0 ) {
3441 if ( expDiff < -1 ) return a;
3442 aSig >>= 1;
3443 }
3444 q = ( bSig <= aSig );
3445 if ( q ) aSig -= bSig;
3446 expDiff -= 64;
3447 while ( 0 < expDiff ) {
3448 q = estimateDiv128To64( aSig, 0, bSig );
3449 q = ( 2 < q ) ? q - 2 : 0;
3450 aSig = - ( ( bSig>>2 ) * q );
3451 expDiff -= 62;
3452 }
3453 expDiff += 64;
3454 if ( 0 < expDiff ) {
3455 q = estimateDiv128To64( aSig, 0, bSig );
3456 q = ( 2 < q ) ? q - 2 : 0;
3457 q >>= 64 - expDiff;
3458 bSig >>= 2;
3459 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3460 }
3461 else {
3462 aSig >>= 2;
3463 bSig >>= 2;
3464 }
3465 do {
3466 alternateASig = aSig;
3467 ++q;
3468 aSig -= bSig;
bb98fe42 3469 } while ( 0 <= (int64_t) aSig );
158142c2
FB
3470 sigMean = aSig + alternateASig;
3471 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3472 aSig = alternateASig;
3473 }
bb98fe42 3474 zSign = ( (int64_t) aSig < 0 );
158142c2
FB
3475 if ( zSign ) aSig = - aSig;
3476 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3477
3478}
3479
3480/*----------------------------------------------------------------------------
3481| Returns the square root of the double-precision floating-point value `a'.
3482| The operation is performed according to the IEC/IEEE Standard for Binary
3483| Floating-Point Arithmetic.
3484*----------------------------------------------------------------------------*/
3485
3486float64 float64_sqrt( float64 a STATUS_PARAM )
3487{
3488 flag aSign;
3489 int16 aExp, zExp;
bb98fe42
AF
3490 uint64_t aSig, zSig, doubleZSig;
3491 uint64_t rem0, rem1, term0, term1;
37d18660 3492 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3493
3494 aSig = extractFloat64Frac( a );
3495 aExp = extractFloat64Exp( a );
3496 aSign = extractFloat64Sign( a );
3497 if ( aExp == 0x7FF ) {
3498 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3499 if ( ! aSign ) return a;
3500 float_raise( float_flag_invalid STATUS_VAR);
3501 return float64_default_nan;
3502 }
3503 if ( aSign ) {
3504 if ( ( aExp | aSig ) == 0 ) return a;
3505 float_raise( float_flag_invalid STATUS_VAR);
3506 return float64_default_nan;
3507 }
3508 if ( aExp == 0 ) {
f090c9d4 3509 if ( aSig == 0 ) return float64_zero;
158142c2
FB
3510 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3511 }
3512 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3513 aSig |= LIT64( 0x0010000000000000 );
3514 zSig = estimateSqrt32( aExp, aSig>>21 );
3515 aSig <<= 9 - ( aExp & 1 );
3516 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3517 if ( ( zSig & 0x1FF ) <= 5 ) {
3518 doubleZSig = zSig<<1;
3519 mul64To128( zSig, zSig, &term0, &term1 );
3520 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3521 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3522 --zSig;
3523 doubleZSig -= 2;
3524 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3525 }
3526 zSig |= ( ( rem0 | rem1 ) != 0 );
3527 }
3528 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3529
3530}
3531
374dfc33
AJ
3532/*----------------------------------------------------------------------------
3533| Returns the binary log of the double-precision floating-point value `a'.
3534| The operation is performed according to the IEC/IEEE Standard for Binary
3535| Floating-Point Arithmetic.
3536*----------------------------------------------------------------------------*/
3537float64 float64_log2( float64 a STATUS_PARAM )
3538{
3539 flag aSign, zSign;
3540 int16 aExp;
bb98fe42 3541 uint64_t aSig, aSig0, aSig1, zSig, i;
37d18660 3542 a = float64_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
3543
3544 aSig = extractFloat64Frac( a );
3545 aExp = extractFloat64Exp( a );
3546 aSign = extractFloat64Sign( a );
3547
3548 if ( aExp == 0 ) {
3549 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3550 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3551 }
3552 if ( aSign ) {
3553 float_raise( float_flag_invalid STATUS_VAR);
3554 return float64_default_nan;
3555 }
3556 if ( aExp == 0x7FF ) {
3557 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3558 return a;
3559 }
3560
3561 aExp -= 0x3FF;
3562 aSig |= LIT64( 0x0010000000000000 );
3563 zSign = aExp < 0;
bb98fe42 3564 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
3565 for (i = 1LL << 51; i > 0; i >>= 1) {
3566 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3567 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3568 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3569 aSig >>= 1;
3570 zSig |= i;
3571 }
3572 }
3573
3574 if ( zSign )
3575 zSig = -zSig;
3576 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3577}
3578
158142c2
FB
3579/*----------------------------------------------------------------------------
3580| Returns 1 if the double-precision floating-point value `a' is equal to the
3581| corresponding value `b', and 0 otherwise. The comparison is performed
3582| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3583*----------------------------------------------------------------------------*/
3584
211315fb 3585int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
158142c2 3586{
bb98fe42 3587 uint64_t av, bv;
37d18660
PM
3588 a = float64_squash_input_denormal(a STATUS_VAR);
3589 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3590
3591 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3592 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3593 ) {
3594 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3595 float_raise( float_flag_invalid STATUS_VAR);
3596 }
3597 return 0;
3598 }
f090c9d4 3599 av = float64_val(a);
a1b91bb4 3600 bv = float64_val(b);
bb98fe42 3601 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
3602
3603}
3604
3605/*----------------------------------------------------------------------------
3606| Returns 1 if the double-precision floating-point value `a' is less than or
3607| equal to the corresponding value `b', and 0 otherwise. The comparison is
3608| performed according to the IEC/IEEE Standard for Binary Floating-Point
3609| Arithmetic.
3610*----------------------------------------------------------------------------*/
3611
750afe93 3612int float64_le( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3613{
3614 flag aSign, bSign;
bb98fe42 3615 uint64_t av, bv;
37d18660
PM
3616 a = float64_squash_input_denormal(a STATUS_VAR);
3617 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3618
3619 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3620 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3621 ) {
3622 float_raise( float_flag_invalid STATUS_VAR);
3623 return 0;
3624 }
3625 aSign = extractFloat64Sign( a );
3626 bSign = extractFloat64Sign( b );
f090c9d4 3627 av = float64_val(a);
a1b91bb4 3628 bv = float64_val(b);
bb98fe42 3629 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3630 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3631
3632}
3633
3634/*----------------------------------------------------------------------------
3635| Returns 1 if the double-precision floating-point value `a' is less than
3636| the corresponding value `b', and 0 otherwise. The comparison is performed
3637| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3638*----------------------------------------------------------------------------*/
3639
750afe93 3640int float64_lt( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3641{
3642 flag aSign, bSign;
bb98fe42 3643 uint64_t av, bv;
158142c2 3644
37d18660
PM
3645 a = float64_squash_input_denormal(a STATUS_VAR);
3646 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3647 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3648 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3649 ) {
3650 float_raise( float_flag_invalid STATUS_VAR);
3651 return 0;
3652 }
3653 aSign = extractFloat64Sign( a );
3654 bSign = extractFloat64Sign( b );
f090c9d4 3655 av = float64_val(a);
a1b91bb4 3656 bv = float64_val(b);
bb98fe42 3657 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3658 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3659
3660}
3661
67b7861d
AJ
3662/*----------------------------------------------------------------------------
3663| Returns 1 if the double-precision floating-point values `a' and `b' cannot
3664| be compared, and 0 otherwise. The comparison is performed according to the
3665| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3666*----------------------------------------------------------------------------*/
3667
3668int float64_unordered( float64 a, float64 b STATUS_PARAM )
3669{
3670 a = float64_squash_input_denormal(a STATUS_VAR);
3671 b = float64_squash_input_denormal(b STATUS_VAR);
3672
3673 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3674 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3675 ) {
3676 float_raise( float_flag_invalid STATUS_VAR);
3677 return 1;
3678 }
3679 return 0;
3680}
3681
158142c2
FB
3682/*----------------------------------------------------------------------------
3683| Returns 1 if the double-precision floating-point value `a' is equal to the
3684| corresponding value `b', and 0 otherwise. The invalid exception is raised
3685| if either operand is a NaN. Otherwise, the comparison is performed
3686| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3687*----------------------------------------------------------------------------*/
3688
2657d0ff 3689int float64_eq( float64 a, float64 b STATUS_PARAM )
158142c2 3690{
bb98fe42 3691 uint64_t av, bv;
37d18660
PM
3692 a = float64_squash_input_denormal(a STATUS_VAR);
3693 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3694
3695 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3696 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3697 ) {
3698 float_raise( float_flag_invalid STATUS_VAR);
3699 return 0;
3700 }
f090c9d4 3701 av = float64_val(a);
a1b91bb4 3702 bv = float64_val(b);
bb98fe42 3703 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
3704
3705}
3706
3707/*----------------------------------------------------------------------------
3708| Returns 1 if the double-precision floating-point value `a' is less than or
3709| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3710| cause an exception. Otherwise, the comparison is performed according to the
3711| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3712*----------------------------------------------------------------------------*/
3713
750afe93 3714int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3715{
3716 flag aSign, bSign;
bb98fe42 3717 uint64_t av, bv;
37d18660
PM
3718 a = float64_squash_input_denormal(a STATUS_VAR);
3719 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3720
3721 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3722 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3723 ) {
3724 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3725 float_raise( float_flag_invalid STATUS_VAR);
3726 }
3727 return 0;
3728 }
3729 aSign = extractFloat64Sign( a );
3730 bSign = extractFloat64Sign( b );
f090c9d4 3731 av = float64_val(a);
a1b91bb4 3732 bv = float64_val(b);
bb98fe42 3733 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3734 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3735
3736}
3737
3738/*----------------------------------------------------------------------------
3739| Returns 1 if the double-precision floating-point value `a' is less than
3740| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3741| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3742| Standard for Binary Floating-Point Arithmetic.
3743*----------------------------------------------------------------------------*/
3744
750afe93 3745int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3746{
3747 flag aSign, bSign;
bb98fe42 3748 uint64_t av, bv;
37d18660
PM
3749 a = float64_squash_input_denormal(a STATUS_VAR);
3750 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3751
3752 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3753 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3754 ) {
3755 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3756 float_raise( float_flag_invalid STATUS_VAR);
3757 }
3758 return 0;
3759 }
3760 aSign = extractFloat64Sign( a );
3761 bSign = extractFloat64Sign( b );
f090c9d4 3762 av = float64_val(a);
a1b91bb4 3763 bv = float64_val(b);
bb98fe42 3764 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3765 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3766
3767}
3768
67b7861d
AJ
3769/*----------------------------------------------------------------------------
3770| Returns 1 if the double-precision floating-point values `a' and `b' cannot
3771| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3772| comparison is performed according to the IEC/IEEE Standard for Binary
3773| Floating-Point Arithmetic.
3774*----------------------------------------------------------------------------*/
3775
3776int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3777{
3778 a = float64_squash_input_denormal(a STATUS_VAR);
3779 b = float64_squash_input_denormal(b STATUS_VAR);
3780
3781 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3782 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3783 ) {
3784 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3785 float_raise( float_flag_invalid STATUS_VAR);
3786 }
3787 return 1;
3788 }
3789 return 0;
3790}
3791
158142c2
FB
3792#ifdef FLOATX80
3793
3794/*----------------------------------------------------------------------------
3795| Returns the result of converting the extended double-precision floating-
3796| point value `a' to the 32-bit two's complement integer format. The
3797| conversion is performed according to the IEC/IEEE Standard for Binary
3798| Floating-Point Arithmetic---which means in particular that the conversion
3799| is rounded according to the current rounding mode. If `a' is a NaN, the
3800| largest positive integer is returned. Otherwise, if the conversion
3801| overflows, the largest integer with the same sign as `a' is returned.
3802*----------------------------------------------------------------------------*/
3803
3804int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3805{
3806 flag aSign;
3807 int32 aExp, shiftCount;
bb98fe42 3808 uint64_t aSig;
158142c2
FB
3809
3810 aSig = extractFloatx80Frac( a );
3811 aExp = extractFloatx80Exp( a );
3812 aSign = extractFloatx80Sign( a );
bb98fe42 3813 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
3814 shiftCount = 0x4037 - aExp;
3815 if ( shiftCount <= 0 ) shiftCount = 1;
3816 shift64RightJamming( aSig, shiftCount, &aSig );
3817 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3818
3819}
3820
3821/*----------------------------------------------------------------------------
3822| Returns the result of converting the extended double-precision floating-
3823| point value `a' to the 32-bit two's complement integer format. The
3824| conversion is performed according to the IEC/IEEE Standard for Binary
3825| Floating-Point Arithmetic, except that the conversion is always rounded
3826| toward zero. If `a' is a NaN, the largest positive integer is returned.
3827| Otherwise, if the conversion overflows, the largest integer with the same
3828| sign as `a' is returned.
3829*----------------------------------------------------------------------------*/
3830
3831int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3832{
3833 flag aSign;
3834 int32 aExp, shiftCount;
bb98fe42 3835 uint64_t aSig, savedASig;
158142c2
FB
3836 int32 z;
3837
3838 aSig = extractFloatx80Frac( a );
3839 aExp = extractFloatx80Exp( a );
3840 aSign = extractFloatx80Sign( a );
3841 if ( 0x401E < aExp ) {
bb98fe42 3842 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
3843 goto invalid;
3844 }
3845 else if ( aExp < 0x3FFF ) {
3846 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3847 return 0;
3848 }
3849 shiftCount = 0x403E - aExp;
3850 savedASig = aSig;
3851 aSig >>= shiftCount;
3852 z = aSig;
3853 if ( aSign ) z = - z;
3854 if ( ( z < 0 ) ^ aSign ) {
3855 invalid:
3856 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 3857 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
3858 }
3859 if ( ( aSig<<shiftCount ) != savedASig ) {
3860 STATUS(float_exception_flags) |= float_flag_inexact;
3861 }
3862 return z;
3863
3864}
3865
3866/*----------------------------------------------------------------------------
3867| Returns the result of converting the extended double-precision floating-
3868| point value `a' to the 64-bit two's complement integer format. The
3869| conversion is performed according to the IEC/IEEE Standard for Binary
3870| Floating-Point Arithmetic---which means in particular that the conversion
3871| is rounded according to the current rounding mode. If `a' is a NaN,
3872| the largest positive integer is returned. Otherwise, if the conversion
3873| overflows, the largest integer with the same sign as `a' is returned.
3874*----------------------------------------------------------------------------*/
3875
3876int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3877{
3878 flag aSign;
3879 int32 aExp, shiftCount;
bb98fe42 3880 uint64_t aSig, aSigExtra;
158142c2
FB
3881
3882 aSig = extractFloatx80Frac( a );
3883 aExp = extractFloatx80Exp( a );
3884 aSign = extractFloatx80Sign( a );
3885 shiftCount = 0x403E - aExp;
3886 if ( shiftCount <= 0 ) {
3887 if ( shiftCount ) {
3888 float_raise( float_flag_invalid STATUS_VAR);
3889 if ( ! aSign
3890 || ( ( aExp == 0x7FFF )
3891 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3892 ) {
3893 return LIT64( 0x7FFFFFFFFFFFFFFF );
3894 }
bb98fe42 3895 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3896 }
3897 aSigExtra = 0;
3898 }
3899 else {
3900 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3901 }
3902 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3903
3904}
3905
3906/*----------------------------------------------------------------------------
3907| Returns the result of converting the extended double-precision floating-
3908| point value `a' to the 64-bit two's complement integer format. The
3909| conversion is performed according to the IEC/IEEE Standard for Binary
3910| Floating-Point Arithmetic, except that the conversion is always rounded
3911| toward zero. If `a' is a NaN, the largest positive integer is returned.
3912| Otherwise, if the conversion overflows, the largest integer with the same
3913| sign as `a' is returned.
3914*----------------------------------------------------------------------------*/
3915
3916int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3917{
3918 flag aSign;
3919 int32 aExp, shiftCount;
bb98fe42 3920 uint64_t aSig;
158142c2
FB
3921 int64 z;
3922
3923 aSig = extractFloatx80Frac( a );
3924 aExp = extractFloatx80Exp( a );
3925 aSign = extractFloatx80Sign( a );
3926 shiftCount = aExp - 0x403E;
3927 if ( 0 <= shiftCount ) {
3928 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3929 if ( ( a.high != 0xC03E ) || aSig ) {
3930 float_raise( float_flag_invalid STATUS_VAR);
3931 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3932 return LIT64( 0x7FFFFFFFFFFFFFFF );
3933 }
3934 }
bb98fe42 3935 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3936 }
3937 else if ( aExp < 0x3FFF ) {
3938 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3939 return 0;
3940 }
3941 z = aSig>>( - shiftCount );
bb98fe42 3942 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
3943 STATUS(float_exception_flags) |= float_flag_inexact;
3944 }
3945 if ( aSign ) z = - z;
3946 return z;
3947
3948}
3949
3950/*----------------------------------------------------------------------------
3951| Returns the result of converting the extended double-precision floating-
3952| point value `a' to the single-precision floating-point format. The
3953| conversion is performed according to the IEC/IEEE Standard for Binary
3954| Floating-Point Arithmetic.
3955*----------------------------------------------------------------------------*/
3956
3957float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3958{
3959 flag aSign;
3960 int32 aExp;
bb98fe42 3961 uint64_t aSig;
158142c2
FB
3962
3963 aSig = extractFloatx80Frac( a );
3964 aExp = extractFloatx80Exp( a );
3965 aSign = extractFloatx80Sign( a );
3966 if ( aExp == 0x7FFF ) {
bb98fe42 3967 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 3968 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3969 }
3970 return packFloat32( aSign, 0xFF, 0 );
3971 }
3972 shift64RightJamming( aSig, 33, &aSig );
3973 if ( aExp || aSig ) aExp -= 0x3F81;
3974 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3975
3976}
3977
3978/*----------------------------------------------------------------------------
3979| Returns the result of converting the extended double-precision floating-
3980| point value `a' to the double-precision floating-point format. The
3981| conversion is performed according to the IEC/IEEE Standard for Binary
3982| Floating-Point Arithmetic.
3983*----------------------------------------------------------------------------*/
3984
3985float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3986{
3987 flag aSign;
3988 int32 aExp;
bb98fe42 3989 uint64_t aSig, zSig;
158142c2
FB
3990
3991 aSig = extractFloatx80Frac( a );
3992 aExp = extractFloatx80Exp( a );
3993 aSign = extractFloatx80Sign( a );
3994 if ( aExp == 0x7FFF ) {
bb98fe42 3995 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 3996 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3997 }
3998 return packFloat64( aSign, 0x7FF, 0 );
3999 }
4000 shift64RightJamming( aSig, 1, &zSig );
4001 if ( aExp || aSig ) aExp -= 0x3C01;
4002 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4003
4004}
4005
4006#ifdef FLOAT128
4007
4008/*----------------------------------------------------------------------------
4009| Returns the result of converting the extended double-precision floating-
4010| point value `a' to the quadruple-precision floating-point format. The
4011| conversion is performed according to the IEC/IEEE Standard for Binary
4012| Floating-Point Arithmetic.
4013*----------------------------------------------------------------------------*/
4014
4015float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4016{
4017 flag aSign;
4018 int16 aExp;
bb98fe42 4019 uint64_t aSig, zSig0, zSig1;
158142c2
FB
4020
4021 aSig = extractFloatx80Frac( a );
4022 aExp = extractFloatx80Exp( a );
4023 aSign = extractFloatx80Sign( a );
bb98fe42 4024 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4025 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4026 }
4027 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4028 return packFloat128( aSign, aExp, zSig0, zSig1 );
4029
4030}
4031
4032#endif
4033
4034/*----------------------------------------------------------------------------
4035| Rounds the extended double-precision floating-point value `a' to an integer,
4036| and returns the result as an extended quadruple-precision floating-point
4037| value. The operation is performed according to the IEC/IEEE Standard for
4038| Binary Floating-Point Arithmetic.
4039*----------------------------------------------------------------------------*/
4040
4041floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4042{
4043 flag aSign;
4044 int32 aExp;
bb98fe42 4045 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4046 int8 roundingMode;
4047 floatx80 z;
4048
4049 aExp = extractFloatx80Exp( a );
4050 if ( 0x403E <= aExp ) {
bb98fe42 4051 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
158142c2
FB
4052 return propagateFloatx80NaN( a, a STATUS_VAR );
4053 }
4054 return a;
4055 }
4056 if ( aExp < 0x3FFF ) {
4057 if ( ( aExp == 0 )
bb98fe42 4058 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4059 return a;
4060 }
4061 STATUS(float_exception_flags) |= float_flag_inexact;
4062 aSign = extractFloatx80Sign( a );
4063 switch ( STATUS(float_rounding_mode) ) {
4064 case float_round_nearest_even:
bb98fe42 4065 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4066 ) {
4067 return
4068 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4069 }
4070 break;
4071 case float_round_down:
4072 return
4073 aSign ?
4074 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4075 : packFloatx80( 0, 0, 0 );
4076 case float_round_up:
4077 return
4078 aSign ? packFloatx80( 1, 0, 0 )
4079 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4080 }
4081 return packFloatx80( aSign, 0, 0 );
4082 }
4083 lastBitMask = 1;
4084 lastBitMask <<= 0x403E - aExp;
4085 roundBitsMask = lastBitMask - 1;
4086 z = a;
4087 roundingMode = STATUS(float_rounding_mode);
4088 if ( roundingMode == float_round_nearest_even ) {
4089 z.low += lastBitMask>>1;
4090 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4091 }
4092 else if ( roundingMode != float_round_to_zero ) {
4093 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4094 z.low += roundBitsMask;
4095 }
4096 }
4097 z.low &= ~ roundBitsMask;
4098 if ( z.low == 0 ) {
4099 ++z.high;
4100 z.low = LIT64( 0x8000000000000000 );
4101 }
4102 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4103 return z;
4104
4105}
4106
4107/*----------------------------------------------------------------------------
4108| Returns the result of adding the absolute values of the extended double-
4109| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4110| negated before being returned. `zSign' is ignored if the result is a NaN.
4111| The addition is performed according to the IEC/IEEE Standard for Binary
4112| Floating-Point Arithmetic.
4113*----------------------------------------------------------------------------*/
4114
4115static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4116{
4117 int32 aExp, bExp, zExp;
bb98fe42 4118 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4119 int32 expDiff;
4120
4121 aSig = extractFloatx80Frac( a );
4122 aExp = extractFloatx80Exp( a );
4123 bSig = extractFloatx80Frac( b );
4124 bExp = extractFloatx80Exp( b );
4125 expDiff = aExp - bExp;
4126 if ( 0 < expDiff ) {
4127 if ( aExp == 0x7FFF ) {
bb98fe42 4128 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4129 return a;
4130 }
4131 if ( bExp == 0 ) --expDiff;
4132 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4133 zExp = aExp;
4134 }
4135 else if ( expDiff < 0 ) {
4136 if ( bExp == 0x7FFF ) {
bb98fe42 4137 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4138 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4139 }
4140 if ( aExp == 0 ) ++expDiff;
4141 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4142 zExp = bExp;
4143 }
4144 else {
4145 if ( aExp == 0x7FFF ) {
bb98fe42 4146 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4147 return propagateFloatx80NaN( a, b STATUS_VAR );
4148 }
4149 return a;
4150 }
4151 zSig1 = 0;
4152 zSig0 = aSig + bSig;
4153 if ( aExp == 0 ) {
4154 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4155 goto roundAndPack;
4156 }
4157 zExp = aExp;
4158 goto shiftRight1;
4159 }
4160 zSig0 = aSig + bSig;
bb98fe42 4161 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
4162 shiftRight1:
4163 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4164 zSig0 |= LIT64( 0x8000000000000000 );
4165 ++zExp;
4166 roundAndPack:
4167 return
4168 roundAndPackFloatx80(
4169 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4170
4171}
4172
4173/*----------------------------------------------------------------------------
4174| Returns the result of subtracting the absolute values of the extended
4175| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4176| difference is negated before being returned. `zSign' is ignored if the
4177| result is a NaN. The subtraction is performed according to the IEC/IEEE
4178| Standard for Binary Floating-Point Arithmetic.
4179*----------------------------------------------------------------------------*/
4180
4181static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4182{
4183 int32 aExp, bExp, zExp;
bb98fe42 4184 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4185 int32 expDiff;
4186 floatx80 z;
4187
4188 aSig = extractFloatx80Frac( a );
4189 aExp = extractFloatx80Exp( a );
4190 bSig = extractFloatx80Frac( b );
4191 bExp = extractFloatx80Exp( b );
4192 expDiff = aExp - bExp;
4193 if ( 0 < expDiff ) goto aExpBigger;
4194 if ( expDiff < 0 ) goto bExpBigger;
4195 if ( aExp == 0x7FFF ) {
bb98fe42 4196 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4197 return propagateFloatx80NaN( a, b STATUS_VAR );
4198 }
4199 float_raise( float_flag_invalid STATUS_VAR);
4200 z.low = floatx80_default_nan_low;
4201 z.high = floatx80_default_nan_high;
4202 return z;
4203 }
4204 if ( aExp == 0 ) {
4205 aExp = 1;
4206 bExp = 1;
4207 }
4208 zSig1 = 0;
4209 if ( bSig < aSig ) goto aBigger;
4210 if ( aSig < bSig ) goto bBigger;
4211 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4212 bExpBigger:
4213 if ( bExp == 0x7FFF ) {
bb98fe42 4214 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4215 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4216 }
4217 if ( aExp == 0 ) ++expDiff;
4218 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4219 bBigger:
4220 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4221 zExp = bExp;
4222 zSign ^= 1;
4223 goto normalizeRoundAndPack;
4224 aExpBigger:
4225 if ( aExp == 0x7FFF ) {
bb98fe42 4226 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4227 return a;
4228 }
4229 if ( bExp == 0 ) --expDiff;
4230 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4231 aBigger:
4232 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4233 zExp = aExp;
4234 normalizeRoundAndPack:
4235 return
4236 normalizeRoundAndPackFloatx80(
4237 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4238
4239}
4240
4241/*----------------------------------------------------------------------------
4242| Returns the result of adding the extended double-precision floating-point
4243| values `a' and `b'. The operation is performed according to the IEC/IEEE
4244| Standard for Binary Floating-Point Arithmetic.
4245*----------------------------------------------------------------------------*/
4246
4247floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4248{
4249 flag aSign, bSign;
4250
4251 aSign = extractFloatx80Sign( a );
4252 bSign = extractFloatx80Sign( b );
4253 if ( aSign == bSign ) {
4254 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4255 }
4256 else {
4257 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4258 }
4259
4260}
4261
4262/*----------------------------------------------------------------------------
4263| Returns the result of subtracting the extended double-precision floating-
4264| point values `a' and `b'. The operation is performed according to the
4265| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4266*----------------------------------------------------------------------------*/
4267
4268floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4269{
4270 flag aSign, bSign;
4271
4272 aSign = extractFloatx80Sign( a );
4273 bSign = extractFloatx80Sign( b );
4274 if ( aSign == bSign ) {
4275 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4276 }
4277 else {
4278 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4279 }
4280
4281}
4282
4283/*----------------------------------------------------------------------------
4284| Returns the result of multiplying the extended double-precision floating-
4285| point values `a' and `b'. The operation is performed according to the
4286| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4287*----------------------------------------------------------------------------*/
4288
4289floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4290{
4291 flag aSign, bSign, zSign;
4292 int32 aExp, bExp, zExp;
bb98fe42 4293 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4294 floatx80 z;
4295
4296 aSig = extractFloatx80Frac( a );
4297 aExp = extractFloatx80Exp( a );
4298 aSign = extractFloatx80Sign( a );
4299 bSig = extractFloatx80Frac( b );
4300 bExp = extractFloatx80Exp( b );
4301 bSign = extractFloatx80Sign( b );
4302 zSign = aSign ^ bSign;
4303 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4304 if ( (uint64_t) ( aSig<<1 )
4305 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4306 return propagateFloatx80NaN( a, b STATUS_VAR );
4307 }
4308 if ( ( bExp | bSig ) == 0 ) goto invalid;
4309 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4310 }
4311 if ( bExp == 0x7FFF ) {
bb98fe42 4312 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4313 if ( ( aExp | aSig ) == 0 ) {
4314 invalid:
4315 float_raise( float_flag_invalid STATUS_VAR);
4316 z.low = floatx80_default_nan_low;
4317 z.high = floatx80_default_nan_high;
4318 return z;
4319 }
4320 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4321 }
4322 if ( aExp == 0 ) {
4323 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4324 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4325 }
4326 if ( bExp == 0 ) {
4327 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4328 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4329 }
4330 zExp = aExp + bExp - 0x3FFE;
4331 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 4332 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
4333 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4334 --zExp;
4335 }
4336 return
4337 roundAndPackFloatx80(
4338 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4339
4340}
4341
4342/*----------------------------------------------------------------------------
4343| Returns the result of dividing the extended double-precision floating-point
4344| value `a' by the corresponding value `b'. The operation is performed
4345| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4346*----------------------------------------------------------------------------*/
4347
4348floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4349{
4350 flag aSign, bSign, zSign;
4351 int32 aExp, bExp, zExp;
bb98fe42
AF
4352 uint64_t aSig, bSig, zSig0, zSig1;
4353 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
4354 floatx80 z;
4355
4356 aSig = extractFloatx80Frac( a );
4357 aExp = extractFloatx80Exp( a );
4358 aSign = extractFloatx80Sign( a );
4359 bSig = extractFloatx80Frac( b );
4360 bExp = extractFloatx80Exp( b );
4361 bSign = extractFloatx80Sign( b );
4362 zSign = aSign ^ bSign;
4363 if ( aExp == 0x7FFF ) {
bb98fe42 4364 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2 4365 if ( bExp == 0x7FFF ) {
bb98fe42 4366 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4367 goto invalid;
4368 }
4369 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4370 }
4371 if ( bExp == 0x7FFF ) {
bb98fe42 4372 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4373 return packFloatx80( zSign, 0, 0 );
4374 }
4375 if ( bExp == 0 ) {
4376 if ( bSig == 0 ) {
4377 if ( ( aExp | aSig ) == 0 ) {
4378 invalid:
4379 float_raise( float_flag_invalid STATUS_VAR);
4380 z.low = floatx80_default_nan_low;
4381 z.high = floatx80_default_nan_high;
4382 return z;
4383 }
4384 float_raise( float_flag_divbyzero STATUS_VAR);
4385 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4386 }
4387 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4388 }
4389 if ( aExp == 0 ) {
4390 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4391 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4392 }
4393 zExp = aExp - bExp + 0x3FFE;
4394 rem1 = 0;
4395 if ( bSig <= aSig ) {
4396 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4397 ++zExp;
4398 }
4399 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4400 mul64To128( bSig, zSig0, &term0, &term1 );
4401 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 4402 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4403 --zSig0;
4404 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4405 }
4406 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 4407 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
4408 mul64To128( bSig, zSig1, &term1, &term2 );
4409 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 4410 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
4411 --zSig1;
4412 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4413 }
4414 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4415 }
4416 return
4417 roundAndPackFloatx80(
4418 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4419
4420}
4421
4422/*----------------------------------------------------------------------------
4423| Returns the remainder of the extended double-precision floating-point value
4424| `a' with respect to the corresponding value `b'. The operation is performed
4425| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4426*----------------------------------------------------------------------------*/
4427
4428floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4429{
ed086f3d 4430 flag aSign, zSign;
158142c2 4431 int32 aExp, bExp, expDiff;
bb98fe42
AF
4432 uint64_t aSig0, aSig1, bSig;
4433 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
4434 floatx80 z;
4435
4436 aSig0 = extractFloatx80Frac( a );
4437 aExp = extractFloatx80Exp( a );
4438 aSign = extractFloatx80Sign( a );
4439 bSig = extractFloatx80Frac( b );
4440 bExp = extractFloatx80Exp( b );
158142c2 4441 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4442 if ( (uint64_t) ( aSig0<<1 )
4443 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4444 return propagateFloatx80NaN( a, b STATUS_VAR );
4445 }
4446 goto invalid;
4447 }
4448 if ( bExp == 0x7FFF ) {
bb98fe42 4449 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4450 return a;
4451 }
4452 if ( bExp == 0 ) {
4453 if ( bSig == 0 ) {
4454 invalid:
4455 float_raise( float_flag_invalid STATUS_VAR);
4456 z.low = floatx80_default_nan_low;
4457 z.high = floatx80_default_nan_high;
4458 return z;
4459 }
4460 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4461 }
4462 if ( aExp == 0 ) {
bb98fe42 4463 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
4464 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4465 }
4466 bSig |= LIT64( 0x8000000000000000 );
4467 zSign = aSign;
4468 expDiff = aExp - bExp;
4469 aSig1 = 0;
4470 if ( expDiff < 0 ) {
4471 if ( expDiff < -1 ) return a;
4472 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4473 expDiff = 0;
4474 }
4475 q = ( bSig <= aSig0 );
4476 if ( q ) aSig0 -= bSig;
4477 expDiff -= 64;
4478 while ( 0 < expDiff ) {
4479 q = estimateDiv128To64( aSig0, aSig1, bSig );
4480 q = ( 2 < q ) ? q - 2 : 0;
4481 mul64To128( bSig, q, &term0, &term1 );
4482 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4483 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4484 expDiff -= 62;
4485 }
4486 expDiff += 64;
4487 if ( 0 < expDiff ) {
4488 q = estimateDiv128To64( aSig0, aSig1, bSig );
4489 q = ( 2 < q ) ? q - 2 : 0;
4490 q >>= 64 - expDiff;
4491 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4492 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4493 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4494 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4495 ++q;
4496 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4497 }
4498 }
4499 else {
4500 term1 = 0;
4501 term0 = bSig;
4502 }
4503 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4504 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4505 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4506 && ( q & 1 ) )
4507 ) {
4508 aSig0 = alternateASig0;
4509 aSig1 = alternateASig1;
4510 zSign = ! zSign;
4511 }
4512 return
4513 normalizeRoundAndPackFloatx80(
4514 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4515
4516}
4517
4518/*----------------------------------------------------------------------------
4519| Returns the square root of the extended double-precision floating-point
4520| value `a'. The operation is performed according to the IEC/IEEE Standard
4521| for Binary Floating-Point Arithmetic.
4522*----------------------------------------------------------------------------*/
4523
4524floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4525{
4526 flag aSign;
4527 int32 aExp, zExp;
bb98fe42
AF
4528 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4529 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
4530 floatx80 z;
4531
4532 aSig0 = extractFloatx80Frac( a );
4533 aExp = extractFloatx80Exp( a );
4534 aSign = extractFloatx80Sign( a );
4535 if ( aExp == 0x7FFF ) {
bb98fe42 4536 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
158142c2
FB
4537 if ( ! aSign ) return a;
4538 goto invalid;
4539 }
4540 if ( aSign ) {
4541 if ( ( aExp | aSig0 ) == 0 ) return a;
4542 invalid:
4543 float_raise( float_flag_invalid STATUS_VAR);
4544 z.low = floatx80_default_nan_low;
4545 z.high = floatx80_default_nan_high;
4546 return z;
4547 }
4548 if ( aExp == 0 ) {
4549 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4550 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4551 }
4552 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4553 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4554 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4555 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4556 doubleZSig0 = zSig0<<1;
4557 mul64To128( zSig0, zSig0, &term0, &term1 );
4558 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 4559 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4560 --zSig0;
4561 doubleZSig0 -= 2;
4562 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4563 }
4564 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4565 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4566 if ( zSig1 == 0 ) zSig1 = 1;
4567 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4568 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4569 mul64To128( zSig1, zSig1, &term2, &term3 );
4570 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 4571 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
4572 --zSig1;
4573 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4574 term3 |= 1;
4575 term2 |= doubleZSig0;
4576 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4577 }
4578 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4579 }
4580 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4581 zSig0 |= doubleZSig0;
4582 return
4583 roundAndPackFloatx80(
4584 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4585
4586}
4587
4588/*----------------------------------------------------------------------------
4589| Returns 1 if the extended double-precision floating-point value `a' is
4590| equal to the corresponding value `b', and 0 otherwise. The comparison is
4591| performed according to the IEC/IEEE Standard for Binary Floating-Point
4592| Arithmetic.
4593*----------------------------------------------------------------------------*/
4594
211315fb 4595int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4596{
4597
4598 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4599 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4600 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4601 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4602 ) {
4603 if ( floatx80_is_signaling_nan( a )
4604 || floatx80_is_signaling_nan( b ) ) {
4605 float_raise( float_flag_invalid STATUS_VAR);
4606 }
4607 return 0;
4608 }
4609 return
4610 ( a.low == b.low )
4611 && ( ( a.high == b.high )
4612 || ( ( a.low == 0 )
bb98fe42 4613 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
4614 );
4615
4616}
4617
4618/*----------------------------------------------------------------------------
4619| Returns 1 if the extended double-precision floating-point value `a' is
4620| less than or equal to the corresponding value `b', and 0 otherwise. The
4621| comparison is performed according to the IEC/IEEE Standard for Binary
4622| Floating-Point Arithmetic.
4623*----------------------------------------------------------------------------*/
4624
750afe93 4625int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4626{
4627 flag aSign, bSign;
4628
4629 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4630 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4631 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4632 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4633 ) {
4634 float_raise( float_flag_invalid STATUS_VAR);
4635 return 0;
4636 }
4637 aSign = extractFloatx80Sign( a );
4638 bSign = extractFloatx80Sign( b );
4639 if ( aSign != bSign ) {
4640 return
4641 aSign
bb98fe42 4642 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4643 == 0 );
4644 }
4645 return
4646 aSign ? le128( b.high, b.low, a.high, a.low )
4647 : le128( a.high, a.low, b.high, b.low );
4648
4649}
4650
4651/*----------------------------------------------------------------------------
4652| Returns 1 if the extended double-precision floating-point value `a' is
4653| less than the corresponding value `b', and 0 otherwise. The comparison
4654| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4655| Arithmetic.
4656*----------------------------------------------------------------------------*/
4657
750afe93 4658int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4659{
4660 flag aSign, bSign;
4661
4662 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4663 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4664 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4665 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4666 ) {
4667 float_raise( float_flag_invalid STATUS_VAR);
4668 return 0;
4669 }
4670 aSign = extractFloatx80Sign( a );
4671 bSign = extractFloatx80Sign( b );
4672 if ( aSign != bSign ) {
4673 return
4674 aSign
bb98fe42 4675 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4676 != 0 );
4677 }
4678 return
4679 aSign ? lt128( b.high, b.low, a.high, a.low )
4680 : lt128( a.high, a.low, b.high, b.low );
4681
4682}
4683
67b7861d
AJ
4684/*----------------------------------------------------------------------------
4685| Returns 1 if the extended double-precision floating-point values `a' and `b'
4686| cannot be compared, and 0 otherwise. The comparison is performed according
4687| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4688*----------------------------------------------------------------------------*/
4689int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4690{
4691 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4692 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4693 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4694 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4695 ) {
4696 float_raise( float_flag_invalid STATUS_VAR);
4697 return 1;
4698 }
4699 return 0;
4700}
4701
158142c2
FB
4702/*----------------------------------------------------------------------------
4703| Returns 1 if the extended double-precision floating-point value `a' is equal
4704| to the corresponding value `b', and 0 otherwise. The invalid exception is
4705| raised if either operand is a NaN. Otherwise, the comparison is performed
4706| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4707*----------------------------------------------------------------------------*/
4708
2657d0ff 4709int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4710{
4711
4712 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4713 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4714 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4715 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4716 ) {
4717 float_raise( float_flag_invalid STATUS_VAR);
4718 return 0;
4719 }
4720 return
4721 ( a.low == b.low )
4722 && ( ( a.high == b.high )
4723 || ( ( a.low == 0 )
bb98fe42 4724 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
4725 );
4726
4727}
4728
4729/*----------------------------------------------------------------------------
4730| Returns 1 if the extended double-precision floating-point value `a' is less
4731| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4732| do not cause an exception. Otherwise, the comparison is performed according
4733| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4734*----------------------------------------------------------------------------*/
4735
750afe93 4736int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4737{
4738 flag aSign, bSign;
4739
4740 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4741 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4742 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4743 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4744 ) {
4745 if ( floatx80_is_signaling_nan( a )
4746 || floatx80_is_signaling_nan( b ) ) {
4747 float_raise( float_flag_invalid STATUS_VAR);
4748 }
4749 return 0;
4750 }
4751 aSign = extractFloatx80Sign( a );
4752 bSign = extractFloatx80Sign( b );
4753 if ( aSign != bSign ) {
4754 return
4755 aSign
bb98fe42 4756 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4757 == 0 );
4758 }
4759 return
4760 aSign ? le128( b.high, b.low, a.high, a.low )
4761 : le128( a.high, a.low, b.high, b.low );
4762
4763}
4764
4765/*----------------------------------------------------------------------------
4766| Returns 1 if the extended double-precision floating-point value `a' is less
4767| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4768| an exception. Otherwise, the comparison is performed according to the
4769| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4770*----------------------------------------------------------------------------*/
4771
750afe93 4772int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4773{
4774 flag aSign, bSign;
4775
4776 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4777 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4778 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4779 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4780 ) {
4781 if ( floatx80_is_signaling_nan( a )
4782 || floatx80_is_signaling_nan( b ) ) {
4783 float_raise( float_flag_invalid STATUS_VAR);
4784 }
4785 return 0;
4786 }
4787 aSign = extractFloatx80Sign( a );
4788 bSign = extractFloatx80Sign( b );
4789 if ( aSign != bSign ) {
4790 return
4791 aSign
bb98fe42 4792 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4793 != 0 );
4794 }
4795 return
4796 aSign ? lt128( b.high, b.low, a.high, a.low )
4797 : lt128( a.high, a.low, b.high, b.low );
4798
4799}
4800
67b7861d
AJ
4801/*----------------------------------------------------------------------------
4802| Returns 1 if the extended double-precision floating-point values `a' and `b'
4803| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
4804| The comparison is performed according to the IEC/IEEE Standard for Binary
4805| Floating-Point Arithmetic.
4806*----------------------------------------------------------------------------*/
4807int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4808{
4809 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4810 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4811 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4812 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4813 ) {
4814 if ( floatx80_is_signaling_nan( a )
4815 || floatx80_is_signaling_nan( b ) ) {
4816 float_raise( float_flag_invalid STATUS_VAR);
4817 }
4818 return 1;
4819 }
4820 return 0;
4821}
4822
158142c2
FB
4823#endif
4824
4825#ifdef FLOAT128
4826
4827/*----------------------------------------------------------------------------
4828| Returns the result of converting the quadruple-precision floating-point
4829| value `a' to the 32-bit two's complement integer format. The conversion
4830| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4831| Arithmetic---which means in particular that the conversion is rounded
4832| according to the current rounding mode. If `a' is a NaN, the largest
4833| positive integer is returned. Otherwise, if the conversion overflows, the
4834| largest integer with the same sign as `a' is returned.
4835*----------------------------------------------------------------------------*/
4836
4837int32 float128_to_int32( float128 a STATUS_PARAM )
4838{
4839 flag aSign;
4840 int32 aExp, shiftCount;
bb98fe42 4841 uint64_t aSig0, aSig1;
158142c2
FB
4842
4843 aSig1 = extractFloat128Frac1( a );
4844 aSig0 = extractFloat128Frac0( a );
4845 aExp = extractFloat128Exp( a );
4846 aSign = extractFloat128Sign( a );
4847 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4848 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4849 aSig0 |= ( aSig1 != 0 );
4850 shiftCount = 0x4028 - aExp;
4851 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4852 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4853
4854}
4855
4856/*----------------------------------------------------------------------------
4857| Returns the result of converting the quadruple-precision floating-point
4858| value `a' to the 32-bit two's complement integer format. The conversion
4859| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4860| Arithmetic, except that the conversion is always rounded toward zero. If
4861| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4862| conversion overflows, the largest integer with the same sign as `a' is
4863| returned.
4864*----------------------------------------------------------------------------*/
4865
4866int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4867{
4868 flag aSign;
4869 int32 aExp, shiftCount;
bb98fe42 4870 uint64_t aSig0, aSig1, savedASig;
158142c2
FB
4871 int32 z;
4872
4873 aSig1 = extractFloat128Frac1( a );
4874 aSig0 = extractFloat128Frac0( a );
4875 aExp = extractFloat128Exp( a );
4876 aSign = extractFloat128Sign( a );
4877 aSig0 |= ( aSig1 != 0 );
4878 if ( 0x401E < aExp ) {
4879 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4880 goto invalid;
4881 }
4882 else if ( aExp < 0x3FFF ) {
4883 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4884 return 0;
4885 }
4886 aSig0 |= LIT64( 0x0001000000000000 );
4887 shiftCount = 0x402F - aExp;
4888 savedASig = aSig0;
4889 aSig0 >>= shiftCount;
4890 z = aSig0;
4891 if ( aSign ) z = - z;
4892 if ( ( z < 0 ) ^ aSign ) {
4893 invalid:
4894 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 4895 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4896 }
4897 if ( ( aSig0<<shiftCount ) != savedASig ) {
4898 STATUS(float_exception_flags) |= float_flag_inexact;
4899 }
4900 return z;
4901
4902}
4903
4904/*----------------------------------------------------------------------------
4905| Returns the result of converting the quadruple-precision floating-point
4906| value `a' to the 64-bit two's complement integer format. The conversion
4907| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4908| Arithmetic---which means in particular that the conversion is rounded
4909| according to the current rounding mode. If `a' is a NaN, the largest
4910| positive integer is returned. Otherwise, if the conversion overflows, the
4911| largest integer with the same sign as `a' is returned.
4912*----------------------------------------------------------------------------*/
4913
4914int64 float128_to_int64( float128 a STATUS_PARAM )
4915{
4916 flag aSign;
4917 int32 aExp, shiftCount;
bb98fe42 4918 uint64_t aSig0, aSig1;
158142c2
FB
4919
4920 aSig1 = extractFloat128Frac1( a );
4921 aSig0 = extractFloat128Frac0( a );
4922 aExp = extractFloat128Exp( a );
4923 aSign = extractFloat128Sign( a );
4924 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4925 shiftCount = 0x402F - aExp;
4926 if ( shiftCount <= 0 ) {
4927 if ( 0x403E < aExp ) {
4928 float_raise( float_flag_invalid STATUS_VAR);
4929 if ( ! aSign
4930 || ( ( aExp == 0x7FFF )
4931 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4932 )
4933 ) {
4934 return LIT64( 0x7FFFFFFFFFFFFFFF );
4935 }
bb98fe42 4936 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4937 }
4938 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4939 }
4940 else {
4941 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4942 }
4943 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4944
4945}
4946
4947/*----------------------------------------------------------------------------
4948| Returns the result of converting the quadruple-precision floating-point
4949| value `a' to the 64-bit two's complement integer format. The conversion
4950| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4951| Arithmetic, except that the conversion is always rounded toward zero.
4952| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4953| the conversion overflows, the largest integer with the same sign as `a' is
4954| returned.
4955*----------------------------------------------------------------------------*/
4956
4957int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4958{
4959 flag aSign;
4960 int32 aExp, shiftCount;
bb98fe42 4961 uint64_t aSig0, aSig1;
158142c2
FB
4962 int64 z;
4963
4964 aSig1 = extractFloat128Frac1( a );
4965 aSig0 = extractFloat128Frac0( a );
4966 aExp = extractFloat128Exp( a );
4967 aSign = extractFloat128Sign( a );
4968 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4969 shiftCount = aExp - 0x402F;
4970 if ( 0 < shiftCount ) {
4971 if ( 0x403E <= aExp ) {
4972 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4973 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4974 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4975 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4976 }
4977 else {
4978 float_raise( float_flag_invalid STATUS_VAR);
4979 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4980 return LIT64( 0x7FFFFFFFFFFFFFFF );
4981 }
4982 }
bb98fe42 4983 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4984 }
4985 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 4986 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
158142c2
FB
4987 STATUS(float_exception_flags) |= float_flag_inexact;
4988 }
4989 }
4990 else {
4991 if ( aExp < 0x3FFF ) {
4992 if ( aExp | aSig0 | aSig1 ) {
4993 STATUS(float_exception_flags) |= float_flag_inexact;
4994 }
4995 return 0;
4996 }
4997 z = aSig0>>( - shiftCount );
4998 if ( aSig1
bb98fe42 4999 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
158142c2
FB
5000 STATUS(float_exception_flags) |= float_flag_inexact;
5001 }
5002 }
5003 if ( aSign ) z = - z;
5004 return z;
5005
5006}
5007
5008/*----------------------------------------------------------------------------
5009| Returns the result of converting the quadruple-precision floating-point
5010| value `a' to the single-precision floating-point format. The conversion
5011| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5012| Arithmetic.
5013*----------------------------------------------------------------------------*/
5014
5015float32 float128_to_float32( float128 a STATUS_PARAM )
5016{
5017 flag aSign;
5018 int32 aExp;
bb98fe42
AF
5019 uint64_t aSig0, aSig1;
5020 uint32_t zSig;
158142c2
FB
5021
5022 aSig1 = extractFloat128Frac1( a );
5023 aSig0 = extractFloat128Frac0( a );
5024 aExp = extractFloat128Exp( a );
5025 aSign = extractFloat128Sign( a );
5026 if ( aExp == 0x7FFF ) {
5027 if ( aSig0 | aSig1 ) {
bcd4d9af 5028 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5029 }
5030 return packFloat32( aSign, 0xFF, 0 );
5031 }
5032 aSig0 |= ( aSig1 != 0 );
5033 shift64RightJamming( aSig0, 18, &aSig0 );
5034 zSig = aSig0;
5035 if ( aExp || zSig ) {
5036 zSig |= 0x40000000;
5037 aExp -= 0x3F81;
5038 }
5039 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5040
5041}
5042
5043/*----------------------------------------------------------------------------
5044| Returns the result of converting the quadruple-precision floating-point
5045| value `a' to the double-precision floating-point format. The conversion
5046| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5047| Arithmetic.
5048*----------------------------------------------------------------------------*/
5049
5050float64 float128_to_float64( float128 a STATUS_PARAM )
5051{
5052 flag aSign;
5053 int32 aExp;
bb98fe42 5054 uint64_t aSig0, aSig1;
158142c2
FB
5055
5056 aSig1 = extractFloat128Frac1( a );
5057 aSig0 = extractFloat128Frac0( a );
5058 aExp = extractFloat128Exp( a );
5059 aSign = extractFloat128Sign( a );
5060 if ( aExp == 0x7FFF ) {
5061 if ( aSig0 | aSig1 ) {
bcd4d9af 5062 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5063 }
5064 return packFloat64( aSign, 0x7FF, 0 );
5065 }
5066 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5067 aSig0 |= ( aSig1 != 0 );
5068 if ( aExp || aSig0 ) {
5069 aSig0 |= LIT64( 0x4000000000000000 );
5070 aExp -= 0x3C01;
5071 }
5072 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5073
5074}
5075
5076#ifdef FLOATX80
5077
5078/*----------------------------------------------------------------------------
5079| Returns the result of converting the quadruple-precision floating-point
5080| value `a' to the extended double-precision floating-point format. The
5081| conversion is performed according to the IEC/IEEE Standard for Binary
5082| Floating-Point Arithmetic.
5083*----------------------------------------------------------------------------*/
5084
5085floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5086{
5087 flag aSign;
5088 int32 aExp;
bb98fe42 5089 uint64_t aSig0, aSig1;
158142c2
FB
5090
5091 aSig1 = extractFloat128Frac1( a );
5092 aSig0 = extractFloat128Frac0( a );
5093 aExp = extractFloat128Exp( a );
5094 aSign = extractFloat128Sign( a );
5095 if ( aExp == 0x7FFF ) {
5096 if ( aSig0 | aSig1 ) {
bcd4d9af 5097 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5098 }
5099 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5100 }
5101 if ( aExp == 0 ) {
5102 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5103 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5104 }
5105 else {
5106 aSig0 |= LIT64( 0x0001000000000000 );
5107 }
5108 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5109 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5110
5111}
5112
5113#endif
5114
5115/*----------------------------------------------------------------------------
5116| Rounds the quadruple-precision floating-point value `a' to an integer, and
5117| returns the result as a quadruple-precision floating-point value. The
5118| operation is performed according to the IEC/IEEE Standard for Binary
5119| Floating-Point Arithmetic.
5120*----------------------------------------------------------------------------*/
5121
5122float128 float128_round_to_int( float128 a STATUS_PARAM )
5123{
5124 flag aSign;
5125 int32 aExp;
bb98fe42 5126 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5127 int8 roundingMode;
5128 float128 z;
5129
5130 aExp = extractFloat128Exp( a );
5131 if ( 0x402F <= aExp ) {
5132 if ( 0x406F <= aExp ) {
5133 if ( ( aExp == 0x7FFF )
5134 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5135 ) {
5136 return propagateFloat128NaN( a, a STATUS_VAR );
5137 }
5138 return a;
5139 }
5140 lastBitMask = 1;
5141 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5142 roundBitsMask = lastBitMask - 1;
5143 z = a;
5144 roundingMode = STATUS(float_rounding_mode);
5145 if ( roundingMode == float_round_nearest_even ) {
5146 if ( lastBitMask ) {
5147 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5148 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5149 }
5150 else {
bb98fe42 5151 if ( (int64_t) z.low < 0 ) {
158142c2 5152 ++z.high;
bb98fe42 5153 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
5154 }
5155 }
5156 }
5157 else if ( roundingMode != float_round_to_zero ) {
5158 if ( extractFloat128Sign( z )
5159 ^ ( roundingMode == float_round_up ) ) {
5160 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5161 }
5162 }
5163 z.low &= ~ roundBitsMask;
5164 }
5165 else {
5166 if ( aExp < 0x3FFF ) {
bb98fe42 5167 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
158142c2
FB
5168 STATUS(float_exception_flags) |= float_flag_inexact;
5169 aSign = extractFloat128Sign( a );
5170 switch ( STATUS(float_rounding_mode) ) {
5171 case float_round_nearest_even:
5172 if ( ( aExp == 0x3FFE )
5173 && ( extractFloat128Frac0( a )
5174 | extractFloat128Frac1( a ) )
5175 ) {
5176 return packFloat128( aSign, 0x3FFF, 0, 0 );
5177 }
5178 break;
5179 case float_round_down:
5180 return
5181 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5182 : packFloat128( 0, 0, 0, 0 );
5183 case float_round_up:
5184 return
5185 aSign ? packFloat128( 1, 0, 0, 0 )
5186 : packFloat128( 0, 0x3FFF, 0, 0 );
5187 }
5188 return packFloat128( aSign, 0, 0, 0 );
5189 }
5190 lastBitMask = 1;
5191 lastBitMask <<= 0x402F - aExp;
5192 roundBitsMask = lastBitMask - 1;
5193 z.low = 0;
5194 z.high = a.high;
5195 roundingMode = STATUS(float_rounding_mode);
5196 if ( roundingMode == float_round_nearest_even ) {
5197 z.high += lastBitMask>>1;
5198 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5199 z.high &= ~ lastBitMask;
5200 }
5201 }
5202 else if ( roundingMode != float_round_to_zero ) {
5203 if ( extractFloat128Sign( z )
5204 ^ ( roundingMode == float_round_up ) ) {
5205 z.high |= ( a.low != 0 );
5206 z.high += roundBitsMask;
5207 }
5208 }
5209 z.high &= ~ roundBitsMask;
5210 }
5211 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5212 STATUS(float_exception_flags) |= float_flag_inexact;
5213 }
5214 return z;
5215
5216}
5217
5218/*----------------------------------------------------------------------------
5219| Returns the result of adding the absolute values of the quadruple-precision
5220| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5221| before being returned. `zSign' is ignored if the result is a NaN.
5222| The addition is performed according to the IEC/IEEE Standard for Binary
5223| Floating-Point Arithmetic.
5224*----------------------------------------------------------------------------*/
5225
5226static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5227{
5228 int32 aExp, bExp, zExp;
bb98fe42 5229 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
158142c2
FB
5230 int32 expDiff;
5231
5232 aSig1 = extractFloat128Frac1( a );
5233 aSig0 = extractFloat128Frac0( a );
5234 aExp = extractFloat128Exp( a );
5235 bSig1 = extractFloat128Frac1( b );
5236 bSig0 = extractFloat128Frac0( b );
5237 bExp = extractFloat128Exp( b );
5238 expDiff = aExp - bExp;
5239 if ( 0 < expDiff ) {
5240 if ( aExp == 0x7FFF ) {
5241 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5242 return a;
5243 }
5244 if ( bExp == 0 ) {
5245 --expDiff;
5246 }
5247 else {
5248 bSig0 |= LIT64( 0x0001000000000000 );
5249 }
5250 shift128ExtraRightJamming(
5251 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5252 zExp = aExp;
5253 }
5254 else if ( expDiff < 0 ) {
5255 if ( bExp == 0x7FFF ) {
5256 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5257 return packFloat128( zSign, 0x7FFF, 0, 0 );
5258 }
5259 if ( aExp == 0 ) {
5260 ++expDiff;
5261 }
5262 else {
5263 aSig0 |= LIT64( 0x0001000000000000 );
5264 }
5265 shift128ExtraRightJamming(
5266 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5267 zExp = bExp;
5268 }
5269 else {
5270 if ( aExp == 0x7FFF ) {
5271 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5272 return propagateFloat128NaN( a, b STATUS_VAR );
5273 }
5274 return a;
5275 }
5276 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976
PB
5277 if ( aExp == 0 ) {
5278 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5279 return packFloat128( zSign, 0, zSig0, zSig1 );
5280 }
158142c2
FB
5281 zSig2 = 0;
5282 zSig0 |= LIT64( 0x0002000000000000 );
5283 zExp = aExp;
5284 goto shiftRight1;
5285 }
5286 aSig0 |= LIT64( 0x0001000000000000 );
5287 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5288 --zExp;
5289 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5290 ++zExp;
5291 shiftRight1:
5292 shift128ExtraRightJamming(
5293 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5294 roundAndPack:
5295 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5296
5297}
5298
5299/*----------------------------------------------------------------------------
5300| Returns the result of subtracting the absolute values of the quadruple-
5301| precision floating-point values `a' and `b'. If `zSign' is 1, the
5302| difference is negated before being returned. `zSign' is ignored if the
5303| result is a NaN. The subtraction is performed according to the IEC/IEEE
5304| Standard for Binary Floating-Point Arithmetic.
5305*----------------------------------------------------------------------------*/
5306
5307static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5308{
5309 int32 aExp, bExp, zExp;
bb98fe42 5310 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
158142c2
FB
5311 int32 expDiff;
5312 float128 z;
5313
5314 aSig1 = extractFloat128Frac1( a );
5315 aSig0 = extractFloat128Frac0( a );
5316 aExp = extractFloat128Exp( a );
5317 bSig1 = extractFloat128Frac1( b );
5318 bSig0 = extractFloat128Frac0( b );
5319 bExp = extractFloat128Exp( b );
5320 expDiff = aExp - bExp;
5321 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5322 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5323 if ( 0 < expDiff ) goto aExpBigger;
5324 if ( expDiff < 0 ) goto bExpBigger;
5325 if ( aExp == 0x7FFF ) {
5326 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5327 return propagateFloat128NaN( a, b STATUS_VAR );
5328 }
5329 float_raise( float_flag_invalid STATUS_VAR);
5330 z.low = float128_default_nan_low;
5331 z.high = float128_default_nan_high;
5332 return z;
5333 }
5334 if ( aExp == 0 ) {
5335 aExp = 1;
5336 bExp = 1;
5337 }
5338 if ( bSig0 < aSig0 ) goto aBigger;
5339 if ( aSig0 < bSig0 ) goto bBigger;
5340 if ( bSig1 < aSig1 ) goto aBigger;
5341 if ( aSig1 < bSig1 ) goto bBigger;
5342 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5343 bExpBigger:
5344 if ( bExp == 0x7FFF ) {
5345 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5346 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5347 }
5348 if ( aExp == 0 ) {
5349 ++expDiff;
5350 }
5351 else {
5352 aSig0 |= LIT64( 0x4000000000000000 );
5353 }
5354 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5355 bSig0 |= LIT64( 0x4000000000000000 );
5356 bBigger:
5357 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5358 zExp = bExp;
5359 zSign ^= 1;
5360 goto normalizeRoundAndPack;
5361 aExpBigger:
5362 if ( aExp == 0x7FFF ) {
5363 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5364 return a;
5365 }
5366 if ( bExp == 0 ) {
5367 --expDiff;
5368 }
5369 else {
5370 bSig0 |= LIT64( 0x4000000000000000 );
5371 }
5372 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5373 aSig0 |= LIT64( 0x4000000000000000 );
5374 aBigger:
5375 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5376 zExp = aExp;
5377 normalizeRoundAndPack:
5378 --zExp;
5379 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5380
5381}
5382
5383/*----------------------------------------------------------------------------
5384| Returns the result of adding the quadruple-precision floating-point values
5385| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5386| for Binary Floating-Point Arithmetic.
5387*----------------------------------------------------------------------------*/
5388
5389float128 float128_add( float128 a, float128 b STATUS_PARAM )
5390{
5391 flag aSign, bSign;
5392
5393 aSign = extractFloat128Sign( a );
5394 bSign = extractFloat128Sign( b );
5395 if ( aSign == bSign ) {
5396 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5397 }
5398 else {
5399 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5400 }
5401
5402}
5403
5404/*----------------------------------------------------------------------------
5405| Returns the result of subtracting the quadruple-precision floating-point
5406| values `a' and `b'. The operation is performed according to the IEC/IEEE
5407| Standard for Binary Floating-Point Arithmetic.
5408*----------------------------------------------------------------------------*/
5409
5410float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5411{
5412 flag aSign, bSign;
5413
5414 aSign = extractFloat128Sign( a );
5415 bSign = extractFloat128Sign( b );
5416 if ( aSign == bSign ) {
5417 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5418 }
5419 else {
5420 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5421 }
5422
5423}
5424
5425/*----------------------------------------------------------------------------
5426| Returns the result of multiplying the quadruple-precision floating-point
5427| values `a' and `b'. The operation is performed according to the IEC/IEEE
5428| Standard for Binary Floating-Point Arithmetic.
5429*----------------------------------------------------------------------------*/
5430
5431float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5432{
5433 flag aSign, bSign, zSign;
5434 int32 aExp, bExp, zExp;
bb98fe42 5435 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
5436 float128 z;
5437
5438 aSig1 = extractFloat128Frac1( a );
5439 aSig0 = extractFloat128Frac0( a );
5440 aExp = extractFloat128Exp( a );
5441 aSign = extractFloat128Sign( a );
5442 bSig1 = extractFloat128Frac1( b );
5443 bSig0 = extractFloat128Frac0( b );
5444 bExp = extractFloat128Exp( b );
5445 bSign = extractFloat128Sign( b );
5446 zSign = aSign ^ bSign;
5447 if ( aExp == 0x7FFF ) {
5448 if ( ( aSig0 | aSig1 )
5449 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5450 return propagateFloat128NaN( a, b STATUS_VAR );
5451 }
5452 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5453 return packFloat128( zSign, 0x7FFF, 0, 0 );
5454 }
5455 if ( bExp == 0x7FFF ) {
5456 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5457 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5458 invalid:
5459 float_raise( float_flag_invalid STATUS_VAR);
5460 z.low = float128_default_nan_low;
5461 z.high = float128_default_nan_high;
5462 return z;
5463 }
5464 return packFloat128( zSign, 0x7FFF, 0, 0 );
5465 }
5466 if ( aExp == 0 ) {
5467 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5468 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5469 }
5470 if ( bExp == 0 ) {
5471 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5472 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5473 }
5474 zExp = aExp + bExp - 0x4000;
5475 aSig0 |= LIT64( 0x0001000000000000 );
5476 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5477 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5478 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5479 zSig2 |= ( zSig3 != 0 );
5480 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5481 shift128ExtraRightJamming(
5482 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5483 ++zExp;
5484 }
5485 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5486
5487}
5488
5489/*----------------------------------------------------------------------------
5490| Returns the result of dividing the quadruple-precision floating-point value
5491| `a' by the corresponding value `b'. The operation is performed according to
5492| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5493*----------------------------------------------------------------------------*/
5494
5495float128 float128_div( float128 a, float128 b STATUS_PARAM )
5496{
5497 flag aSign, bSign, zSign;
5498 int32 aExp, bExp, zExp;
bb98fe42
AF
5499 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5500 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5501 float128 z;
5502
5503 aSig1 = extractFloat128Frac1( a );
5504 aSig0 = extractFloat128Frac0( a );
5505 aExp = extractFloat128Exp( a );
5506 aSign = extractFloat128Sign( a );
5507 bSig1 = extractFloat128Frac1( b );
5508 bSig0 = extractFloat128Frac0( b );
5509 bExp = extractFloat128Exp( b );
5510 bSign = extractFloat128Sign( b );
5511 zSign = aSign ^ bSign;
5512 if ( aExp == 0x7FFF ) {
5513 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5514 if ( bExp == 0x7FFF ) {
5515 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5516 goto invalid;
5517 }
5518 return packFloat128( zSign, 0x7FFF, 0, 0 );
5519 }
5520 if ( bExp == 0x7FFF ) {
5521 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5522 return packFloat128( zSign, 0, 0, 0 );
5523 }
5524 if ( bExp == 0 ) {
5525 if ( ( bSig0 | bSig1 ) == 0 ) {
5526 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5527 invalid:
5528 float_raise( float_flag_invalid STATUS_VAR);
5529 z.low = float128_default_nan_low;
5530 z.high = float128_default_nan_high;
5531 return z;
5532 }
5533 float_raise( float_flag_divbyzero STATUS_VAR);
5534 return packFloat128( zSign, 0x7FFF, 0, 0 );
5535 }
5536 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5537 }
5538 if ( aExp == 0 ) {
5539 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5540 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5541 }
5542 zExp = aExp - bExp + 0x3FFD;
5543 shortShift128Left(
5544 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5545 shortShift128Left(
5546 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5547 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5548 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5549 ++zExp;
5550 }
5551 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5552 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5553 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 5554 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5555 --zSig0;
5556 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5557 }
5558 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5559 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5560 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5561 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5562 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5563 --zSig1;
5564 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5565 }
5566 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5567 }
5568 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5569 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5570
5571}
5572
5573/*----------------------------------------------------------------------------
5574| Returns the remainder of the quadruple-precision floating-point value `a'
5575| with respect to the corresponding value `b'. The operation is performed
5576| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5577*----------------------------------------------------------------------------*/
5578
5579float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5580{
ed086f3d 5581 flag aSign, zSign;
158142c2 5582 int32 aExp, bExp, expDiff;
bb98fe42
AF
5583 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5584 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5585 int64_t sigMean0;
158142c2
FB
5586 float128 z;
5587
5588 aSig1 = extractFloat128Frac1( a );
5589 aSig0 = extractFloat128Frac0( a );
5590 aExp = extractFloat128Exp( a );
5591 aSign = extractFloat128Sign( a );
5592 bSig1 = extractFloat128Frac1( b );
5593 bSig0 = extractFloat128Frac0( b );
5594 bExp = extractFloat128Exp( b );
158142c2
FB
5595 if ( aExp == 0x7FFF ) {
5596 if ( ( aSig0 | aSig1 )
5597 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5598 return propagateFloat128NaN( a, b STATUS_VAR );
5599 }
5600 goto invalid;
5601 }
5602 if ( bExp == 0x7FFF ) {
5603 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5604 return a;
5605 }
5606 if ( bExp == 0 ) {
5607 if ( ( bSig0 | bSig1 ) == 0 ) {
5608 invalid:
5609 float_raise( float_flag_invalid STATUS_VAR);
5610 z.low = float128_default_nan_low;
5611 z.high = float128_default_nan_high;
5612 return z;
5613 }
5614 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5615 }
5616 if ( aExp == 0 ) {
5617 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5618 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5619 }
5620 expDiff = aExp - bExp;
5621 if ( expDiff < -1 ) return a;
5622 shortShift128Left(
5623 aSig0 | LIT64( 0x0001000000000000 ),
5624 aSig1,
5625 15 - ( expDiff < 0 ),
5626 &aSig0,
5627 &aSig1
5628 );
5629 shortShift128Left(
5630 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5631 q = le128( bSig0, bSig1, aSig0, aSig1 );
5632 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5633 expDiff -= 64;
5634 while ( 0 < expDiff ) {
5635 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5636 q = ( 4 < q ) ? q - 4 : 0;
5637 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5638 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5639 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5640 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5641 expDiff -= 61;
5642 }
5643 if ( -64 < expDiff ) {
5644 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5645 q = ( 4 < q ) ? q - 4 : 0;
5646 q >>= - expDiff;
5647 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5648 expDiff += 52;
5649 if ( expDiff < 0 ) {
5650 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5651 }
5652 else {
5653 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5654 }
5655 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5656 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5657 }
5658 else {
5659 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5660 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5661 }
5662 do {
5663 alternateASig0 = aSig0;
5664 alternateASig1 = aSig1;
5665 ++q;
5666 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 5667 } while ( 0 <= (int64_t) aSig0 );
158142c2 5668 add128(
bb98fe42 5669 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
5670 if ( ( sigMean0 < 0 )
5671 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5672 aSig0 = alternateASig0;
5673 aSig1 = alternateASig1;
5674 }
bb98fe42 5675 zSign = ( (int64_t) aSig0 < 0 );
158142c2
FB
5676 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5677 return
5678 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5679
5680}
5681
5682/*----------------------------------------------------------------------------
5683| Returns the square root of the quadruple-precision floating-point value `a'.
5684| The operation is performed according to the IEC/IEEE Standard for Binary
5685| Floating-Point Arithmetic.
5686*----------------------------------------------------------------------------*/
5687
5688float128 float128_sqrt( float128 a STATUS_PARAM )
5689{
5690 flag aSign;
5691 int32 aExp, zExp;
bb98fe42
AF
5692 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5693 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5694 float128 z;
5695
5696 aSig1 = extractFloat128Frac1( a );
5697 aSig0 = extractFloat128Frac0( a );
5698 aExp = extractFloat128Exp( a );
5699 aSign = extractFloat128Sign( a );
5700 if ( aExp == 0x7FFF ) {
5701 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5702 if ( ! aSign ) return a;
5703 goto invalid;
5704 }
5705 if ( aSign ) {
5706 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5707 invalid:
5708 float_raise( float_flag_invalid STATUS_VAR);
5709 z.low = float128_default_nan_low;
5710 z.high = float128_default_nan_high;
5711 return z;
5712 }
5713 if ( aExp == 0 ) {
5714 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5715 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5716 }
5717 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5718 aSig0 |= LIT64( 0x0001000000000000 );
5719 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5720 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5721 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5722 doubleZSig0 = zSig0<<1;
5723 mul64To128( zSig0, zSig0, &term0, &term1 );
5724 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5725 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5726 --zSig0;
5727 doubleZSig0 -= 2;
5728 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5729 }
5730 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5731 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5732 if ( zSig1 == 0 ) zSig1 = 1;
5733 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5734 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5735 mul64To128( zSig1, zSig1, &term2, &term3 );
5736 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5737 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5738 --zSig1;
5739 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5740 term3 |= 1;
5741 term2 |= doubleZSig0;
5742 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5743 }
5744 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5745 }
5746 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5747 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5748
5749}
5750
5751/*----------------------------------------------------------------------------
5752| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5753| the corresponding value `b', and 0 otherwise. The comparison is performed
5754| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5755*----------------------------------------------------------------------------*/
5756
211315fb 5757int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5758{
5759
5760 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5761 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5762 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5763 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5764 ) {
5765 if ( float128_is_signaling_nan( a )
5766 || float128_is_signaling_nan( b ) ) {
5767 float_raise( float_flag_invalid STATUS_VAR);
5768 }
5769 return 0;
5770 }
5771 return
5772 ( a.low == b.low )
5773 && ( ( a.high == b.high )
5774 || ( ( a.low == 0 )
bb98fe42 5775 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5776 );
5777
5778}
5779
5780/*----------------------------------------------------------------------------
5781| Returns 1 if the quadruple-precision floating-point value `a' is less than
5782| or equal to the corresponding value `b', and 0 otherwise. The comparison
5783| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5784| Arithmetic.
5785*----------------------------------------------------------------------------*/
5786
750afe93 5787int float128_le( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5788{
5789 flag aSign, bSign;
5790
5791 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5792 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5793 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5794 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5795 ) {
5796 float_raise( float_flag_invalid STATUS_VAR);
5797 return 0;
5798 }
5799 aSign = extractFloat128Sign( a );
5800 bSign = extractFloat128Sign( b );
5801 if ( aSign != bSign ) {
5802 return
5803 aSign
bb98fe42 5804 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5805 == 0 );
5806 }
5807 return
5808 aSign ? le128( b.high, b.low, a.high, a.low )
5809 : le128( a.high, a.low, b.high, b.low );
5810
5811}
5812
5813/*----------------------------------------------------------------------------
5814| Returns 1 if the quadruple-precision floating-point value `a' is less than
5815| the corresponding value `b', and 0 otherwise. The comparison is performed
5816| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5817*----------------------------------------------------------------------------*/
5818
750afe93 5819int float128_lt( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5820{
5821 flag aSign, bSign;
5822
5823 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5824 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5825 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5826 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5827 ) {
5828 float_raise( float_flag_invalid STATUS_VAR);
5829 return 0;
5830 }
5831 aSign = extractFloat128Sign( a );
5832 bSign = extractFloat128Sign( b );
5833 if ( aSign != bSign ) {
5834 return
5835 aSign
bb98fe42 5836 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5837 != 0 );
5838 }
5839 return
5840 aSign ? lt128( b.high, b.low, a.high, a.low )
5841 : lt128( a.high, a.low, b.high, b.low );
5842
5843}
5844
67b7861d
AJ
5845/*----------------------------------------------------------------------------
5846| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5847| be compared, and 0 otherwise. The comparison is performed according to the
5848| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5849*----------------------------------------------------------------------------*/
5850
5851int float128_unordered( float128 a, float128 b STATUS_PARAM )
5852{
5853 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5854 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5855 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5856 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5857 ) {
5858 float_raise( float_flag_invalid STATUS_VAR);
5859 return 1;
5860 }
5861 return 0;
5862}
5863
158142c2
FB
5864/*----------------------------------------------------------------------------
5865| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5866| the corresponding value `b', and 0 otherwise. The invalid exception is
5867| raised if either operand is a NaN. Otherwise, the comparison is performed
5868| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5869*----------------------------------------------------------------------------*/
5870
2657d0ff 5871int float128_eq( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5872{
5873
5874 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5875 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5876 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5877 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5878 ) {
5879 float_raise( float_flag_invalid STATUS_VAR);
5880 return 0;
5881 }
5882 return
5883 ( a.low == b.low )
5884 && ( ( a.high == b.high )
5885 || ( ( a.low == 0 )
bb98fe42 5886 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5887 );
5888
5889}
5890
5891/*----------------------------------------------------------------------------
5892| Returns 1 if the quadruple-precision floating-point value `a' is less than
5893| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5894| cause an exception. Otherwise, the comparison is performed according to the
5895| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5896*----------------------------------------------------------------------------*/
5897
750afe93 5898int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5899{
5900 flag aSign, bSign;
5901
5902 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5903 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5904 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5905 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5906 ) {
5907 if ( float128_is_signaling_nan( a )
5908 || float128_is_signaling_nan( b ) ) {
5909 float_raise( float_flag_invalid STATUS_VAR);
5910 }
5911 return 0;
5912 }
5913 aSign = extractFloat128Sign( a );
5914 bSign = extractFloat128Sign( b );
5915 if ( aSign != bSign ) {
5916 return
5917 aSign
bb98fe42 5918 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5919 == 0 );
5920 }
5921 return
5922 aSign ? le128( b.high, b.low, a.high, a.low )
5923 : le128( a.high, a.low, b.high, b.low );
5924
5925}
5926
5927/*----------------------------------------------------------------------------
5928| Returns 1 if the quadruple-precision floating-point value `a' is less than
5929| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5930| exception. Otherwise, the comparison is performed according to the IEC/IEEE
5931| Standard for Binary Floating-Point Arithmetic.
5932*----------------------------------------------------------------------------*/
5933
750afe93 5934int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5935{
5936 flag aSign, bSign;
5937
5938 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5939 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5940 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5941 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5942 ) {
5943 if ( float128_is_signaling_nan( a )
5944 || float128_is_signaling_nan( b ) ) {
5945 float_raise( float_flag_invalid STATUS_VAR);
5946 }
5947 return 0;
5948 }
5949 aSign = extractFloat128Sign( a );
5950 bSign = extractFloat128Sign( b );
5951 if ( aSign != bSign ) {
5952 return
5953 aSign
bb98fe42 5954 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5955 != 0 );
5956 }
5957 return
5958 aSign ? lt128( b.high, b.low, a.high, a.low )
5959 : lt128( a.high, a.low, b.high, b.low );
5960
5961}
5962
67b7861d
AJ
5963/*----------------------------------------------------------------------------
5964| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5965| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5966| comparison is performed according to the IEC/IEEE Standard for Binary
5967| Floating-Point Arithmetic.
5968*----------------------------------------------------------------------------*/
5969
5970int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5971{
5972 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5973 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5974 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5975 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5976 ) {
5977 if ( float128_is_signaling_nan( a )
5978 || float128_is_signaling_nan( b ) ) {
5979 float_raise( float_flag_invalid STATUS_VAR);
5980 }
5981 return 1;
5982 }
5983 return 0;
5984}
5985
158142c2
FB
5986#endif
5987
1d6bda35
FB
5988/* misc functions */
5989float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5990{
5991 return int64_to_float32(a STATUS_VAR);
5992}
5993
5994float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5995{
5996 return int64_to_float64(a STATUS_VAR);
5997}
5998
5999unsigned int float32_to_uint32( float32 a STATUS_PARAM )
6000{
6001 int64_t v;
6002 unsigned int res;
6003
6004 v = float32_to_int64(a STATUS_VAR);
6005 if (v < 0) {
6006 res = 0;
6007 float_raise( float_flag_invalid STATUS_VAR);
6008 } else if (v > 0xffffffff) {
6009 res = 0xffffffff;
6010 float_raise( float_flag_invalid STATUS_VAR);
6011 } else {
6012 res = v;
6013 }
6014 return res;
6015}
6016
6017unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6018{
6019 int64_t v;
6020 unsigned int res;
6021
6022 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6023 if (v < 0) {
6024 res = 0;
6025 float_raise( float_flag_invalid STATUS_VAR);
6026 } else if (v > 0xffffffff) {
6027 res = 0xffffffff;
6028 float_raise( float_flag_invalid STATUS_VAR);
6029 } else {
6030 res = v;
6031 }
6032 return res;
6033}
6034
cbcef455
PM
6035unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6036{
6037 int64_t v;
6038 unsigned int res;
6039
6040 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6041 if (v < 0) {
6042 res = 0;
6043 float_raise( float_flag_invalid STATUS_VAR);
6044 } else if (v > 0xffff) {
6045 res = 0xffff;
6046 float_raise( float_flag_invalid STATUS_VAR);
6047 } else {
6048 res = v;
6049 }
6050 return res;
6051}
6052
1d6bda35
FB
6053unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6054{
6055 int64_t v;
6056 unsigned int res;
6057
6058 v = float64_to_int64(a STATUS_VAR);
6059 if (v < 0) {
6060 res = 0;
6061 float_raise( float_flag_invalid STATUS_VAR);
6062 } else if (v > 0xffffffff) {
6063 res = 0xffffffff;
6064 float_raise( float_flag_invalid STATUS_VAR);
6065 } else {
6066 res = v;
6067 }
6068 return res;
6069}
6070
6071unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6072{
6073 int64_t v;
6074 unsigned int res;
6075
6076 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6077 if (v < 0) {
6078 res = 0;
6079 float_raise( float_flag_invalid STATUS_VAR);
6080 } else if (v > 0xffffffff) {
6081 res = 0xffffffff;
6082 float_raise( float_flag_invalid STATUS_VAR);
6083 } else {
6084 res = v;
6085 }
6086 return res;
6087}
6088
cbcef455
PM
6089unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6090{
6091 int64_t v;
6092 unsigned int res;
6093
6094 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6095 if (v < 0) {
6096 res = 0;
6097 float_raise( float_flag_invalid STATUS_VAR);
6098 } else if (v > 0xffff) {
6099 res = 0xffff;
6100 float_raise( float_flag_invalid STATUS_VAR);
6101 } else {
6102 res = v;
6103 }
6104 return res;
6105}
6106
f090c9d4 6107/* FIXME: This looks broken. */
75d62a58
JM
6108uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6109{
6110 int64_t v;
6111
f090c9d4
PB
6112 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6113 v += float64_val(a);
6114 v = float64_to_int64(make_float64(v) STATUS_VAR);
75d62a58
JM
6115
6116 return v - INT64_MIN;
6117}
6118
6119uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6120{
6121 int64_t v;
6122
f090c9d4
PB
6123 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6124 v += float64_val(a);
6125 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
75d62a58
JM
6126
6127 return v - INT64_MIN;
6128}
6129
1d6bda35 6130#define COMPARE(s, nan_exp) \
750afe93 6131INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
1d6bda35
FB
6132 int is_quiet STATUS_PARAM ) \
6133{ \
6134 flag aSign, bSign; \
bb98fe42 6135 uint ## s ## _t av, bv; \
37d18660
PM
6136 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6137 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
1d6bda35
FB
6138 \
6139 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6140 extractFloat ## s ## Frac( a ) ) || \
6141 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6142 extractFloat ## s ## Frac( b ) )) { \
6143 if (!is_quiet || \
6144 float ## s ## _is_signaling_nan( a ) || \
6145 float ## s ## _is_signaling_nan( b ) ) { \
6146 float_raise( float_flag_invalid STATUS_VAR); \
6147 } \
6148 return float_relation_unordered; \
6149 } \
6150 aSign = extractFloat ## s ## Sign( a ); \
6151 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 6152 av = float ## s ## _val(a); \
cd8a2533 6153 bv = float ## s ## _val(b); \
1d6bda35 6154 if ( aSign != bSign ) { \
bb98fe42 6155 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
6156 /* zero case */ \
6157 return float_relation_equal; \
6158 } else { \
6159 return 1 - (2 * aSign); \
6160 } \
6161 } else { \
f090c9d4 6162 if (av == bv) { \
1d6bda35
FB
6163 return float_relation_equal; \
6164 } else { \
f090c9d4 6165 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
6166 } \
6167 } \
6168} \
6169 \
750afe93 6170int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6171{ \
6172 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6173} \
6174 \
750afe93 6175int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6176{ \
6177 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6178}
6179
6180COMPARE(32, 0xff)
6181COMPARE(64, 0x7ff)
9ee6e8bb 6182
1f587329
BS
6183INLINE int float128_compare_internal( float128 a, float128 b,
6184 int is_quiet STATUS_PARAM )
6185{
6186 flag aSign, bSign;
6187
6188 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6189 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6190 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6191 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6192 if (!is_quiet ||
6193 float128_is_signaling_nan( a ) ||
6194 float128_is_signaling_nan( b ) ) {
6195 float_raise( float_flag_invalid STATUS_VAR);
6196 }
6197 return float_relation_unordered;
6198 }
6199 aSign = extractFloat128Sign( a );
6200 bSign = extractFloat128Sign( b );
6201 if ( aSign != bSign ) {
6202 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6203 /* zero case */
6204 return float_relation_equal;
6205 } else {
6206 return 1 - (2 * aSign);
6207 }
6208 } else {
6209 if (a.low == b.low && a.high == b.high) {
6210 return float_relation_equal;
6211 } else {
6212 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6213 }
6214 }
6215}
6216
6217int float128_compare( float128 a, float128 b STATUS_PARAM )
6218{
6219 return float128_compare_internal(a, b, 0 STATUS_VAR);
6220}
6221
6222int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6223{
6224 return float128_compare_internal(a, b, 1 STATUS_VAR);
6225}
6226
274f1b04
PM
6227/* min() and max() functions. These can't be implemented as
6228 * 'compare and pick one input' because that would mishandle
6229 * NaNs and +0 vs -0.
6230 */
6231#define MINMAX(s, nan_exp) \
6232INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6233 int ismin STATUS_PARAM ) \
6234{ \
6235 flag aSign, bSign; \
6236 uint ## s ## _t av, bv; \
6237 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6238 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6239 if (float ## s ## _is_any_nan(a) || \
6240 float ## s ## _is_any_nan(b)) { \
6241 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6242 } \
6243 aSign = extractFloat ## s ## Sign(a); \
6244 bSign = extractFloat ## s ## Sign(b); \
6245 av = float ## s ## _val(a); \
6246 bv = float ## s ## _val(b); \
6247 if (aSign != bSign) { \
6248 if (ismin) { \
6249 return aSign ? a : b; \
6250 } else { \
6251 return aSign ? b : a; \
6252 } \
6253 } else { \
6254 if (ismin) { \
6255 return (aSign ^ (av < bv)) ? a : b; \
6256 } else { \
6257 return (aSign ^ (av < bv)) ? b : a; \
6258 } \
6259 } \
6260} \
6261 \
6262float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6263{ \
6264 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6265} \
6266 \
6267float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6268{ \
6269 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6270}
6271
6272MINMAX(32, 0xff)
6273MINMAX(64, 0x7ff)
6274
6275
9ee6e8bb
PB
6276/* Multiply A by 2 raised to the power N. */
6277float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6278{
6279 flag aSign;
6280 int16 aExp;
bb98fe42 6281 uint32_t aSig;
9ee6e8bb 6282
37d18660 6283 a = float32_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
6284 aSig = extractFloat32Frac( a );
6285 aExp = extractFloat32Exp( a );
6286 aSign = extractFloat32Sign( a );
6287
6288 if ( aExp == 0xFF ) {
6289 return a;
6290 }
69397542
PB
6291 if ( aExp != 0 )
6292 aSig |= 0x00800000;
6293 else if ( aSig == 0 )
6294 return a;
6295
6296 aExp += n - 1;
6297 aSig <<= 7;
6298 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
6299}
6300
6301float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6302{
6303 flag aSign;
6304 int16 aExp;
bb98fe42 6305 uint64_t aSig;
9ee6e8bb 6306
37d18660 6307 a = float64_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
6308 aSig = extractFloat64Frac( a );
6309 aExp = extractFloat64Exp( a );
6310 aSign = extractFloat64Sign( a );
6311
6312 if ( aExp == 0x7FF ) {
6313 return a;
6314 }
69397542
PB
6315 if ( aExp != 0 )
6316 aSig |= LIT64( 0x0010000000000000 );
6317 else if ( aSig == 0 )
6318 return a;
6319
6320 aExp += n - 1;
6321 aSig <<= 10;
6322 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
6323}
6324
6325#ifdef FLOATX80
6326floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6327{
6328 flag aSign;
6329 int16 aExp;
bb98fe42 6330 uint64_t aSig;
9ee6e8bb
PB
6331
6332 aSig = extractFloatx80Frac( a );
6333 aExp = extractFloatx80Exp( a );
6334 aSign = extractFloatx80Sign( a );
6335
6336 if ( aExp == 0x7FF ) {
6337 return a;
6338 }
69397542
PB
6339 if (aExp == 0 && aSig == 0)
6340 return a;
6341
9ee6e8bb 6342 aExp += n;
69397542
PB
6343 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6344 aSign, aExp, aSig, 0 STATUS_VAR );
9ee6e8bb
PB
6345}
6346#endif
6347
6348#ifdef FLOAT128
6349float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6350{
6351 flag aSign;
6352 int32 aExp;
bb98fe42 6353 uint64_t aSig0, aSig1;
9ee6e8bb
PB
6354
6355 aSig1 = extractFloat128Frac1( a );
6356 aSig0 = extractFloat128Frac0( a );
6357 aExp = extractFloat128Exp( a );
6358 aSign = extractFloat128Sign( a );
6359 if ( aExp == 0x7FFF ) {
6360 return a;
6361 }
69397542
PB
6362 if ( aExp != 0 )
6363 aSig0 |= LIT64( 0x0001000000000000 );
6364 else if ( aSig0 == 0 && aSig1 == 0 )
6365 return a;
6366
6367 aExp += n - 1;
6368 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6369 STATUS_VAR );
9ee6e8bb
PB
6370
6371}
6372#endif