]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
bFLT 64-bit host fix.
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
158142c2
FB
1
2/*============================================================================
3
4This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5Package, Release 2b.
6
7Written by John R. Hauser. This work was made possible in part by the
8International Computer Science Institute, located at Suite 600, 1947 Center
9Street, Berkeley, California 94704. Funding was partially provided by the
10National Science Foundation under grant MIP-9311980. The original version
11of this code was written as part of a project to build a fixed-point vector
12processor in collaboration with the University of California at Berkeley,
13overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15arithmetic/SoftFloat.html'.
16
17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26Derivative works are acceptable, even for commercial purposes, so long as
27(1) the source code for the derivative work includes prominent notice that
28the work is derivative, and (2) the source code includes prominent notice with
29these four paragraphs for those parts of this code that are retained.
30
31=============================================================================*/
32
33#include "softfloat.h"
34
35/*----------------------------------------------------------------------------
36| Primitive arithmetic functions, including multi-word arithmetic, and
37| division and square root approximations. (Can be specialized to target if
38| desired.)
39*----------------------------------------------------------------------------*/
40#include "softfloat-macros.h"
41
42/*----------------------------------------------------------------------------
43| Functions and definitions to determine: (1) whether tininess for underflow
44| is detected before or after rounding by default, (2) what (if anything)
45| happens when exceptions are raised, (3) how signaling NaNs are distinguished
46| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47| are propagated from function inputs to output. These details are target-
48| specific.
49*----------------------------------------------------------------------------*/
50#include "softfloat-specialize.h"
51
52void set_float_rounding_mode(int val STATUS_PARAM)
53{
54 STATUS(float_rounding_mode) = val;
55}
56
1d6bda35
FB
57void set_float_exception_flags(int val STATUS_PARAM)
58{
59 STATUS(float_exception_flags) = val;
60}
61
158142c2
FB
62#ifdef FLOATX80
63void set_floatx80_rounding_precision(int val STATUS_PARAM)
64{
65 STATUS(floatx80_rounding_precision) = val;
66}
67#endif
68
69/*----------------------------------------------------------------------------
70| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71| and 7, and returns the properly rounded 32-bit integer corresponding to the
72| input. If `zSign' is 1, the input is negated before being converted to an
73| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
74| is simply rounded to an integer, with the inexact exception raised if the
75| input cannot be represented exactly as an integer. However, if the fixed-
76| point input is too large, the invalid exception is raised and the largest
77| positive or negative integer is returned.
78*----------------------------------------------------------------------------*/
79
80static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
81{
82 int8 roundingMode;
83 flag roundNearestEven;
84 int8 roundIncrement, roundBits;
85 int32 z;
86
87 roundingMode = STATUS(float_rounding_mode);
88 roundNearestEven = ( roundingMode == float_round_nearest_even );
89 roundIncrement = 0x40;
90 if ( ! roundNearestEven ) {
91 if ( roundingMode == float_round_to_zero ) {
92 roundIncrement = 0;
93 }
94 else {
95 roundIncrement = 0x7F;
96 if ( zSign ) {
97 if ( roundingMode == float_round_up ) roundIncrement = 0;
98 }
99 else {
100 if ( roundingMode == float_round_down ) roundIncrement = 0;
101 }
102 }
103 }
104 roundBits = absZ & 0x7F;
105 absZ = ( absZ + roundIncrement )>>7;
106 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107 z = absZ;
108 if ( zSign ) z = - z;
109 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110 float_raise( float_flag_invalid STATUS_VAR);
111 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
112 }
113 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
114 return z;
115
116}
117
118/*----------------------------------------------------------------------------
119| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120| `absZ1', with binary point between bits 63 and 64 (between the input words),
121| and returns the properly rounded 64-bit integer corresponding to the input.
122| If `zSign' is 1, the input is negated before being converted to an integer.
123| Ordinarily, the fixed-point input is simply rounded to an integer, with
124| the inexact exception raised if the input cannot be represented exactly as
125| an integer. However, if the fixed-point input is too large, the invalid
126| exception is raised and the largest positive or negative integer is
127| returned.
128*----------------------------------------------------------------------------*/
129
130static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
131{
132 int8 roundingMode;
133 flag roundNearestEven, increment;
134 int64 z;
135
136 roundingMode = STATUS(float_rounding_mode);
137 roundNearestEven = ( roundingMode == float_round_nearest_even );
138 increment = ( (sbits64) absZ1 < 0 );
139 if ( ! roundNearestEven ) {
140 if ( roundingMode == float_round_to_zero ) {
141 increment = 0;
142 }
143 else {
144 if ( zSign ) {
145 increment = ( roundingMode == float_round_down ) && absZ1;
146 }
147 else {
148 increment = ( roundingMode == float_round_up ) && absZ1;
149 }
150 }
151 }
152 if ( increment ) {
153 ++absZ0;
154 if ( absZ0 == 0 ) goto overflow;
155 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
156 }
157 z = absZ0;
158 if ( zSign ) z = - z;
159 if ( z && ( ( z < 0 ) ^ zSign ) ) {
160 overflow:
161 float_raise( float_flag_invalid STATUS_VAR);
162 return
163 zSign ? (sbits64) LIT64( 0x8000000000000000 )
164 : LIT64( 0x7FFFFFFFFFFFFFFF );
165 }
166 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
167 return z;
168
169}
170
171/*----------------------------------------------------------------------------
172| Returns the fraction bits of the single-precision floating-point value `a'.
173*----------------------------------------------------------------------------*/
174
175INLINE bits32 extractFloat32Frac( float32 a )
176{
177
178 return a & 0x007FFFFF;
179
180}
181
182/*----------------------------------------------------------------------------
183| Returns the exponent bits of the single-precision floating-point value `a'.
184*----------------------------------------------------------------------------*/
185
186INLINE int16 extractFloat32Exp( float32 a )
187{
188
189 return ( a>>23 ) & 0xFF;
190
191}
192
193/*----------------------------------------------------------------------------
194| Returns the sign bit of the single-precision floating-point value `a'.
195*----------------------------------------------------------------------------*/
196
197INLINE flag extractFloat32Sign( float32 a )
198{
199
200 return a>>31;
201
202}
203
204/*----------------------------------------------------------------------------
205| Normalizes the subnormal single-precision floating-point value represented
206| by the denormalized significand `aSig'. The normalized exponent and
207| significand are stored at the locations pointed to by `zExpPtr' and
208| `zSigPtr', respectively.
209*----------------------------------------------------------------------------*/
210
211static void
212 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
213{
214 int8 shiftCount;
215
216 shiftCount = countLeadingZeros32( aSig ) - 8;
217 *zSigPtr = aSig<<shiftCount;
218 *zExpPtr = 1 - shiftCount;
219
220}
221
222/*----------------------------------------------------------------------------
223| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224| single-precision floating-point value, returning the result. After being
225| shifted into the proper positions, the three fields are simply added
226| together to form the result. This means that any integer portion of `zSig'
227| will be added into the exponent. Since a properly normalized significand
228| will have an integer portion equal to 1, the `zExp' input should be 1 less
229| than the desired result exponent whenever `zSig' is a complete, normalized
230| significand.
231*----------------------------------------------------------------------------*/
232
233INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
234{
235
236 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
237
238}
239
240/*----------------------------------------------------------------------------
241| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
242| and significand `zSig', and returns the proper single-precision floating-
243| point value corresponding to the abstract input. Ordinarily, the abstract
244| value is simply rounded and packed into the single-precision format, with
245| the inexact exception raised if the abstract input cannot be represented
246| exactly. However, if the abstract value is too large, the overflow and
247| inexact exceptions are raised and an infinity or maximal finite value is
248| returned. If the abstract value is too small, the input value is rounded to
249| a subnormal number, and the underflow and inexact exceptions are raised if
250| the abstract input cannot be represented exactly as a subnormal single-
251| precision floating-point number.
252| The input significand `zSig' has its binary point between bits 30
253| and 29, which is 7 bits to the left of the usual location. This shifted
254| significand must be normalized or smaller. If `zSig' is not normalized,
255| `zExp' must be 0; in that case, the result returned is a subnormal number,
256| and it must not require rounding. In the usual case that `zSig' is
257| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
258| The handling of underflow and overflow follows the IEC/IEEE Standard for
259| Binary Floating-Point Arithmetic.
260*----------------------------------------------------------------------------*/
261
262static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
263{
264 int8 roundingMode;
265 flag roundNearestEven;
266 int8 roundIncrement, roundBits;
267 flag isTiny;
268
269 roundingMode = STATUS(float_rounding_mode);
270 roundNearestEven = ( roundingMode == float_round_nearest_even );
271 roundIncrement = 0x40;
272 if ( ! roundNearestEven ) {
273 if ( roundingMode == float_round_to_zero ) {
274 roundIncrement = 0;
275 }
276 else {
277 roundIncrement = 0x7F;
278 if ( zSign ) {
279 if ( roundingMode == float_round_up ) roundIncrement = 0;
280 }
281 else {
282 if ( roundingMode == float_round_down ) roundIncrement = 0;
283 }
284 }
285 }
286 roundBits = zSig & 0x7F;
287 if ( 0xFD <= (bits16) zExp ) {
288 if ( ( 0xFD < zExp )
289 || ( ( zExp == 0xFD )
290 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
291 ) {
292 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
293 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
294 }
295 if ( zExp < 0 ) {
296 isTiny =
297 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
298 || ( zExp < -1 )
299 || ( zSig + roundIncrement < 0x80000000 );
300 shift32RightJamming( zSig, - zExp, &zSig );
301 zExp = 0;
302 roundBits = zSig & 0x7F;
303 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
304 }
305 }
306 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
307 zSig = ( zSig + roundIncrement )>>7;
308 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
309 if ( zSig == 0 ) zExp = 0;
310 return packFloat32( zSign, zExp, zSig );
311
312}
313
314/*----------------------------------------------------------------------------
315| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
316| and significand `zSig', and returns the proper single-precision floating-
317| point value corresponding to the abstract input. This routine is just like
318| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
319| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
320| floating-point exponent.
321*----------------------------------------------------------------------------*/
322
323static float32
324 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
325{
326 int8 shiftCount;
327
328 shiftCount = countLeadingZeros32( zSig ) - 1;
329 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
330
331}
332
333/*----------------------------------------------------------------------------
334| Returns the fraction bits of the double-precision floating-point value `a'.
335*----------------------------------------------------------------------------*/
336
337INLINE bits64 extractFloat64Frac( float64 a )
338{
339
340 return a & LIT64( 0x000FFFFFFFFFFFFF );
341
342}
343
344/*----------------------------------------------------------------------------
345| Returns the exponent bits of the double-precision floating-point value `a'.
346*----------------------------------------------------------------------------*/
347
348INLINE int16 extractFloat64Exp( float64 a )
349{
350
351 return ( a>>52 ) & 0x7FF;
352
353}
354
355/*----------------------------------------------------------------------------
356| Returns the sign bit of the double-precision floating-point value `a'.
357*----------------------------------------------------------------------------*/
358
359INLINE flag extractFloat64Sign( float64 a )
360{
361
362 return a>>63;
363
364}
365
366/*----------------------------------------------------------------------------
367| Normalizes the subnormal double-precision floating-point value represented
368| by the denormalized significand `aSig'. The normalized exponent and
369| significand are stored at the locations pointed to by `zExpPtr' and
370| `zSigPtr', respectively.
371*----------------------------------------------------------------------------*/
372
373static void
374 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
375{
376 int8 shiftCount;
377
378 shiftCount = countLeadingZeros64( aSig ) - 11;
379 *zSigPtr = aSig<<shiftCount;
380 *zExpPtr = 1 - shiftCount;
381
382}
383
384/*----------------------------------------------------------------------------
385| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
386| double-precision floating-point value, returning the result. After being
387| shifted into the proper positions, the three fields are simply added
388| together to form the result. This means that any integer portion of `zSig'
389| will be added into the exponent. Since a properly normalized significand
390| will have an integer portion equal to 1, the `zExp' input should be 1 less
391| than the desired result exponent whenever `zSig' is a complete, normalized
392| significand.
393*----------------------------------------------------------------------------*/
394
395INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
396{
397
398 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
399
400}
401
402/*----------------------------------------------------------------------------
403| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
404| and significand `zSig', and returns the proper double-precision floating-
405| point value corresponding to the abstract input. Ordinarily, the abstract
406| value is simply rounded and packed into the double-precision format, with
407| the inexact exception raised if the abstract input cannot be represented
408| exactly. However, if the abstract value is too large, the overflow and
409| inexact exceptions are raised and an infinity or maximal finite value is
410| returned. If the abstract value is too small, the input value is rounded
411| to a subnormal number, and the underflow and inexact exceptions are raised
412| if the abstract input cannot be represented exactly as a subnormal double-
413| precision floating-point number.
414| The input significand `zSig' has its binary point between bits 62
415| and 61, which is 10 bits to the left of the usual location. This shifted
416| significand must be normalized or smaller. If `zSig' is not normalized,
417| `zExp' must be 0; in that case, the result returned is a subnormal number,
418| and it must not require rounding. In the usual case that `zSig' is
419| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
420| The handling of underflow and overflow follows the IEC/IEEE Standard for
421| Binary Floating-Point Arithmetic.
422*----------------------------------------------------------------------------*/
423
424static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
425{
426 int8 roundingMode;
427 flag roundNearestEven;
428 int16 roundIncrement, roundBits;
429 flag isTiny;
430
431 roundingMode = STATUS(float_rounding_mode);
432 roundNearestEven = ( roundingMode == float_round_nearest_even );
433 roundIncrement = 0x200;
434 if ( ! roundNearestEven ) {
435 if ( roundingMode == float_round_to_zero ) {
436 roundIncrement = 0;
437 }
438 else {
439 roundIncrement = 0x3FF;
440 if ( zSign ) {
441 if ( roundingMode == float_round_up ) roundIncrement = 0;
442 }
443 else {
444 if ( roundingMode == float_round_down ) roundIncrement = 0;
445 }
446 }
447 }
448 roundBits = zSig & 0x3FF;
449 if ( 0x7FD <= (bits16) zExp ) {
450 if ( ( 0x7FD < zExp )
451 || ( ( zExp == 0x7FD )
452 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
453 ) {
454 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
455 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
456 }
457 if ( zExp < 0 ) {
458 isTiny =
459 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
460 || ( zExp < -1 )
461 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
462 shift64RightJamming( zSig, - zExp, &zSig );
463 zExp = 0;
464 roundBits = zSig & 0x3FF;
465 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
466 }
467 }
468 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
469 zSig = ( zSig + roundIncrement )>>10;
470 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
471 if ( zSig == 0 ) zExp = 0;
472 return packFloat64( zSign, zExp, zSig );
473
474}
475
476/*----------------------------------------------------------------------------
477| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
478| and significand `zSig', and returns the proper double-precision floating-
479| point value corresponding to the abstract input. This routine is just like
480| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
481| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
482| floating-point exponent.
483*----------------------------------------------------------------------------*/
484
485static float64
486 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
487{
488 int8 shiftCount;
489
490 shiftCount = countLeadingZeros64( zSig ) - 1;
491 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
492
493}
494
495#ifdef FLOATX80
496
497/*----------------------------------------------------------------------------
498| Returns the fraction bits of the extended double-precision floating-point
499| value `a'.
500*----------------------------------------------------------------------------*/
501
502INLINE bits64 extractFloatx80Frac( floatx80 a )
503{
504
505 return a.low;
506
507}
508
509/*----------------------------------------------------------------------------
510| Returns the exponent bits of the extended double-precision floating-point
511| value `a'.
512*----------------------------------------------------------------------------*/
513
514INLINE int32 extractFloatx80Exp( floatx80 a )
515{
516
517 return a.high & 0x7FFF;
518
519}
520
521/*----------------------------------------------------------------------------
522| Returns the sign bit of the extended double-precision floating-point value
523| `a'.
524*----------------------------------------------------------------------------*/
525
526INLINE flag extractFloatx80Sign( floatx80 a )
527{
528
529 return a.high>>15;
530
531}
532
533/*----------------------------------------------------------------------------
534| Normalizes the subnormal extended double-precision floating-point value
535| represented by the denormalized significand `aSig'. The normalized exponent
536| and significand are stored at the locations pointed to by `zExpPtr' and
537| `zSigPtr', respectively.
538*----------------------------------------------------------------------------*/
539
540static void
541 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
542{
543 int8 shiftCount;
544
545 shiftCount = countLeadingZeros64( aSig );
546 *zSigPtr = aSig<<shiftCount;
547 *zExpPtr = 1 - shiftCount;
548
549}
550
551/*----------------------------------------------------------------------------
552| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
553| extended double-precision floating-point value, returning the result.
554*----------------------------------------------------------------------------*/
555
556INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
557{
558 floatx80 z;
559
560 z.low = zSig;
561 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
562 return z;
563
564}
565
566/*----------------------------------------------------------------------------
567| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
568| and extended significand formed by the concatenation of `zSig0' and `zSig1',
569| and returns the proper extended double-precision floating-point value
570| corresponding to the abstract input. Ordinarily, the abstract value is
571| rounded and packed into the extended double-precision format, with the
572| inexact exception raised if the abstract input cannot be represented
573| exactly. However, if the abstract value is too large, the overflow and
574| inexact exceptions are raised and an infinity or maximal finite value is
575| returned. If the abstract value is too small, the input value is rounded to
576| a subnormal number, and the underflow and inexact exceptions are raised if
577| the abstract input cannot be represented exactly as a subnormal extended
578| double-precision floating-point number.
579| If `roundingPrecision' is 32 or 64, the result is rounded to the same
580| number of bits as single or double precision, respectively. Otherwise, the
581| result is rounded to the full precision of the extended double-precision
582| format.
583| The input significand must be normalized or smaller. If the input
584| significand is not normalized, `zExp' must be 0; in that case, the result
585| returned is a subnormal number, and it must not require rounding. The
586| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
587| Floating-Point Arithmetic.
588*----------------------------------------------------------------------------*/
589
590static floatx80
591 roundAndPackFloatx80(
592 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
593 STATUS_PARAM)
594{
595 int8 roundingMode;
596 flag roundNearestEven, increment, isTiny;
597 int64 roundIncrement, roundMask, roundBits;
598
599 roundingMode = STATUS(float_rounding_mode);
600 roundNearestEven = ( roundingMode == float_round_nearest_even );
601 if ( roundingPrecision == 80 ) goto precision80;
602 if ( roundingPrecision == 64 ) {
603 roundIncrement = LIT64( 0x0000000000000400 );
604 roundMask = LIT64( 0x00000000000007FF );
605 }
606 else if ( roundingPrecision == 32 ) {
607 roundIncrement = LIT64( 0x0000008000000000 );
608 roundMask = LIT64( 0x000000FFFFFFFFFF );
609 }
610 else {
611 goto precision80;
612 }
613 zSig0 |= ( zSig1 != 0 );
614 if ( ! roundNearestEven ) {
615 if ( roundingMode == float_round_to_zero ) {
616 roundIncrement = 0;
617 }
618 else {
619 roundIncrement = roundMask;
620 if ( zSign ) {
621 if ( roundingMode == float_round_up ) roundIncrement = 0;
622 }
623 else {
624 if ( roundingMode == float_round_down ) roundIncrement = 0;
625 }
626 }
627 }
628 roundBits = zSig0 & roundMask;
629 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
630 if ( ( 0x7FFE < zExp )
631 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
632 ) {
633 goto overflow;
634 }
635 if ( zExp <= 0 ) {
636 isTiny =
637 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
638 || ( zExp < 0 )
639 || ( zSig0 <= zSig0 + roundIncrement );
640 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
641 zExp = 0;
642 roundBits = zSig0 & roundMask;
643 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
644 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
645 zSig0 += roundIncrement;
646 if ( (sbits64) zSig0 < 0 ) zExp = 1;
647 roundIncrement = roundMask + 1;
648 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
649 roundMask |= roundIncrement;
650 }
651 zSig0 &= ~ roundMask;
652 return packFloatx80( zSign, zExp, zSig0 );
653 }
654 }
655 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
656 zSig0 += roundIncrement;
657 if ( zSig0 < roundIncrement ) {
658 ++zExp;
659 zSig0 = LIT64( 0x8000000000000000 );
660 }
661 roundIncrement = roundMask + 1;
662 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
663 roundMask |= roundIncrement;
664 }
665 zSig0 &= ~ roundMask;
666 if ( zSig0 == 0 ) zExp = 0;
667 return packFloatx80( zSign, zExp, zSig0 );
668 precision80:
669 increment = ( (sbits64) zSig1 < 0 );
670 if ( ! roundNearestEven ) {
671 if ( roundingMode == float_round_to_zero ) {
672 increment = 0;
673 }
674 else {
675 if ( zSign ) {
676 increment = ( roundingMode == float_round_down ) && zSig1;
677 }
678 else {
679 increment = ( roundingMode == float_round_up ) && zSig1;
680 }
681 }
682 }
683 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
684 if ( ( 0x7FFE < zExp )
685 || ( ( zExp == 0x7FFE )
686 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
687 && increment
688 )
689 ) {
690 roundMask = 0;
691 overflow:
692 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
693 if ( ( roundingMode == float_round_to_zero )
694 || ( zSign && ( roundingMode == float_round_up ) )
695 || ( ! zSign && ( roundingMode == float_round_down ) )
696 ) {
697 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
698 }
699 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
700 }
701 if ( zExp <= 0 ) {
702 isTiny =
703 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
704 || ( zExp < 0 )
705 || ! increment
706 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
707 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
708 zExp = 0;
709 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
710 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
711 if ( roundNearestEven ) {
712 increment = ( (sbits64) zSig1 < 0 );
713 }
714 else {
715 if ( zSign ) {
716 increment = ( roundingMode == float_round_down ) && zSig1;
717 }
718 else {
719 increment = ( roundingMode == float_round_up ) && zSig1;
720 }
721 }
722 if ( increment ) {
723 ++zSig0;
724 zSig0 &=
725 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
726 if ( (sbits64) zSig0 < 0 ) zExp = 1;
727 }
728 return packFloatx80( zSign, zExp, zSig0 );
729 }
730 }
731 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
732 if ( increment ) {
733 ++zSig0;
734 if ( zSig0 == 0 ) {
735 ++zExp;
736 zSig0 = LIT64( 0x8000000000000000 );
737 }
738 else {
739 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
740 }
741 }
742 else {
743 if ( zSig0 == 0 ) zExp = 0;
744 }
745 return packFloatx80( zSign, zExp, zSig0 );
746
747}
748
749/*----------------------------------------------------------------------------
750| Takes an abstract floating-point value having sign `zSign', exponent
751| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
752| and returns the proper extended double-precision floating-point value
753| corresponding to the abstract input. This routine is just like
754| `roundAndPackFloatx80' except that the input significand does not have to be
755| normalized.
756*----------------------------------------------------------------------------*/
757
758static floatx80
759 normalizeRoundAndPackFloatx80(
760 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
761 STATUS_PARAM)
762{
763 int8 shiftCount;
764
765 if ( zSig0 == 0 ) {
766 zSig0 = zSig1;
767 zSig1 = 0;
768 zExp -= 64;
769 }
770 shiftCount = countLeadingZeros64( zSig0 );
771 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
772 zExp -= shiftCount;
773 return
774 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
775
776}
777
778#endif
779
780#ifdef FLOAT128
781
782/*----------------------------------------------------------------------------
783| Returns the least-significant 64 fraction bits of the quadruple-precision
784| floating-point value `a'.
785*----------------------------------------------------------------------------*/
786
787INLINE bits64 extractFloat128Frac1( float128 a )
788{
789
790 return a.low;
791
792}
793
794/*----------------------------------------------------------------------------
795| Returns the most-significant 48 fraction bits of the quadruple-precision
796| floating-point value `a'.
797*----------------------------------------------------------------------------*/
798
799INLINE bits64 extractFloat128Frac0( float128 a )
800{
801
802 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
803
804}
805
806/*----------------------------------------------------------------------------
807| Returns the exponent bits of the quadruple-precision floating-point value
808| `a'.
809*----------------------------------------------------------------------------*/
810
811INLINE int32 extractFloat128Exp( float128 a )
812{
813
814 return ( a.high>>48 ) & 0x7FFF;
815
816}
817
818/*----------------------------------------------------------------------------
819| Returns the sign bit of the quadruple-precision floating-point value `a'.
820*----------------------------------------------------------------------------*/
821
822INLINE flag extractFloat128Sign( float128 a )
823{
824
825 return a.high>>63;
826
827}
828
829/*----------------------------------------------------------------------------
830| Normalizes the subnormal quadruple-precision floating-point value
831| represented by the denormalized significand formed by the concatenation of
832| `aSig0' and `aSig1'. The normalized exponent is stored at the location
833| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
834| significand are stored at the location pointed to by `zSig0Ptr', and the
835| least significant 64 bits of the normalized significand are stored at the
836| location pointed to by `zSig1Ptr'.
837*----------------------------------------------------------------------------*/
838
839static void
840 normalizeFloat128Subnormal(
841 bits64 aSig0,
842 bits64 aSig1,
843 int32 *zExpPtr,
844 bits64 *zSig0Ptr,
845 bits64 *zSig1Ptr
846 )
847{
848 int8 shiftCount;
849
850 if ( aSig0 == 0 ) {
851 shiftCount = countLeadingZeros64( aSig1 ) - 15;
852 if ( shiftCount < 0 ) {
853 *zSig0Ptr = aSig1>>( - shiftCount );
854 *zSig1Ptr = aSig1<<( shiftCount & 63 );
855 }
856 else {
857 *zSig0Ptr = aSig1<<shiftCount;
858 *zSig1Ptr = 0;
859 }
860 *zExpPtr = - shiftCount - 63;
861 }
862 else {
863 shiftCount = countLeadingZeros64( aSig0 ) - 15;
864 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
865 *zExpPtr = 1 - shiftCount;
866 }
867
868}
869
870/*----------------------------------------------------------------------------
871| Packs the sign `zSign', the exponent `zExp', and the significand formed
872| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
873| floating-point value, returning the result. After being shifted into the
874| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
875| added together to form the most significant 32 bits of the result. This
876| means that any integer portion of `zSig0' will be added into the exponent.
877| Since a properly normalized significand will have an integer portion equal
878| to 1, the `zExp' input should be 1 less than the desired result exponent
879| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
880| significand.
881*----------------------------------------------------------------------------*/
882
883INLINE float128
884 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
885{
886 float128 z;
887
888 z.low = zSig1;
889 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
890 return z;
891
892}
893
894/*----------------------------------------------------------------------------
895| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
896| and extended significand formed by the concatenation of `zSig0', `zSig1',
897| and `zSig2', and returns the proper quadruple-precision floating-point value
898| corresponding to the abstract input. Ordinarily, the abstract value is
899| simply rounded and packed into the quadruple-precision format, with the
900| inexact exception raised if the abstract input cannot be represented
901| exactly. However, if the abstract value is too large, the overflow and
902| inexact exceptions are raised and an infinity or maximal finite value is
903| returned. If the abstract value is too small, the input value is rounded to
904| a subnormal number, and the underflow and inexact exceptions are raised if
905| the abstract input cannot be represented exactly as a subnormal quadruple-
906| precision floating-point number.
907| The input significand must be normalized or smaller. If the input
908| significand is not normalized, `zExp' must be 0; in that case, the result
909| returned is a subnormal number, and it must not require rounding. In the
910| usual case that the input significand is normalized, `zExp' must be 1 less
911| than the ``true'' floating-point exponent. The handling of underflow and
912| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
913*----------------------------------------------------------------------------*/
914
915static float128
916 roundAndPackFloat128(
917 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
918{
919 int8 roundingMode;
920 flag roundNearestEven, increment, isTiny;
921
922 roundingMode = STATUS(float_rounding_mode);
923 roundNearestEven = ( roundingMode == float_round_nearest_even );
924 increment = ( (sbits64) zSig2 < 0 );
925 if ( ! roundNearestEven ) {
926 if ( roundingMode == float_round_to_zero ) {
927 increment = 0;
928 }
929 else {
930 if ( zSign ) {
931 increment = ( roundingMode == float_round_down ) && zSig2;
932 }
933 else {
934 increment = ( roundingMode == float_round_up ) && zSig2;
935 }
936 }
937 }
938 if ( 0x7FFD <= (bits32) zExp ) {
939 if ( ( 0x7FFD < zExp )
940 || ( ( zExp == 0x7FFD )
941 && eq128(
942 LIT64( 0x0001FFFFFFFFFFFF ),
943 LIT64( 0xFFFFFFFFFFFFFFFF ),
944 zSig0,
945 zSig1
946 )
947 && increment
948 )
949 ) {
950 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
951 if ( ( roundingMode == float_round_to_zero )
952 || ( zSign && ( roundingMode == float_round_up ) )
953 || ( ! zSign && ( roundingMode == float_round_down ) )
954 ) {
955 return
956 packFloat128(
957 zSign,
958 0x7FFE,
959 LIT64( 0x0000FFFFFFFFFFFF ),
960 LIT64( 0xFFFFFFFFFFFFFFFF )
961 );
962 }
963 return packFloat128( zSign, 0x7FFF, 0, 0 );
964 }
965 if ( zExp < 0 ) {
966 isTiny =
967 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
968 || ( zExp < -1 )
969 || ! increment
970 || lt128(
971 zSig0,
972 zSig1,
973 LIT64( 0x0001FFFFFFFFFFFF ),
974 LIT64( 0xFFFFFFFFFFFFFFFF )
975 );
976 shift128ExtraRightJamming(
977 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
978 zExp = 0;
979 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
980 if ( roundNearestEven ) {
981 increment = ( (sbits64) zSig2 < 0 );
982 }
983 else {
984 if ( zSign ) {
985 increment = ( roundingMode == float_round_down ) && zSig2;
986 }
987 else {
988 increment = ( roundingMode == float_round_up ) && zSig2;
989 }
990 }
991 }
992 }
993 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
994 if ( increment ) {
995 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
996 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
997 }
998 else {
999 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1000 }
1001 return packFloat128( zSign, zExp, zSig0, zSig1 );
1002
1003}
1004
1005/*----------------------------------------------------------------------------
1006| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007| and significand formed by the concatenation of `zSig0' and `zSig1', and
1008| returns the proper quadruple-precision floating-point value corresponding
1009| to the abstract input. This routine is just like `roundAndPackFloat128'
1010| except that the input significand has fewer bits and does not have to be
1011| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1012| point exponent.
1013*----------------------------------------------------------------------------*/
1014
1015static float128
1016 normalizeRoundAndPackFloat128(
1017 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1018{
1019 int8 shiftCount;
1020 bits64 zSig2;
1021
1022 if ( zSig0 == 0 ) {
1023 zSig0 = zSig1;
1024 zSig1 = 0;
1025 zExp -= 64;
1026 }
1027 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1028 if ( 0 <= shiftCount ) {
1029 zSig2 = 0;
1030 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1031 }
1032 else {
1033 shift128ExtraRightJamming(
1034 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1035 }
1036 zExp -= shiftCount;
1037 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1038
1039}
1040
1041#endif
1042
1043/*----------------------------------------------------------------------------
1044| Returns the result of converting the 32-bit two's complement integer `a'
1045| to the single-precision floating-point format. The conversion is performed
1046| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047*----------------------------------------------------------------------------*/
1048
1049float32 int32_to_float32( int32 a STATUS_PARAM )
1050{
1051 flag zSign;
1052
1053 if ( a == 0 ) return 0;
1054 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1055 zSign = ( a < 0 );
1056 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1057
1058}
1059
1060/*----------------------------------------------------------------------------
1061| Returns the result of converting the 32-bit two's complement integer `a'
1062| to the double-precision floating-point format. The conversion is performed
1063| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064*----------------------------------------------------------------------------*/
1065
1066float64 int32_to_float64( int32 a STATUS_PARAM )
1067{
1068 flag zSign;
1069 uint32 absA;
1070 int8 shiftCount;
1071 bits64 zSig;
1072
1073 if ( a == 0 ) return 0;
1074 zSign = ( a < 0 );
1075 absA = zSign ? - a : a;
1076 shiftCount = countLeadingZeros32( absA ) + 21;
1077 zSig = absA;
1078 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1079
1080}
1081
1082#ifdef FLOATX80
1083
1084/*----------------------------------------------------------------------------
1085| Returns the result of converting the 32-bit two's complement integer `a'
1086| to the extended double-precision floating-point format. The conversion
1087| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1088| Arithmetic.
1089*----------------------------------------------------------------------------*/
1090
1091floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1092{
1093 flag zSign;
1094 uint32 absA;
1095 int8 shiftCount;
1096 bits64 zSig;
1097
1098 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1099 zSign = ( a < 0 );
1100 absA = zSign ? - a : a;
1101 shiftCount = countLeadingZeros32( absA ) + 32;
1102 zSig = absA;
1103 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1104
1105}
1106
1107#endif
1108
1109#ifdef FLOAT128
1110
1111/*----------------------------------------------------------------------------
1112| Returns the result of converting the 32-bit two's complement integer `a' to
1113| the quadruple-precision floating-point format. The conversion is performed
1114| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115*----------------------------------------------------------------------------*/
1116
1117float128 int32_to_float128( int32 a STATUS_PARAM )
1118{
1119 flag zSign;
1120 uint32 absA;
1121 int8 shiftCount;
1122 bits64 zSig0;
1123
1124 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1125 zSign = ( a < 0 );
1126 absA = zSign ? - a : a;
1127 shiftCount = countLeadingZeros32( absA ) + 17;
1128 zSig0 = absA;
1129 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1130
1131}
1132
1133#endif
1134
1135/*----------------------------------------------------------------------------
1136| Returns the result of converting the 64-bit two's complement integer `a'
1137| to the single-precision floating-point format. The conversion is performed
1138| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139*----------------------------------------------------------------------------*/
1140
1141float32 int64_to_float32( int64 a STATUS_PARAM )
1142{
1143 flag zSign;
1144 uint64 absA;
1145 int8 shiftCount;
1146
1147 if ( a == 0 ) return 0;
1148 zSign = ( a < 0 );
1149 absA = zSign ? - a : a;
1150 shiftCount = countLeadingZeros64( absA ) - 40;
1151 if ( 0 <= shiftCount ) {
1152 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1153 }
1154 else {
1155 shiftCount += 7;
1156 if ( shiftCount < 0 ) {
1157 shift64RightJamming( absA, - shiftCount, &absA );
1158 }
1159 else {
1160 absA <<= shiftCount;
1161 }
1162 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1163 }
1164
1165}
1166
1167/*----------------------------------------------------------------------------
1168| Returns the result of converting the 64-bit two's complement integer `a'
1169| to the double-precision floating-point format. The conversion is performed
1170| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1171*----------------------------------------------------------------------------*/
1172
1173float64 int64_to_float64( int64 a STATUS_PARAM )
1174{
1175 flag zSign;
1176
1177 if ( a == 0 ) return 0;
1178 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1179 return packFloat64( 1, 0x43E, 0 );
1180 }
1181 zSign = ( a < 0 );
1182 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1183
1184}
1185
1186#ifdef FLOATX80
1187
1188/*----------------------------------------------------------------------------
1189| Returns the result of converting the 64-bit two's complement integer `a'
1190| to the extended double-precision floating-point format. The conversion
1191| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1192| Arithmetic.
1193*----------------------------------------------------------------------------*/
1194
1195floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1196{
1197 flag zSign;
1198 uint64 absA;
1199 int8 shiftCount;
1200
1201 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1202 zSign = ( a < 0 );
1203 absA = zSign ? - a : a;
1204 shiftCount = countLeadingZeros64( absA );
1205 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1206
1207}
1208
1209#endif
1210
1211#ifdef FLOAT128
1212
1213/*----------------------------------------------------------------------------
1214| Returns the result of converting the 64-bit two's complement integer `a' to
1215| the quadruple-precision floating-point format. The conversion is performed
1216| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1217*----------------------------------------------------------------------------*/
1218
1219float128 int64_to_float128( int64 a STATUS_PARAM )
1220{
1221 flag zSign;
1222 uint64 absA;
1223 int8 shiftCount;
1224 int32 zExp;
1225 bits64 zSig0, zSig1;
1226
1227 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1228 zSign = ( a < 0 );
1229 absA = zSign ? - a : a;
1230 shiftCount = countLeadingZeros64( absA ) + 49;
1231 zExp = 0x406E - shiftCount;
1232 if ( 64 <= shiftCount ) {
1233 zSig1 = 0;
1234 zSig0 = absA;
1235 shiftCount -= 64;
1236 }
1237 else {
1238 zSig1 = absA;
1239 zSig0 = 0;
1240 }
1241 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1242 return packFloat128( zSign, zExp, zSig0, zSig1 );
1243
1244}
1245
1246#endif
1247
1248/*----------------------------------------------------------------------------
1249| Returns the result of converting the single-precision floating-point value
1250| `a' to the 32-bit two's complement integer format. The conversion is
1251| performed according to the IEC/IEEE Standard for Binary Floating-Point
1252| Arithmetic---which means in particular that the conversion is rounded
1253| according to the current rounding mode. If `a' is a NaN, the largest
1254| positive integer is returned. Otherwise, if the conversion overflows, the
1255| largest integer with the same sign as `a' is returned.
1256*----------------------------------------------------------------------------*/
1257
1258int32 float32_to_int32( float32 a STATUS_PARAM )
1259{
1260 flag aSign;
1261 int16 aExp, shiftCount;
1262 bits32 aSig;
1263 bits64 aSig64;
1264
1265 aSig = extractFloat32Frac( a );
1266 aExp = extractFloat32Exp( a );
1267 aSign = extractFloat32Sign( a );
1268 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1269 if ( aExp ) aSig |= 0x00800000;
1270 shiftCount = 0xAF - aExp;
1271 aSig64 = aSig;
1272 aSig64 <<= 32;
1273 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1274 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1275
1276}
1277
1278/*----------------------------------------------------------------------------
1279| Returns the result of converting the single-precision floating-point value
1280| `a' to the 32-bit two's complement integer format. The conversion is
1281| performed according to the IEC/IEEE Standard for Binary Floating-Point
1282| Arithmetic, except that the conversion is always rounded toward zero.
1283| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1284| the conversion overflows, the largest integer with the same sign as `a' is
1285| returned.
1286*----------------------------------------------------------------------------*/
1287
1288int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1289{
1290 flag aSign;
1291 int16 aExp, shiftCount;
1292 bits32 aSig;
1293 int32 z;
1294
1295 aSig = extractFloat32Frac( a );
1296 aExp = extractFloat32Exp( a );
1297 aSign = extractFloat32Sign( a );
1298 shiftCount = aExp - 0x9E;
1299 if ( 0 <= shiftCount ) {
1300 if ( a != 0xCF000000 ) {
1301 float_raise( float_flag_invalid STATUS_VAR);
1302 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1303 }
1304 return (sbits32) 0x80000000;
1305 }
1306 else if ( aExp <= 0x7E ) {
1307 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1308 return 0;
1309 }
1310 aSig = ( aSig | 0x00800000 )<<8;
1311 z = aSig>>( - shiftCount );
1312 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1313 STATUS(float_exception_flags) |= float_flag_inexact;
1314 }
1315 if ( aSign ) z = - z;
1316 return z;
1317
1318}
1319
1320/*----------------------------------------------------------------------------
1321| Returns the result of converting the single-precision floating-point value
1322| `a' to the 64-bit two's complement integer format. The conversion is
1323| performed according to the IEC/IEEE Standard for Binary Floating-Point
1324| Arithmetic---which means in particular that the conversion is rounded
1325| according to the current rounding mode. If `a' is a NaN, the largest
1326| positive integer is returned. Otherwise, if the conversion overflows, the
1327| largest integer with the same sign as `a' is returned.
1328*----------------------------------------------------------------------------*/
1329
1330int64 float32_to_int64( float32 a STATUS_PARAM )
1331{
1332 flag aSign;
1333 int16 aExp, shiftCount;
1334 bits32 aSig;
1335 bits64 aSig64, aSigExtra;
1336
1337 aSig = extractFloat32Frac( a );
1338 aExp = extractFloat32Exp( a );
1339 aSign = extractFloat32Sign( a );
1340 shiftCount = 0xBE - aExp;
1341 if ( shiftCount < 0 ) {
1342 float_raise( float_flag_invalid STATUS_VAR);
1343 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1344 return LIT64( 0x7FFFFFFFFFFFFFFF );
1345 }
1346 return (sbits64) LIT64( 0x8000000000000000 );
1347 }
1348 if ( aExp ) aSig |= 0x00800000;
1349 aSig64 = aSig;
1350 aSig64 <<= 40;
1351 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1352 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1353
1354}
1355
1356/*----------------------------------------------------------------------------
1357| Returns the result of converting the single-precision floating-point value
1358| `a' to the 64-bit two's complement integer format. The conversion is
1359| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360| Arithmetic, except that the conversion is always rounded toward zero. If
1361| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1362| conversion overflows, the largest integer with the same sign as `a' is
1363| returned.
1364*----------------------------------------------------------------------------*/
1365
1366int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1367{
1368 flag aSign;
1369 int16 aExp, shiftCount;
1370 bits32 aSig;
1371 bits64 aSig64;
1372 int64 z;
1373
1374 aSig = extractFloat32Frac( a );
1375 aExp = extractFloat32Exp( a );
1376 aSign = extractFloat32Sign( a );
1377 shiftCount = aExp - 0xBE;
1378 if ( 0 <= shiftCount ) {
1379 if ( a != 0xDF000000 ) {
1380 float_raise( float_flag_invalid STATUS_VAR);
1381 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1382 return LIT64( 0x7FFFFFFFFFFFFFFF );
1383 }
1384 }
1385 return (sbits64) LIT64( 0x8000000000000000 );
1386 }
1387 else if ( aExp <= 0x7E ) {
1388 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1389 return 0;
1390 }
1391 aSig64 = aSig | 0x00800000;
1392 aSig64 <<= 40;
1393 z = aSig64>>( - shiftCount );
1394 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1395 STATUS(float_exception_flags) |= float_flag_inexact;
1396 }
1397 if ( aSign ) z = - z;
1398 return z;
1399
1400}
1401
1402/*----------------------------------------------------------------------------
1403| Returns the result of converting the single-precision floating-point value
1404| `a' to the double-precision floating-point format. The conversion is
1405| performed according to the IEC/IEEE Standard for Binary Floating-Point
1406| Arithmetic.
1407*----------------------------------------------------------------------------*/
1408
1409float64 float32_to_float64( float32 a STATUS_PARAM )
1410{
1411 flag aSign;
1412 int16 aExp;
1413 bits32 aSig;
1414
1415 aSig = extractFloat32Frac( a );
1416 aExp = extractFloat32Exp( a );
1417 aSign = extractFloat32Sign( a );
1418 if ( aExp == 0xFF ) {
1419 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1420 return packFloat64( aSign, 0x7FF, 0 );
1421 }
1422 if ( aExp == 0 ) {
1423 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1424 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1425 --aExp;
1426 }
1427 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1428
1429}
1430
1431#ifdef FLOATX80
1432
1433/*----------------------------------------------------------------------------
1434| Returns the result of converting the single-precision floating-point value
1435| `a' to the extended double-precision floating-point format. The conversion
1436| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1437| Arithmetic.
1438*----------------------------------------------------------------------------*/
1439
1440floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1441{
1442 flag aSign;
1443 int16 aExp;
1444 bits32 aSig;
1445
1446 aSig = extractFloat32Frac( a );
1447 aExp = extractFloat32Exp( a );
1448 aSign = extractFloat32Sign( a );
1449 if ( aExp == 0xFF ) {
1450 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1451 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1452 }
1453 if ( aExp == 0 ) {
1454 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1455 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1456 }
1457 aSig |= 0x00800000;
1458 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1459
1460}
1461
1462#endif
1463
1464#ifdef FLOAT128
1465
1466/*----------------------------------------------------------------------------
1467| Returns the result of converting the single-precision floating-point value
1468| `a' to the double-precision floating-point format. The conversion is
1469| performed according to the IEC/IEEE Standard for Binary Floating-Point
1470| Arithmetic.
1471*----------------------------------------------------------------------------*/
1472
1473float128 float32_to_float128( float32 a STATUS_PARAM )
1474{
1475 flag aSign;
1476 int16 aExp;
1477 bits32 aSig;
1478
1479 aSig = extractFloat32Frac( a );
1480 aExp = extractFloat32Exp( a );
1481 aSign = extractFloat32Sign( a );
1482 if ( aExp == 0xFF ) {
1483 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1484 return packFloat128( aSign, 0x7FFF, 0, 0 );
1485 }
1486 if ( aExp == 0 ) {
1487 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1488 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1489 --aExp;
1490 }
1491 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1492
1493}
1494
1495#endif
1496
1497/*----------------------------------------------------------------------------
1498| Rounds the single-precision floating-point value `a' to an integer, and
1499| returns the result as a single-precision floating-point value. The
1500| operation is performed according to the IEC/IEEE Standard for Binary
1501| Floating-Point Arithmetic.
1502*----------------------------------------------------------------------------*/
1503
1504float32 float32_round_to_int( float32 a STATUS_PARAM)
1505{
1506 flag aSign;
1507 int16 aExp;
1508 bits32 lastBitMask, roundBitsMask;
1509 int8 roundingMode;
1510 float32 z;
1511
1512 aExp = extractFloat32Exp( a );
1513 if ( 0x96 <= aExp ) {
1514 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1515 return propagateFloat32NaN( a, a STATUS_VAR );
1516 }
1517 return a;
1518 }
1519 if ( aExp <= 0x7E ) {
1520 if ( (bits32) ( a<<1 ) == 0 ) return a;
1521 STATUS(float_exception_flags) |= float_flag_inexact;
1522 aSign = extractFloat32Sign( a );
1523 switch ( STATUS(float_rounding_mode) ) {
1524 case float_round_nearest_even:
1525 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1526 return packFloat32( aSign, 0x7F, 0 );
1527 }
1528 break;
1529 case float_round_down:
1530 return aSign ? 0xBF800000 : 0;
1531 case float_round_up:
1532 return aSign ? 0x80000000 : 0x3F800000;
1533 }
1534 return packFloat32( aSign, 0, 0 );
1535 }
1536 lastBitMask = 1;
1537 lastBitMask <<= 0x96 - aExp;
1538 roundBitsMask = lastBitMask - 1;
1539 z = a;
1540 roundingMode = STATUS(float_rounding_mode);
1541 if ( roundingMode == float_round_nearest_even ) {
1542 z += lastBitMask>>1;
1543 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1544 }
1545 else if ( roundingMode != float_round_to_zero ) {
1546 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1547 z += roundBitsMask;
1548 }
1549 }
1550 z &= ~ roundBitsMask;
1551 if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
1552 return z;
1553
1554}
1555
1556/*----------------------------------------------------------------------------
1557| Returns the result of adding the absolute values of the single-precision
1558| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1559| before being returned. `zSign' is ignored if the result is a NaN.
1560| The addition is performed according to the IEC/IEEE Standard for Binary
1561| Floating-Point Arithmetic.
1562*----------------------------------------------------------------------------*/
1563
1564static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1565{
1566 int16 aExp, bExp, zExp;
1567 bits32 aSig, bSig, zSig;
1568 int16 expDiff;
1569
1570 aSig = extractFloat32Frac( a );
1571 aExp = extractFloat32Exp( a );
1572 bSig = extractFloat32Frac( b );
1573 bExp = extractFloat32Exp( b );
1574 expDiff = aExp - bExp;
1575 aSig <<= 6;
1576 bSig <<= 6;
1577 if ( 0 < expDiff ) {
1578 if ( aExp == 0xFF ) {
1579 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1580 return a;
1581 }
1582 if ( bExp == 0 ) {
1583 --expDiff;
1584 }
1585 else {
1586 bSig |= 0x20000000;
1587 }
1588 shift32RightJamming( bSig, expDiff, &bSig );
1589 zExp = aExp;
1590 }
1591 else if ( expDiff < 0 ) {
1592 if ( bExp == 0xFF ) {
1593 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1594 return packFloat32( zSign, 0xFF, 0 );
1595 }
1596 if ( aExp == 0 ) {
1597 ++expDiff;
1598 }
1599 else {
1600 aSig |= 0x20000000;
1601 }
1602 shift32RightJamming( aSig, - expDiff, &aSig );
1603 zExp = bExp;
1604 }
1605 else {
1606 if ( aExp == 0xFF ) {
1607 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1608 return a;
1609 }
1610 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1611 zSig = 0x40000000 + aSig + bSig;
1612 zExp = aExp;
1613 goto roundAndPack;
1614 }
1615 aSig |= 0x20000000;
1616 zSig = ( aSig + bSig )<<1;
1617 --zExp;
1618 if ( (sbits32) zSig < 0 ) {
1619 zSig = aSig + bSig;
1620 ++zExp;
1621 }
1622 roundAndPack:
1623 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1624
1625}
1626
1627/*----------------------------------------------------------------------------
1628| Returns the result of subtracting the absolute values of the single-
1629| precision floating-point values `a' and `b'. If `zSign' is 1, the
1630| difference is negated before being returned. `zSign' is ignored if the
1631| result is a NaN. The subtraction is performed according to the IEC/IEEE
1632| Standard for Binary Floating-Point Arithmetic.
1633*----------------------------------------------------------------------------*/
1634
1635static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1636{
1637 int16 aExp, bExp, zExp;
1638 bits32 aSig, bSig, zSig;
1639 int16 expDiff;
1640
1641 aSig = extractFloat32Frac( a );
1642 aExp = extractFloat32Exp( a );
1643 bSig = extractFloat32Frac( b );
1644 bExp = extractFloat32Exp( b );
1645 expDiff = aExp - bExp;
1646 aSig <<= 7;
1647 bSig <<= 7;
1648 if ( 0 < expDiff ) goto aExpBigger;
1649 if ( expDiff < 0 ) goto bExpBigger;
1650 if ( aExp == 0xFF ) {
1651 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1652 float_raise( float_flag_invalid STATUS_VAR);
1653 return float32_default_nan;
1654 }
1655 if ( aExp == 0 ) {
1656 aExp = 1;
1657 bExp = 1;
1658 }
1659 if ( bSig < aSig ) goto aBigger;
1660 if ( aSig < bSig ) goto bBigger;
1661 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1662 bExpBigger:
1663 if ( bExp == 0xFF ) {
1664 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1665 return packFloat32( zSign ^ 1, 0xFF, 0 );
1666 }
1667 if ( aExp == 0 ) {
1668 ++expDiff;
1669 }
1670 else {
1671 aSig |= 0x40000000;
1672 }
1673 shift32RightJamming( aSig, - expDiff, &aSig );
1674 bSig |= 0x40000000;
1675 bBigger:
1676 zSig = bSig - aSig;
1677 zExp = bExp;
1678 zSign ^= 1;
1679 goto normalizeRoundAndPack;
1680 aExpBigger:
1681 if ( aExp == 0xFF ) {
1682 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1683 return a;
1684 }
1685 if ( bExp == 0 ) {
1686 --expDiff;
1687 }
1688 else {
1689 bSig |= 0x40000000;
1690 }
1691 shift32RightJamming( bSig, expDiff, &bSig );
1692 aSig |= 0x40000000;
1693 aBigger:
1694 zSig = aSig - bSig;
1695 zExp = aExp;
1696 normalizeRoundAndPack:
1697 --zExp;
1698 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1699
1700}
1701
1702/*----------------------------------------------------------------------------
1703| Returns the result of adding the single-precision floating-point values `a'
1704| and `b'. The operation is performed according to the IEC/IEEE Standard for
1705| Binary Floating-Point Arithmetic.
1706*----------------------------------------------------------------------------*/
1707
1708float32 float32_add( float32 a, float32 b STATUS_PARAM )
1709{
1710 flag aSign, bSign;
1711
1712 aSign = extractFloat32Sign( a );
1713 bSign = extractFloat32Sign( b );
1714 if ( aSign == bSign ) {
1715 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1716 }
1717 else {
1718 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1719 }
1720
1721}
1722
1723/*----------------------------------------------------------------------------
1724| Returns the result of subtracting the single-precision floating-point values
1725| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1726| for Binary Floating-Point Arithmetic.
1727*----------------------------------------------------------------------------*/
1728
1729float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1730{
1731 flag aSign, bSign;
1732
1733 aSign = extractFloat32Sign( a );
1734 bSign = extractFloat32Sign( b );
1735 if ( aSign == bSign ) {
1736 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1737 }
1738 else {
1739 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1740 }
1741
1742}
1743
1744/*----------------------------------------------------------------------------
1745| Returns the result of multiplying the single-precision floating-point values
1746| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1747| for Binary Floating-Point Arithmetic.
1748*----------------------------------------------------------------------------*/
1749
1750float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1751{
1752 flag aSign, bSign, zSign;
1753 int16 aExp, bExp, zExp;
1754 bits32 aSig, bSig;
1755 bits64 zSig64;
1756 bits32 zSig;
1757
1758 aSig = extractFloat32Frac( a );
1759 aExp = extractFloat32Exp( a );
1760 aSign = extractFloat32Sign( a );
1761 bSig = extractFloat32Frac( b );
1762 bExp = extractFloat32Exp( b );
1763 bSign = extractFloat32Sign( b );
1764 zSign = aSign ^ bSign;
1765 if ( aExp == 0xFF ) {
1766 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1767 return propagateFloat32NaN( a, b STATUS_VAR );
1768 }
1769 if ( ( bExp | bSig ) == 0 ) {
1770 float_raise( float_flag_invalid STATUS_VAR);
1771 return float32_default_nan;
1772 }
1773 return packFloat32( zSign, 0xFF, 0 );
1774 }
1775 if ( bExp == 0xFF ) {
1776 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1777 if ( ( aExp | aSig ) == 0 ) {
1778 float_raise( float_flag_invalid STATUS_VAR);
1779 return float32_default_nan;
1780 }
1781 return packFloat32( zSign, 0xFF, 0 );
1782 }
1783 if ( aExp == 0 ) {
1784 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1785 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1786 }
1787 if ( bExp == 0 ) {
1788 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1789 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1790 }
1791 zExp = aExp + bExp - 0x7F;
1792 aSig = ( aSig | 0x00800000 )<<7;
1793 bSig = ( bSig | 0x00800000 )<<8;
1794 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1795 zSig = zSig64;
1796 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1797 zSig <<= 1;
1798 --zExp;
1799 }
1800 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1801
1802}
1803
1804/*----------------------------------------------------------------------------
1805| Returns the result of dividing the single-precision floating-point value `a'
1806| by the corresponding value `b'. The operation is performed according to the
1807| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1808*----------------------------------------------------------------------------*/
1809
1810float32 float32_div( float32 a, float32 b STATUS_PARAM )
1811{
1812 flag aSign, bSign, zSign;
1813 int16 aExp, bExp, zExp;
1814 bits32 aSig, bSig, zSig;
1815
1816 aSig = extractFloat32Frac( a );
1817 aExp = extractFloat32Exp( a );
1818 aSign = extractFloat32Sign( a );
1819 bSig = extractFloat32Frac( b );
1820 bExp = extractFloat32Exp( b );
1821 bSign = extractFloat32Sign( b );
1822 zSign = aSign ^ bSign;
1823 if ( aExp == 0xFF ) {
1824 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1825 if ( bExp == 0xFF ) {
1826 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827 float_raise( float_flag_invalid STATUS_VAR);
1828 return float32_default_nan;
1829 }
1830 return packFloat32( zSign, 0xFF, 0 );
1831 }
1832 if ( bExp == 0xFF ) {
1833 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834 return packFloat32( zSign, 0, 0 );
1835 }
1836 if ( bExp == 0 ) {
1837 if ( bSig == 0 ) {
1838 if ( ( aExp | aSig ) == 0 ) {
1839 float_raise( float_flag_invalid STATUS_VAR);
1840 return float32_default_nan;
1841 }
1842 float_raise( float_flag_divbyzero STATUS_VAR);
1843 return packFloat32( zSign, 0xFF, 0 );
1844 }
1845 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1846 }
1847 if ( aExp == 0 ) {
1848 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1849 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1850 }
1851 zExp = aExp - bExp + 0x7D;
1852 aSig = ( aSig | 0x00800000 )<<7;
1853 bSig = ( bSig | 0x00800000 )<<8;
1854 if ( bSig <= ( aSig + aSig ) ) {
1855 aSig >>= 1;
1856 ++zExp;
1857 }
1858 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1859 if ( ( zSig & 0x3F ) == 0 ) {
1860 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1861 }
1862 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1863
1864}
1865
1866/*----------------------------------------------------------------------------
1867| Returns the remainder of the single-precision floating-point value `a'
1868| with respect to the corresponding value `b'. The operation is performed
1869| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1870*----------------------------------------------------------------------------*/
1871
1872float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1873{
1874 flag aSign, bSign, zSign;
1875 int16 aExp, bExp, expDiff;
1876 bits32 aSig, bSig;
1877 bits32 q;
1878 bits64 aSig64, bSig64, q64;
1879 bits32 alternateASig;
1880 sbits32 sigMean;
1881
1882 aSig = extractFloat32Frac( a );
1883 aExp = extractFloat32Exp( a );
1884 aSign = extractFloat32Sign( a );
1885 bSig = extractFloat32Frac( b );
1886 bExp = extractFloat32Exp( b );
1887 bSign = extractFloat32Sign( b );
1888 if ( aExp == 0xFF ) {
1889 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1890 return propagateFloat32NaN( a, b STATUS_VAR );
1891 }
1892 float_raise( float_flag_invalid STATUS_VAR);
1893 return float32_default_nan;
1894 }
1895 if ( bExp == 0xFF ) {
1896 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897 return a;
1898 }
1899 if ( bExp == 0 ) {
1900 if ( bSig == 0 ) {
1901 float_raise( float_flag_invalid STATUS_VAR);
1902 return float32_default_nan;
1903 }
1904 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1905 }
1906 if ( aExp == 0 ) {
1907 if ( aSig == 0 ) return a;
1908 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1909 }
1910 expDiff = aExp - bExp;
1911 aSig |= 0x00800000;
1912 bSig |= 0x00800000;
1913 if ( expDiff < 32 ) {
1914 aSig <<= 8;
1915 bSig <<= 8;
1916 if ( expDiff < 0 ) {
1917 if ( expDiff < -1 ) return a;
1918 aSig >>= 1;
1919 }
1920 q = ( bSig <= aSig );
1921 if ( q ) aSig -= bSig;
1922 if ( 0 < expDiff ) {
1923 q = ( ( (bits64) aSig )<<32 ) / bSig;
1924 q >>= 32 - expDiff;
1925 bSig >>= 2;
1926 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1927 }
1928 else {
1929 aSig >>= 2;
1930 bSig >>= 2;
1931 }
1932 }
1933 else {
1934 if ( bSig <= aSig ) aSig -= bSig;
1935 aSig64 = ( (bits64) aSig )<<40;
1936 bSig64 = ( (bits64) bSig )<<40;
1937 expDiff -= 64;
1938 while ( 0 < expDiff ) {
1939 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1940 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1941 aSig64 = - ( ( bSig * q64 )<<38 );
1942 expDiff -= 62;
1943 }
1944 expDiff += 64;
1945 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1946 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1947 q = q64>>( 64 - expDiff );
1948 bSig <<= 6;
1949 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1950 }
1951 do {
1952 alternateASig = aSig;
1953 ++q;
1954 aSig -= bSig;
1955 } while ( 0 <= (sbits32) aSig );
1956 sigMean = aSig + alternateASig;
1957 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1958 aSig = alternateASig;
1959 }
1960 zSign = ( (sbits32) aSig < 0 );
1961 if ( zSign ) aSig = - aSig;
1962 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1963
1964}
1965
1966/*----------------------------------------------------------------------------
1967| Returns the square root of the single-precision floating-point value `a'.
1968| The operation is performed according to the IEC/IEEE Standard for Binary
1969| Floating-Point Arithmetic.
1970*----------------------------------------------------------------------------*/
1971
1972float32 float32_sqrt( float32 a STATUS_PARAM )
1973{
1974 flag aSign;
1975 int16 aExp, zExp;
1976 bits32 aSig, zSig;
1977 bits64 rem, term;
1978
1979 aSig = extractFloat32Frac( a );
1980 aExp = extractFloat32Exp( a );
1981 aSign = extractFloat32Sign( a );
1982 if ( aExp == 0xFF ) {
1983 if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
1984 if ( ! aSign ) return a;
1985 float_raise( float_flag_invalid STATUS_VAR);
1986 return float32_default_nan;
1987 }
1988 if ( aSign ) {
1989 if ( ( aExp | aSig ) == 0 ) return a;
1990 float_raise( float_flag_invalid STATUS_VAR);
1991 return float32_default_nan;
1992 }
1993 if ( aExp == 0 ) {
1994 if ( aSig == 0 ) return 0;
1995 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1996 }
1997 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1998 aSig = ( aSig | 0x00800000 )<<8;
1999 zSig = estimateSqrt32( aExp, aSig ) + 2;
2000 if ( ( zSig & 0x7F ) <= 5 ) {
2001 if ( zSig < 2 ) {
2002 zSig = 0x7FFFFFFF;
2003 goto roundAndPack;
2004 }
2005 aSig >>= aExp & 1;
2006 term = ( (bits64) zSig ) * zSig;
2007 rem = ( ( (bits64) aSig )<<32 ) - term;
2008 while ( (sbits64) rem < 0 ) {
2009 --zSig;
2010 rem += ( ( (bits64) zSig )<<1 ) | 1;
2011 }
2012 zSig |= ( rem != 0 );
2013 }
2014 shift32RightJamming( zSig, 1, &zSig );
2015 roundAndPack:
2016 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2017
2018}
2019
2020/*----------------------------------------------------------------------------
2021| Returns 1 if the single-precision floating-point value `a' is equal to
2022| the corresponding value `b', and 0 otherwise. The comparison is performed
2023| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2024*----------------------------------------------------------------------------*/
2025
2026flag float32_eq( float32 a, float32 b STATUS_PARAM )
2027{
2028
2029 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2030 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2031 ) {
2032 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2033 float_raise( float_flag_invalid STATUS_VAR);
2034 }
2035 return 0;
2036 }
2037 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2038
2039}
2040
2041/*----------------------------------------------------------------------------
2042| Returns 1 if the single-precision floating-point value `a' is less than
2043| or equal to the corresponding value `b', and 0 otherwise. The comparison
2044| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2045| Arithmetic.
2046*----------------------------------------------------------------------------*/
2047
2048flag float32_le( float32 a, float32 b STATUS_PARAM )
2049{
2050 flag aSign, bSign;
2051
2052 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2053 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2054 ) {
2055 float_raise( float_flag_invalid STATUS_VAR);
2056 return 0;
2057 }
2058 aSign = extractFloat32Sign( a );
2059 bSign = extractFloat32Sign( b );
2060 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2061 return ( a == b ) || ( aSign ^ ( a < b ) );
2062
2063}
2064
2065/*----------------------------------------------------------------------------
2066| Returns 1 if the single-precision floating-point value `a' is less than
2067| the corresponding value `b', and 0 otherwise. The comparison is performed
2068| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2069*----------------------------------------------------------------------------*/
2070
2071flag float32_lt( float32 a, float32 b STATUS_PARAM )
2072{
2073 flag aSign, bSign;
2074
2075 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2076 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2077 ) {
2078 float_raise( float_flag_invalid STATUS_VAR);
2079 return 0;
2080 }
2081 aSign = extractFloat32Sign( a );
2082 bSign = extractFloat32Sign( b );
2083 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2084 return ( a != b ) && ( aSign ^ ( a < b ) );
2085
2086}
2087
2088/*----------------------------------------------------------------------------
2089| Returns 1 if the single-precision floating-point value `a' is equal to
2090| the corresponding value `b', and 0 otherwise. The invalid exception is
2091| raised if either operand is a NaN. Otherwise, the comparison is performed
2092| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2093*----------------------------------------------------------------------------*/
2094
2095flag float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2096{
2097
2098 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2099 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2100 ) {
2101 float_raise( float_flag_invalid STATUS_VAR);
2102 return 0;
2103 }
2104 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2105
2106}
2107
2108/*----------------------------------------------------------------------------
2109| Returns 1 if the single-precision floating-point value `a' is less than or
2110| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2111| cause an exception. Otherwise, the comparison is performed according to the
2112| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2113*----------------------------------------------------------------------------*/
2114
2115flag float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2116{
2117 flag aSign, bSign;
2118
2119 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2120 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2121 ) {
2122 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2123 float_raise( float_flag_invalid STATUS_VAR);
2124 }
2125 return 0;
2126 }
2127 aSign = extractFloat32Sign( a );
2128 bSign = extractFloat32Sign( b );
2129 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2130 return ( a == b ) || ( aSign ^ ( a < b ) );
2131
2132}
2133
2134/*----------------------------------------------------------------------------
2135| Returns 1 if the single-precision floating-point value `a' is less than
2136| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2137| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2138| Standard for Binary Floating-Point Arithmetic.
2139*----------------------------------------------------------------------------*/
2140
2141flag float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2142{
2143 flag aSign, bSign;
2144
2145 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147 ) {
2148 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2149 float_raise( float_flag_invalid STATUS_VAR);
2150 }
2151 return 0;
2152 }
2153 aSign = extractFloat32Sign( a );
2154 bSign = extractFloat32Sign( b );
2155 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2156 return ( a != b ) && ( aSign ^ ( a < b ) );
2157
2158}
2159
2160/*----------------------------------------------------------------------------
2161| Returns the result of converting the double-precision floating-point value
2162| `a' to the 32-bit two's complement integer format. The conversion is
2163| performed according to the IEC/IEEE Standard for Binary Floating-Point
2164| Arithmetic---which means in particular that the conversion is rounded
2165| according to the current rounding mode. If `a' is a NaN, the largest
2166| positive integer is returned. Otherwise, if the conversion overflows, the
2167| largest integer with the same sign as `a' is returned.
2168*----------------------------------------------------------------------------*/
2169
2170int32 float64_to_int32( float64 a STATUS_PARAM )
2171{
2172 flag aSign;
2173 int16 aExp, shiftCount;
2174 bits64 aSig;
2175
2176 aSig = extractFloat64Frac( a );
2177 aExp = extractFloat64Exp( a );
2178 aSign = extractFloat64Sign( a );
2179 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2180 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2181 shiftCount = 0x42C - aExp;
2182 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2183 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2184
2185}
2186
2187/*----------------------------------------------------------------------------
2188| Returns the result of converting the double-precision floating-point value
2189| `a' to the 32-bit two's complement integer format. The conversion is
2190| performed according to the IEC/IEEE Standard for Binary Floating-Point
2191| Arithmetic, except that the conversion is always rounded toward zero.
2192| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2193| the conversion overflows, the largest integer with the same sign as `a' is
2194| returned.
2195*----------------------------------------------------------------------------*/
2196
2197int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2198{
2199 flag aSign;
2200 int16 aExp, shiftCount;
2201 bits64 aSig, savedASig;
2202 int32 z;
2203
2204 aSig = extractFloat64Frac( a );
2205 aExp = extractFloat64Exp( a );
2206 aSign = extractFloat64Sign( a );
2207 if ( 0x41E < aExp ) {
2208 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2209 goto invalid;
2210 }
2211 else if ( aExp < 0x3FF ) {
2212 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2213 return 0;
2214 }
2215 aSig |= LIT64( 0x0010000000000000 );
2216 shiftCount = 0x433 - aExp;
2217 savedASig = aSig;
2218 aSig >>= shiftCount;
2219 z = aSig;
2220 if ( aSign ) z = - z;
2221 if ( ( z < 0 ) ^ aSign ) {
2222 invalid:
2223 float_raise( float_flag_invalid STATUS_VAR);
2224 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2225 }
2226 if ( ( aSig<<shiftCount ) != savedASig ) {
2227 STATUS(float_exception_flags) |= float_flag_inexact;
2228 }
2229 return z;
2230
2231}
2232
2233/*----------------------------------------------------------------------------
2234| Returns the result of converting the double-precision floating-point value
2235| `a' to the 64-bit two's complement integer format. The conversion is
2236| performed according to the IEC/IEEE Standard for Binary Floating-Point
2237| Arithmetic---which means in particular that the conversion is rounded
2238| according to the current rounding mode. If `a' is a NaN, the largest
2239| positive integer is returned. Otherwise, if the conversion overflows, the
2240| largest integer with the same sign as `a' is returned.
2241*----------------------------------------------------------------------------*/
2242
2243int64 float64_to_int64( float64 a STATUS_PARAM )
2244{
2245 flag aSign;
2246 int16 aExp, shiftCount;
2247 bits64 aSig, aSigExtra;
2248
2249 aSig = extractFloat64Frac( a );
2250 aExp = extractFloat64Exp( a );
2251 aSign = extractFloat64Sign( a );
2252 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2253 shiftCount = 0x433 - aExp;
2254 if ( shiftCount <= 0 ) {
2255 if ( 0x43E < aExp ) {
2256 float_raise( float_flag_invalid STATUS_VAR);
2257 if ( ! aSign
2258 || ( ( aExp == 0x7FF )
2259 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2260 ) {
2261 return LIT64( 0x7FFFFFFFFFFFFFFF );
2262 }
2263 return (sbits64) LIT64( 0x8000000000000000 );
2264 }
2265 aSigExtra = 0;
2266 aSig <<= - shiftCount;
2267 }
2268 else {
2269 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2270 }
2271 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2272
2273}
2274
2275/*----------------------------------------------------------------------------
2276| Returns the result of converting the double-precision floating-point value
2277| `a' to the 64-bit two's complement integer format. The conversion is
2278| performed according to the IEC/IEEE Standard for Binary Floating-Point
2279| Arithmetic, except that the conversion is always rounded toward zero.
2280| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2281| the conversion overflows, the largest integer with the same sign as `a' is
2282| returned.
2283*----------------------------------------------------------------------------*/
2284
2285int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2286{
2287 flag aSign;
2288 int16 aExp, shiftCount;
2289 bits64 aSig;
2290 int64 z;
2291
2292 aSig = extractFloat64Frac( a );
2293 aExp = extractFloat64Exp( a );
2294 aSign = extractFloat64Sign( a );
2295 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2296 shiftCount = aExp - 0x433;
2297 if ( 0 <= shiftCount ) {
2298 if ( 0x43E <= aExp ) {
2299 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2300 float_raise( float_flag_invalid STATUS_VAR);
2301 if ( ! aSign
2302 || ( ( aExp == 0x7FF )
2303 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2304 ) {
2305 return LIT64( 0x7FFFFFFFFFFFFFFF );
2306 }
2307 }
2308 return (sbits64) LIT64( 0x8000000000000000 );
2309 }
2310 z = aSig<<shiftCount;
2311 }
2312 else {
2313 if ( aExp < 0x3FE ) {
2314 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2315 return 0;
2316 }
2317 z = aSig>>( - shiftCount );
2318 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2319 STATUS(float_exception_flags) |= float_flag_inexact;
2320 }
2321 }
2322 if ( aSign ) z = - z;
2323 return z;
2324
2325}
2326
2327/*----------------------------------------------------------------------------
2328| Returns the result of converting the double-precision floating-point value
2329| `a' to the single-precision floating-point format. The conversion is
2330| performed according to the IEC/IEEE Standard for Binary Floating-Point
2331| Arithmetic.
2332*----------------------------------------------------------------------------*/
2333
2334float32 float64_to_float32( float64 a STATUS_PARAM )
2335{
2336 flag aSign;
2337 int16 aExp;
2338 bits64 aSig;
2339 bits32 zSig;
2340
2341 aSig = extractFloat64Frac( a );
2342 aExp = extractFloat64Exp( a );
2343 aSign = extractFloat64Sign( a );
2344 if ( aExp == 0x7FF ) {
2345 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2346 return packFloat32( aSign, 0xFF, 0 );
2347 }
2348 shift64RightJamming( aSig, 22, &aSig );
2349 zSig = aSig;
2350 if ( aExp || zSig ) {
2351 zSig |= 0x40000000;
2352 aExp -= 0x381;
2353 }
2354 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2355
2356}
2357
2358#ifdef FLOATX80
2359
2360/*----------------------------------------------------------------------------
2361| Returns the result of converting the double-precision floating-point value
2362| `a' to the extended double-precision floating-point format. The conversion
2363| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2364| Arithmetic.
2365*----------------------------------------------------------------------------*/
2366
2367floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2368{
2369 flag aSign;
2370 int16 aExp;
2371 bits64 aSig;
2372
2373 aSig = extractFloat64Frac( a );
2374 aExp = extractFloat64Exp( a );
2375 aSign = extractFloat64Sign( a );
2376 if ( aExp == 0x7FF ) {
2377 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2378 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2379 }
2380 if ( aExp == 0 ) {
2381 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2382 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2383 }
2384 return
2385 packFloatx80(
2386 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2387
2388}
2389
2390#endif
2391
2392#ifdef FLOAT128
2393
2394/*----------------------------------------------------------------------------
2395| Returns the result of converting the double-precision floating-point value
2396| `a' to the quadruple-precision floating-point format. The conversion is
2397| performed according to the IEC/IEEE Standard for Binary Floating-Point
2398| Arithmetic.
2399*----------------------------------------------------------------------------*/
2400
2401float128 float64_to_float128( float64 a STATUS_PARAM )
2402{
2403 flag aSign;
2404 int16 aExp;
2405 bits64 aSig, zSig0, zSig1;
2406
2407 aSig = extractFloat64Frac( a );
2408 aExp = extractFloat64Exp( a );
2409 aSign = extractFloat64Sign( a );
2410 if ( aExp == 0x7FF ) {
2411 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2412 return packFloat128( aSign, 0x7FFF, 0, 0 );
2413 }
2414 if ( aExp == 0 ) {
2415 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2416 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2417 --aExp;
2418 }
2419 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2420 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2421
2422}
2423
2424#endif
2425
2426/*----------------------------------------------------------------------------
2427| Rounds the double-precision floating-point value `a' to an integer, and
2428| returns the result as a double-precision floating-point value. The
2429| operation is performed according to the IEC/IEEE Standard for Binary
2430| Floating-Point Arithmetic.
2431*----------------------------------------------------------------------------*/
2432
2433float64 float64_round_to_int( float64 a STATUS_PARAM )
2434{
2435 flag aSign;
2436 int16 aExp;
2437 bits64 lastBitMask, roundBitsMask;
2438 int8 roundingMode;
2439 float64 z;
2440
2441 aExp = extractFloat64Exp( a );
2442 if ( 0x433 <= aExp ) {
2443 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2444 return propagateFloat64NaN( a, a STATUS_VAR );
2445 }
2446 return a;
2447 }
2448 if ( aExp < 0x3FF ) {
2449 if ( (bits64) ( a<<1 ) == 0 ) return a;
2450 STATUS(float_exception_flags) |= float_flag_inexact;
2451 aSign = extractFloat64Sign( a );
2452 switch ( STATUS(float_rounding_mode) ) {
2453 case float_round_nearest_even:
2454 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2455 return packFloat64( aSign, 0x3FF, 0 );
2456 }
2457 break;
2458 case float_round_down:
2459 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2460 case float_round_up:
2461 return
2462 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2463 }
2464 return packFloat64( aSign, 0, 0 );
2465 }
2466 lastBitMask = 1;
2467 lastBitMask <<= 0x433 - aExp;
2468 roundBitsMask = lastBitMask - 1;
2469 z = a;
2470 roundingMode = STATUS(float_rounding_mode);
2471 if ( roundingMode == float_round_nearest_even ) {
2472 z += lastBitMask>>1;
2473 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2474 }
2475 else if ( roundingMode != float_round_to_zero ) {
2476 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2477 z += roundBitsMask;
2478 }
2479 }
2480 z &= ~ roundBitsMask;
2481 if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
2482 return z;
2483
2484}
2485
2486/*----------------------------------------------------------------------------
2487| Returns the result of adding the absolute values of the double-precision
2488| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2489| before being returned. `zSign' is ignored if the result is a NaN.
2490| The addition is performed according to the IEC/IEEE Standard for Binary
2491| Floating-Point Arithmetic.
2492*----------------------------------------------------------------------------*/
2493
2494static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2495{
2496 int16 aExp, bExp, zExp;
2497 bits64 aSig, bSig, zSig;
2498 int16 expDiff;
2499
2500 aSig = extractFloat64Frac( a );
2501 aExp = extractFloat64Exp( a );
2502 bSig = extractFloat64Frac( b );
2503 bExp = extractFloat64Exp( b );
2504 expDiff = aExp - bExp;
2505 aSig <<= 9;
2506 bSig <<= 9;
2507 if ( 0 < expDiff ) {
2508 if ( aExp == 0x7FF ) {
2509 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2510 return a;
2511 }
2512 if ( bExp == 0 ) {
2513 --expDiff;
2514 }
2515 else {
2516 bSig |= LIT64( 0x2000000000000000 );
2517 }
2518 shift64RightJamming( bSig, expDiff, &bSig );
2519 zExp = aExp;
2520 }
2521 else if ( expDiff < 0 ) {
2522 if ( bExp == 0x7FF ) {
2523 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2524 return packFloat64( zSign, 0x7FF, 0 );
2525 }
2526 if ( aExp == 0 ) {
2527 ++expDiff;
2528 }
2529 else {
2530 aSig |= LIT64( 0x2000000000000000 );
2531 }
2532 shift64RightJamming( aSig, - expDiff, &aSig );
2533 zExp = bExp;
2534 }
2535 else {
2536 if ( aExp == 0x7FF ) {
2537 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2538 return a;
2539 }
2540 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2541 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2542 zExp = aExp;
2543 goto roundAndPack;
2544 }
2545 aSig |= LIT64( 0x2000000000000000 );
2546 zSig = ( aSig + bSig )<<1;
2547 --zExp;
2548 if ( (sbits64) zSig < 0 ) {
2549 zSig = aSig + bSig;
2550 ++zExp;
2551 }
2552 roundAndPack:
2553 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2554
2555}
2556
2557/*----------------------------------------------------------------------------
2558| Returns the result of subtracting the absolute values of the double-
2559| precision floating-point values `a' and `b'. If `zSign' is 1, the
2560| difference is negated before being returned. `zSign' is ignored if the
2561| result is a NaN. The subtraction is performed according to the IEC/IEEE
2562| Standard for Binary Floating-Point Arithmetic.
2563*----------------------------------------------------------------------------*/
2564
2565static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2566{
2567 int16 aExp, bExp, zExp;
2568 bits64 aSig, bSig, zSig;
2569 int16 expDiff;
2570
2571 aSig = extractFloat64Frac( a );
2572 aExp = extractFloat64Exp( a );
2573 bSig = extractFloat64Frac( b );
2574 bExp = extractFloat64Exp( b );
2575 expDiff = aExp - bExp;
2576 aSig <<= 10;
2577 bSig <<= 10;
2578 if ( 0 < expDiff ) goto aExpBigger;
2579 if ( expDiff < 0 ) goto bExpBigger;
2580 if ( aExp == 0x7FF ) {
2581 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2582 float_raise( float_flag_invalid STATUS_VAR);
2583 return float64_default_nan;
2584 }
2585 if ( aExp == 0 ) {
2586 aExp = 1;
2587 bExp = 1;
2588 }
2589 if ( bSig < aSig ) goto aBigger;
2590 if ( aSig < bSig ) goto bBigger;
2591 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2592 bExpBigger:
2593 if ( bExp == 0x7FF ) {
2594 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2595 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2596 }
2597 if ( aExp == 0 ) {
2598 ++expDiff;
2599 }
2600 else {
2601 aSig |= LIT64( 0x4000000000000000 );
2602 }
2603 shift64RightJamming( aSig, - expDiff, &aSig );
2604 bSig |= LIT64( 0x4000000000000000 );
2605 bBigger:
2606 zSig = bSig - aSig;
2607 zExp = bExp;
2608 zSign ^= 1;
2609 goto normalizeRoundAndPack;
2610 aExpBigger:
2611 if ( aExp == 0x7FF ) {
2612 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2613 return a;
2614 }
2615 if ( bExp == 0 ) {
2616 --expDiff;
2617 }
2618 else {
2619 bSig |= LIT64( 0x4000000000000000 );
2620 }
2621 shift64RightJamming( bSig, expDiff, &bSig );
2622 aSig |= LIT64( 0x4000000000000000 );
2623 aBigger:
2624 zSig = aSig - bSig;
2625 zExp = aExp;
2626 normalizeRoundAndPack:
2627 --zExp;
2628 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2629
2630}
2631
2632/*----------------------------------------------------------------------------
2633| Returns the result of adding the double-precision floating-point values `a'
2634| and `b'. The operation is performed according to the IEC/IEEE Standard for
2635| Binary Floating-Point Arithmetic.
2636*----------------------------------------------------------------------------*/
2637
2638float64 float64_add( float64 a, float64 b STATUS_PARAM )
2639{
2640 flag aSign, bSign;
2641
2642 aSign = extractFloat64Sign( a );
2643 bSign = extractFloat64Sign( b );
2644 if ( aSign == bSign ) {
2645 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2646 }
2647 else {
2648 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2649 }
2650
2651}
2652
2653/*----------------------------------------------------------------------------
2654| Returns the result of subtracting the double-precision floating-point values
2655| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2656| for Binary Floating-Point Arithmetic.
2657*----------------------------------------------------------------------------*/
2658
2659float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2660{
2661 flag aSign, bSign;
2662
2663 aSign = extractFloat64Sign( a );
2664 bSign = extractFloat64Sign( b );
2665 if ( aSign == bSign ) {
2666 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2667 }
2668 else {
2669 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2670 }
2671
2672}
2673
2674/*----------------------------------------------------------------------------
2675| Returns the result of multiplying the double-precision floating-point values
2676| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2677| for Binary Floating-Point Arithmetic.
2678*----------------------------------------------------------------------------*/
2679
2680float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2681{
2682 flag aSign, bSign, zSign;
2683 int16 aExp, bExp, zExp;
2684 bits64 aSig, bSig, zSig0, zSig1;
2685
2686 aSig = extractFloat64Frac( a );
2687 aExp = extractFloat64Exp( a );
2688 aSign = extractFloat64Sign( a );
2689 bSig = extractFloat64Frac( b );
2690 bExp = extractFloat64Exp( b );
2691 bSign = extractFloat64Sign( b );
2692 zSign = aSign ^ bSign;
2693 if ( aExp == 0x7FF ) {
2694 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2695 return propagateFloat64NaN( a, b STATUS_VAR );
2696 }
2697 if ( ( bExp | bSig ) == 0 ) {
2698 float_raise( float_flag_invalid STATUS_VAR);
2699 return float64_default_nan;
2700 }
2701 return packFloat64( zSign, 0x7FF, 0 );
2702 }
2703 if ( bExp == 0x7FF ) {
2704 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2705 if ( ( aExp | aSig ) == 0 ) {
2706 float_raise( float_flag_invalid STATUS_VAR);
2707 return float64_default_nan;
2708 }
2709 return packFloat64( zSign, 0x7FF, 0 );
2710 }
2711 if ( aExp == 0 ) {
2712 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2713 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2714 }
2715 if ( bExp == 0 ) {
2716 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2717 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2718 }
2719 zExp = aExp + bExp - 0x3FF;
2720 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2721 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2722 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2723 zSig0 |= ( zSig1 != 0 );
2724 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2725 zSig0 <<= 1;
2726 --zExp;
2727 }
2728 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2729
2730}
2731
2732/*----------------------------------------------------------------------------
2733| Returns the result of dividing the double-precision floating-point value `a'
2734| by the corresponding value `b'. The operation is performed according to
2735| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2736*----------------------------------------------------------------------------*/
2737
2738float64 float64_div( float64 a, float64 b STATUS_PARAM )
2739{
2740 flag aSign, bSign, zSign;
2741 int16 aExp, bExp, zExp;
2742 bits64 aSig, bSig, zSig;
2743 bits64 rem0, rem1;
2744 bits64 term0, term1;
2745
2746 aSig = extractFloat64Frac( a );
2747 aExp = extractFloat64Exp( a );
2748 aSign = extractFloat64Sign( a );
2749 bSig = extractFloat64Frac( b );
2750 bExp = extractFloat64Exp( b );
2751 bSign = extractFloat64Sign( b );
2752 zSign = aSign ^ bSign;
2753 if ( aExp == 0x7FF ) {
2754 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2755 if ( bExp == 0x7FF ) {
2756 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2757 float_raise( float_flag_invalid STATUS_VAR);
2758 return float64_default_nan;
2759 }
2760 return packFloat64( zSign, 0x7FF, 0 );
2761 }
2762 if ( bExp == 0x7FF ) {
2763 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2764 return packFloat64( zSign, 0, 0 );
2765 }
2766 if ( bExp == 0 ) {
2767 if ( bSig == 0 ) {
2768 if ( ( aExp | aSig ) == 0 ) {
2769 float_raise( float_flag_invalid STATUS_VAR);
2770 return float64_default_nan;
2771 }
2772 float_raise( float_flag_divbyzero STATUS_VAR);
2773 return packFloat64( zSign, 0x7FF, 0 );
2774 }
2775 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2776 }
2777 if ( aExp == 0 ) {
2778 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2779 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2780 }
2781 zExp = aExp - bExp + 0x3FD;
2782 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2783 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2784 if ( bSig <= ( aSig + aSig ) ) {
2785 aSig >>= 1;
2786 ++zExp;
2787 }
2788 zSig = estimateDiv128To64( aSig, 0, bSig );
2789 if ( ( zSig & 0x1FF ) <= 2 ) {
2790 mul64To128( bSig, zSig, &term0, &term1 );
2791 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2792 while ( (sbits64) rem0 < 0 ) {
2793 --zSig;
2794 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2795 }
2796 zSig |= ( rem1 != 0 );
2797 }
2798 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2799
2800}
2801
2802/*----------------------------------------------------------------------------
2803| Returns the remainder of the double-precision floating-point value `a'
2804| with respect to the corresponding value `b'. The operation is performed
2805| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2806*----------------------------------------------------------------------------*/
2807
2808float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2809{
2810 flag aSign, bSign, zSign;
2811 int16 aExp, bExp, expDiff;
2812 bits64 aSig, bSig;
2813 bits64 q, alternateASig;
2814 sbits64 sigMean;
2815
2816 aSig = extractFloat64Frac( a );
2817 aExp = extractFloat64Exp( a );
2818 aSign = extractFloat64Sign( a );
2819 bSig = extractFloat64Frac( b );
2820 bExp = extractFloat64Exp( b );
2821 bSign = extractFloat64Sign( b );
2822 if ( aExp == 0x7FF ) {
2823 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2824 return propagateFloat64NaN( a, b STATUS_VAR );
2825 }
2826 float_raise( float_flag_invalid STATUS_VAR);
2827 return float64_default_nan;
2828 }
2829 if ( bExp == 0x7FF ) {
2830 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2831 return a;
2832 }
2833 if ( bExp == 0 ) {
2834 if ( bSig == 0 ) {
2835 float_raise( float_flag_invalid STATUS_VAR);
2836 return float64_default_nan;
2837 }
2838 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2839 }
2840 if ( aExp == 0 ) {
2841 if ( aSig == 0 ) return a;
2842 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2843 }
2844 expDiff = aExp - bExp;
2845 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2846 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2847 if ( expDiff < 0 ) {
2848 if ( expDiff < -1 ) return a;
2849 aSig >>= 1;
2850 }
2851 q = ( bSig <= aSig );
2852 if ( q ) aSig -= bSig;
2853 expDiff -= 64;
2854 while ( 0 < expDiff ) {
2855 q = estimateDiv128To64( aSig, 0, bSig );
2856 q = ( 2 < q ) ? q - 2 : 0;
2857 aSig = - ( ( bSig>>2 ) * q );
2858 expDiff -= 62;
2859 }
2860 expDiff += 64;
2861 if ( 0 < expDiff ) {
2862 q = estimateDiv128To64( aSig, 0, bSig );
2863 q = ( 2 < q ) ? q - 2 : 0;
2864 q >>= 64 - expDiff;
2865 bSig >>= 2;
2866 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2867 }
2868 else {
2869 aSig >>= 2;
2870 bSig >>= 2;
2871 }
2872 do {
2873 alternateASig = aSig;
2874 ++q;
2875 aSig -= bSig;
2876 } while ( 0 <= (sbits64) aSig );
2877 sigMean = aSig + alternateASig;
2878 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2879 aSig = alternateASig;
2880 }
2881 zSign = ( (sbits64) aSig < 0 );
2882 if ( zSign ) aSig = - aSig;
2883 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2884
2885}
2886
2887/*----------------------------------------------------------------------------
2888| Returns the square root of the double-precision floating-point value `a'.
2889| The operation is performed according to the IEC/IEEE Standard for Binary
2890| Floating-Point Arithmetic.
2891*----------------------------------------------------------------------------*/
2892
2893float64 float64_sqrt( float64 a STATUS_PARAM )
2894{
2895 flag aSign;
2896 int16 aExp, zExp;
2897 bits64 aSig, zSig, doubleZSig;
2898 bits64 rem0, rem1, term0, term1;
2899
2900 aSig = extractFloat64Frac( a );
2901 aExp = extractFloat64Exp( a );
2902 aSign = extractFloat64Sign( a );
2903 if ( aExp == 0x7FF ) {
2904 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2905 if ( ! aSign ) return a;
2906 float_raise( float_flag_invalid STATUS_VAR);
2907 return float64_default_nan;
2908 }
2909 if ( aSign ) {
2910 if ( ( aExp | aSig ) == 0 ) return a;
2911 float_raise( float_flag_invalid STATUS_VAR);
2912 return float64_default_nan;
2913 }
2914 if ( aExp == 0 ) {
2915 if ( aSig == 0 ) return 0;
2916 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2917 }
2918 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2919 aSig |= LIT64( 0x0010000000000000 );
2920 zSig = estimateSqrt32( aExp, aSig>>21 );
2921 aSig <<= 9 - ( aExp & 1 );
2922 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2923 if ( ( zSig & 0x1FF ) <= 5 ) {
2924 doubleZSig = zSig<<1;
2925 mul64To128( zSig, zSig, &term0, &term1 );
2926 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2927 while ( (sbits64) rem0 < 0 ) {
2928 --zSig;
2929 doubleZSig -= 2;
2930 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2931 }
2932 zSig |= ( ( rem0 | rem1 ) != 0 );
2933 }
2934 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2935
2936}
2937
2938/*----------------------------------------------------------------------------
2939| Returns 1 if the double-precision floating-point value `a' is equal to the
2940| corresponding value `b', and 0 otherwise. The comparison is performed
2941| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2942*----------------------------------------------------------------------------*/
2943
2944flag float64_eq( float64 a, float64 b STATUS_PARAM )
2945{
2946
2947 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2948 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2949 ) {
2950 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2951 float_raise( float_flag_invalid STATUS_VAR);
2952 }
2953 return 0;
2954 }
2955 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2956
2957}
2958
2959/*----------------------------------------------------------------------------
2960| Returns 1 if the double-precision floating-point value `a' is less than or
2961| equal to the corresponding value `b', and 0 otherwise. The comparison is
2962| performed according to the IEC/IEEE Standard for Binary Floating-Point
2963| Arithmetic.
2964*----------------------------------------------------------------------------*/
2965
2966flag float64_le( float64 a, float64 b STATUS_PARAM )
2967{
2968 flag aSign, bSign;
2969
2970 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2971 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2972 ) {
2973 float_raise( float_flag_invalid STATUS_VAR);
2974 return 0;
2975 }
2976 aSign = extractFloat64Sign( a );
2977 bSign = extractFloat64Sign( b );
2978 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2979 return ( a == b ) || ( aSign ^ ( a < b ) );
2980
2981}
2982
2983/*----------------------------------------------------------------------------
2984| Returns 1 if the double-precision floating-point value `a' is less than
2985| the corresponding value `b', and 0 otherwise. The comparison is performed
2986| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2987*----------------------------------------------------------------------------*/
2988
2989flag float64_lt( float64 a, float64 b STATUS_PARAM )
2990{
2991 flag aSign, bSign;
2992
2993 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2994 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2995 ) {
2996 float_raise( float_flag_invalid STATUS_VAR);
2997 return 0;
2998 }
2999 aSign = extractFloat64Sign( a );
3000 bSign = extractFloat64Sign( b );
3001 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3002 return ( a != b ) && ( aSign ^ ( a < b ) );
3003
3004}
3005
3006/*----------------------------------------------------------------------------
3007| Returns 1 if the double-precision floating-point value `a' is equal to the
3008| corresponding value `b', and 0 otherwise. The invalid exception is raised
3009| if either operand is a NaN. Otherwise, the comparison is performed
3010| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011*----------------------------------------------------------------------------*/
3012
3013flag float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3014{
3015
3016 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3017 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3018 ) {
3019 float_raise( float_flag_invalid STATUS_VAR);
3020 return 0;
3021 }
3022 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3023
3024}
3025
3026/*----------------------------------------------------------------------------
3027| Returns 1 if the double-precision floating-point value `a' is less than or
3028| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3029| cause an exception. Otherwise, the comparison is performed according to the
3030| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3031*----------------------------------------------------------------------------*/
3032
3033flag float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3034{
3035 flag aSign, bSign;
3036
3037 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3038 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3039 ) {
3040 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3041 float_raise( float_flag_invalid STATUS_VAR);
3042 }
3043 return 0;
3044 }
3045 aSign = extractFloat64Sign( a );
3046 bSign = extractFloat64Sign( b );
3047 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3048 return ( a == b ) || ( aSign ^ ( a < b ) );
3049
3050}
3051
3052/*----------------------------------------------------------------------------
3053| Returns 1 if the double-precision floating-point value `a' is less than
3054| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3055| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3056| Standard for Binary Floating-Point Arithmetic.
3057*----------------------------------------------------------------------------*/
3058
3059flag float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3060{
3061 flag aSign, bSign;
3062
3063 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3064 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3065 ) {
3066 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3067 float_raise( float_flag_invalid STATUS_VAR);
3068 }
3069 return 0;
3070 }
3071 aSign = extractFloat64Sign( a );
3072 bSign = extractFloat64Sign( b );
3073 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3074 return ( a != b ) && ( aSign ^ ( a < b ) );
3075
3076}
3077
3078#ifdef FLOATX80
3079
3080/*----------------------------------------------------------------------------
3081| Returns the result of converting the extended double-precision floating-
3082| point value `a' to the 32-bit two's complement integer format. The
3083| conversion is performed according to the IEC/IEEE Standard for Binary
3084| Floating-Point Arithmetic---which means in particular that the conversion
3085| is rounded according to the current rounding mode. If `a' is a NaN, the
3086| largest positive integer is returned. Otherwise, if the conversion
3087| overflows, the largest integer with the same sign as `a' is returned.
3088*----------------------------------------------------------------------------*/
3089
3090int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3091{
3092 flag aSign;
3093 int32 aExp, shiftCount;
3094 bits64 aSig;
3095
3096 aSig = extractFloatx80Frac( a );
3097 aExp = extractFloatx80Exp( a );
3098 aSign = extractFloatx80Sign( a );
3099 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3100 shiftCount = 0x4037 - aExp;
3101 if ( shiftCount <= 0 ) shiftCount = 1;
3102 shift64RightJamming( aSig, shiftCount, &aSig );
3103 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3104
3105}
3106
3107/*----------------------------------------------------------------------------
3108| Returns the result of converting the extended double-precision floating-
3109| point value `a' to the 32-bit two's complement integer format. The
3110| conversion is performed according to the IEC/IEEE Standard for Binary
3111| Floating-Point Arithmetic, except that the conversion is always rounded
3112| toward zero. If `a' is a NaN, the largest positive integer is returned.
3113| Otherwise, if the conversion overflows, the largest integer with the same
3114| sign as `a' is returned.
3115*----------------------------------------------------------------------------*/
3116
3117int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3118{
3119 flag aSign;
3120 int32 aExp, shiftCount;
3121 bits64 aSig, savedASig;
3122 int32 z;
3123
3124 aSig = extractFloatx80Frac( a );
3125 aExp = extractFloatx80Exp( a );
3126 aSign = extractFloatx80Sign( a );
3127 if ( 0x401E < aExp ) {
3128 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3129 goto invalid;
3130 }
3131 else if ( aExp < 0x3FFF ) {
3132 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3133 return 0;
3134 }
3135 shiftCount = 0x403E - aExp;
3136 savedASig = aSig;
3137 aSig >>= shiftCount;
3138 z = aSig;
3139 if ( aSign ) z = - z;
3140 if ( ( z < 0 ) ^ aSign ) {
3141 invalid:
3142 float_raise( float_flag_invalid STATUS_VAR);
3143 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3144 }
3145 if ( ( aSig<<shiftCount ) != savedASig ) {
3146 STATUS(float_exception_flags) |= float_flag_inexact;
3147 }
3148 return z;
3149
3150}
3151
3152/*----------------------------------------------------------------------------
3153| Returns the result of converting the extended double-precision floating-
3154| point value `a' to the 64-bit two's complement integer format. The
3155| conversion is performed according to the IEC/IEEE Standard for Binary
3156| Floating-Point Arithmetic---which means in particular that the conversion
3157| is rounded according to the current rounding mode. If `a' is a NaN,
3158| the largest positive integer is returned. Otherwise, if the conversion
3159| overflows, the largest integer with the same sign as `a' is returned.
3160*----------------------------------------------------------------------------*/
3161
3162int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3163{
3164 flag aSign;
3165 int32 aExp, shiftCount;
3166 bits64 aSig, aSigExtra;
3167
3168 aSig = extractFloatx80Frac( a );
3169 aExp = extractFloatx80Exp( a );
3170 aSign = extractFloatx80Sign( a );
3171 shiftCount = 0x403E - aExp;
3172 if ( shiftCount <= 0 ) {
3173 if ( shiftCount ) {
3174 float_raise( float_flag_invalid STATUS_VAR);
3175 if ( ! aSign
3176 || ( ( aExp == 0x7FFF )
3177 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3178 ) {
3179 return LIT64( 0x7FFFFFFFFFFFFFFF );
3180 }
3181 return (sbits64) LIT64( 0x8000000000000000 );
3182 }
3183 aSigExtra = 0;
3184 }
3185 else {
3186 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3187 }
3188 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3189
3190}
3191
3192/*----------------------------------------------------------------------------
3193| Returns the result of converting the extended double-precision floating-
3194| point value `a' to the 64-bit two's complement integer format. The
3195| conversion is performed according to the IEC/IEEE Standard for Binary
3196| Floating-Point Arithmetic, except that the conversion is always rounded
3197| toward zero. If `a' is a NaN, the largest positive integer is returned.
3198| Otherwise, if the conversion overflows, the largest integer with the same
3199| sign as `a' is returned.
3200*----------------------------------------------------------------------------*/
3201
3202int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3203{
3204 flag aSign;
3205 int32 aExp, shiftCount;
3206 bits64 aSig;
3207 int64 z;
3208
3209 aSig = extractFloatx80Frac( a );
3210 aExp = extractFloatx80Exp( a );
3211 aSign = extractFloatx80Sign( a );
3212 shiftCount = aExp - 0x403E;
3213 if ( 0 <= shiftCount ) {
3214 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3215 if ( ( a.high != 0xC03E ) || aSig ) {
3216 float_raise( float_flag_invalid STATUS_VAR);
3217 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3218 return LIT64( 0x7FFFFFFFFFFFFFFF );
3219 }
3220 }
3221 return (sbits64) LIT64( 0x8000000000000000 );
3222 }
3223 else if ( aExp < 0x3FFF ) {
3224 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3225 return 0;
3226 }
3227 z = aSig>>( - shiftCount );
3228 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3229 STATUS(float_exception_flags) |= float_flag_inexact;
3230 }
3231 if ( aSign ) z = - z;
3232 return z;
3233
3234}
3235
3236/*----------------------------------------------------------------------------
3237| Returns the result of converting the extended double-precision floating-
3238| point value `a' to the single-precision floating-point format. The
3239| conversion is performed according to the IEC/IEEE Standard for Binary
3240| Floating-Point Arithmetic.
3241*----------------------------------------------------------------------------*/
3242
3243float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3244{
3245 flag aSign;
3246 int32 aExp;
3247 bits64 aSig;
3248
3249 aSig = extractFloatx80Frac( a );
3250 aExp = extractFloatx80Exp( a );
3251 aSign = extractFloatx80Sign( a );
3252 if ( aExp == 0x7FFF ) {
3253 if ( (bits64) ( aSig<<1 ) ) {
3254 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3255 }
3256 return packFloat32( aSign, 0xFF, 0 );
3257 }
3258 shift64RightJamming( aSig, 33, &aSig );
3259 if ( aExp || aSig ) aExp -= 0x3F81;
3260 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3261
3262}
3263
3264/*----------------------------------------------------------------------------
3265| Returns the result of converting the extended double-precision floating-
3266| point value `a' to the double-precision floating-point format. The
3267| conversion is performed according to the IEC/IEEE Standard for Binary
3268| Floating-Point Arithmetic.
3269*----------------------------------------------------------------------------*/
3270
3271float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3272{
3273 flag aSign;
3274 int32 aExp;
3275 bits64 aSig, zSig;
3276
3277 aSig = extractFloatx80Frac( a );
3278 aExp = extractFloatx80Exp( a );
3279 aSign = extractFloatx80Sign( a );
3280 if ( aExp == 0x7FFF ) {
3281 if ( (bits64) ( aSig<<1 ) ) {
3282 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3283 }
3284 return packFloat64( aSign, 0x7FF, 0 );
3285 }
3286 shift64RightJamming( aSig, 1, &zSig );
3287 if ( aExp || aSig ) aExp -= 0x3C01;
3288 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3289
3290}
3291
3292#ifdef FLOAT128
3293
3294/*----------------------------------------------------------------------------
3295| Returns the result of converting the extended double-precision floating-
3296| point value `a' to the quadruple-precision floating-point format. The
3297| conversion is performed according to the IEC/IEEE Standard for Binary
3298| Floating-Point Arithmetic.
3299*----------------------------------------------------------------------------*/
3300
3301float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3302{
3303 flag aSign;
3304 int16 aExp;
3305 bits64 aSig, zSig0, zSig1;
3306
3307 aSig = extractFloatx80Frac( a );
3308 aExp = extractFloatx80Exp( a );
3309 aSign = extractFloatx80Sign( a );
3310 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3311 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3312 }
3313 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3314 return packFloat128( aSign, aExp, zSig0, zSig1 );
3315
3316}
3317
3318#endif
3319
3320/*----------------------------------------------------------------------------
3321| Rounds the extended double-precision floating-point value `a' to an integer,
3322| and returns the result as an extended quadruple-precision floating-point
3323| value. The operation is performed according to the IEC/IEEE Standard for
3324| Binary Floating-Point Arithmetic.
3325*----------------------------------------------------------------------------*/
3326
3327floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3328{
3329 flag aSign;
3330 int32 aExp;
3331 bits64 lastBitMask, roundBitsMask;
3332 int8 roundingMode;
3333 floatx80 z;
3334
3335 aExp = extractFloatx80Exp( a );
3336 if ( 0x403E <= aExp ) {
3337 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3338 return propagateFloatx80NaN( a, a STATUS_VAR );
3339 }
3340 return a;
3341 }
3342 if ( aExp < 0x3FFF ) {
3343 if ( ( aExp == 0 )
3344 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3345 return a;
3346 }
3347 STATUS(float_exception_flags) |= float_flag_inexact;
3348 aSign = extractFloatx80Sign( a );
3349 switch ( STATUS(float_rounding_mode) ) {
3350 case float_round_nearest_even:
3351 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3352 ) {
3353 return
3354 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3355 }
3356 break;
3357 case float_round_down:
3358 return
3359 aSign ?
3360 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3361 : packFloatx80( 0, 0, 0 );
3362 case float_round_up:
3363 return
3364 aSign ? packFloatx80( 1, 0, 0 )
3365 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3366 }
3367 return packFloatx80( aSign, 0, 0 );
3368 }
3369 lastBitMask = 1;
3370 lastBitMask <<= 0x403E - aExp;
3371 roundBitsMask = lastBitMask - 1;
3372 z = a;
3373 roundingMode = STATUS(float_rounding_mode);
3374 if ( roundingMode == float_round_nearest_even ) {
3375 z.low += lastBitMask>>1;
3376 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3377 }
3378 else if ( roundingMode != float_round_to_zero ) {
3379 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3380 z.low += roundBitsMask;
3381 }
3382 }
3383 z.low &= ~ roundBitsMask;
3384 if ( z.low == 0 ) {
3385 ++z.high;
3386 z.low = LIT64( 0x8000000000000000 );
3387 }
3388 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3389 return z;
3390
3391}
3392
3393/*----------------------------------------------------------------------------
3394| Returns the result of adding the absolute values of the extended double-
3395| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3396| negated before being returned. `zSign' is ignored if the result is a NaN.
3397| The addition is performed according to the IEC/IEEE Standard for Binary
3398| Floating-Point Arithmetic.
3399*----------------------------------------------------------------------------*/
3400
3401static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3402{
3403 int32 aExp, bExp, zExp;
3404 bits64 aSig, bSig, zSig0, zSig1;
3405 int32 expDiff;
3406
3407 aSig = extractFloatx80Frac( a );
3408 aExp = extractFloatx80Exp( a );
3409 bSig = extractFloatx80Frac( b );
3410 bExp = extractFloatx80Exp( b );
3411 expDiff = aExp - bExp;
3412 if ( 0 < expDiff ) {
3413 if ( aExp == 0x7FFF ) {
3414 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3415 return a;
3416 }
3417 if ( bExp == 0 ) --expDiff;
3418 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3419 zExp = aExp;
3420 }
3421 else if ( expDiff < 0 ) {
3422 if ( bExp == 0x7FFF ) {
3423 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3424 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3425 }
3426 if ( aExp == 0 ) ++expDiff;
3427 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3428 zExp = bExp;
3429 }
3430 else {
3431 if ( aExp == 0x7FFF ) {
3432 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3433 return propagateFloatx80NaN( a, b STATUS_VAR );
3434 }
3435 return a;
3436 }
3437 zSig1 = 0;
3438 zSig0 = aSig + bSig;
3439 if ( aExp == 0 ) {
3440 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3441 goto roundAndPack;
3442 }
3443 zExp = aExp;
3444 goto shiftRight1;
3445 }
3446 zSig0 = aSig + bSig;
3447 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3448 shiftRight1:
3449 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3450 zSig0 |= LIT64( 0x8000000000000000 );
3451 ++zExp;
3452 roundAndPack:
3453 return
3454 roundAndPackFloatx80(
3455 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3456
3457}
3458
3459/*----------------------------------------------------------------------------
3460| Returns the result of subtracting the absolute values of the extended
3461| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3462| difference is negated before being returned. `zSign' is ignored if the
3463| result is a NaN. The subtraction is performed according to the IEC/IEEE
3464| Standard for Binary Floating-Point Arithmetic.
3465*----------------------------------------------------------------------------*/
3466
3467static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3468{
3469 int32 aExp, bExp, zExp;
3470 bits64 aSig, bSig, zSig0, zSig1;
3471 int32 expDiff;
3472 floatx80 z;
3473
3474 aSig = extractFloatx80Frac( a );
3475 aExp = extractFloatx80Exp( a );
3476 bSig = extractFloatx80Frac( b );
3477 bExp = extractFloatx80Exp( b );
3478 expDiff = aExp - bExp;
3479 if ( 0 < expDiff ) goto aExpBigger;
3480 if ( expDiff < 0 ) goto bExpBigger;
3481 if ( aExp == 0x7FFF ) {
3482 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3483 return propagateFloatx80NaN( a, b STATUS_VAR );
3484 }
3485 float_raise( float_flag_invalid STATUS_VAR);
3486 z.low = floatx80_default_nan_low;
3487 z.high = floatx80_default_nan_high;
3488 return z;
3489 }
3490 if ( aExp == 0 ) {
3491 aExp = 1;
3492 bExp = 1;
3493 }
3494 zSig1 = 0;
3495 if ( bSig < aSig ) goto aBigger;
3496 if ( aSig < bSig ) goto bBigger;
3497 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3498 bExpBigger:
3499 if ( bExp == 0x7FFF ) {
3500 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3501 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3502 }
3503 if ( aExp == 0 ) ++expDiff;
3504 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3505 bBigger:
3506 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3507 zExp = bExp;
3508 zSign ^= 1;
3509 goto normalizeRoundAndPack;
3510 aExpBigger:
3511 if ( aExp == 0x7FFF ) {
3512 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3513 return a;
3514 }
3515 if ( bExp == 0 ) --expDiff;
3516 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3517 aBigger:
3518 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3519 zExp = aExp;
3520 normalizeRoundAndPack:
3521 return
3522 normalizeRoundAndPackFloatx80(
3523 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3524
3525}
3526
3527/*----------------------------------------------------------------------------
3528| Returns the result of adding the extended double-precision floating-point
3529| values `a' and `b'. The operation is performed according to the IEC/IEEE
3530| Standard for Binary Floating-Point Arithmetic.
3531*----------------------------------------------------------------------------*/
3532
3533floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3534{
3535 flag aSign, bSign;
3536
3537 aSign = extractFloatx80Sign( a );
3538 bSign = extractFloatx80Sign( b );
3539 if ( aSign == bSign ) {
3540 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3541 }
3542 else {
3543 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3544 }
3545
3546}
3547
3548/*----------------------------------------------------------------------------
3549| Returns the result of subtracting the extended double-precision floating-
3550| point values `a' and `b'. The operation is performed according to the
3551| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3552*----------------------------------------------------------------------------*/
3553
3554floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3555{
3556 flag aSign, bSign;
3557
3558 aSign = extractFloatx80Sign( a );
3559 bSign = extractFloatx80Sign( b );
3560 if ( aSign == bSign ) {
3561 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3562 }
3563 else {
3564 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3565 }
3566
3567}
3568
3569/*----------------------------------------------------------------------------
3570| Returns the result of multiplying the extended double-precision floating-
3571| point values `a' and `b'. The operation is performed according to the
3572| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3573*----------------------------------------------------------------------------*/
3574
3575floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3576{
3577 flag aSign, bSign, zSign;
3578 int32 aExp, bExp, zExp;
3579 bits64 aSig, bSig, zSig0, zSig1;
3580 floatx80 z;
3581
3582 aSig = extractFloatx80Frac( a );
3583 aExp = extractFloatx80Exp( a );
3584 aSign = extractFloatx80Sign( a );
3585 bSig = extractFloatx80Frac( b );
3586 bExp = extractFloatx80Exp( b );
3587 bSign = extractFloatx80Sign( b );
3588 zSign = aSign ^ bSign;
3589 if ( aExp == 0x7FFF ) {
3590 if ( (bits64) ( aSig<<1 )
3591 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3592 return propagateFloatx80NaN( a, b STATUS_VAR );
3593 }
3594 if ( ( bExp | bSig ) == 0 ) goto invalid;
3595 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3596 }
3597 if ( bExp == 0x7FFF ) {
3598 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3599 if ( ( aExp | aSig ) == 0 ) {
3600 invalid:
3601 float_raise( float_flag_invalid STATUS_VAR);
3602 z.low = floatx80_default_nan_low;
3603 z.high = floatx80_default_nan_high;
3604 return z;
3605 }
3606 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3607 }
3608 if ( aExp == 0 ) {
3609 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3610 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3611 }
3612 if ( bExp == 0 ) {
3613 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3614 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3615 }
3616 zExp = aExp + bExp - 0x3FFE;
3617 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3618 if ( 0 < (sbits64) zSig0 ) {
3619 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3620 --zExp;
3621 }
3622 return
3623 roundAndPackFloatx80(
3624 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3625
3626}
3627
3628/*----------------------------------------------------------------------------
3629| Returns the result of dividing the extended double-precision floating-point
3630| value `a' by the corresponding value `b'. The operation is performed
3631| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3632*----------------------------------------------------------------------------*/
3633
3634floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3635{
3636 flag aSign, bSign, zSign;
3637 int32 aExp, bExp, zExp;
3638 bits64 aSig, bSig, zSig0, zSig1;
3639 bits64 rem0, rem1, rem2, term0, term1, term2;
3640 floatx80 z;
3641
3642 aSig = extractFloatx80Frac( a );
3643 aExp = extractFloatx80Exp( a );
3644 aSign = extractFloatx80Sign( a );
3645 bSig = extractFloatx80Frac( b );
3646 bExp = extractFloatx80Exp( b );
3647 bSign = extractFloatx80Sign( b );
3648 zSign = aSign ^ bSign;
3649 if ( aExp == 0x7FFF ) {
3650 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3651 if ( bExp == 0x7FFF ) {
3652 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3653 goto invalid;
3654 }
3655 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3656 }
3657 if ( bExp == 0x7FFF ) {
3658 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3659 return packFloatx80( zSign, 0, 0 );
3660 }
3661 if ( bExp == 0 ) {
3662 if ( bSig == 0 ) {
3663 if ( ( aExp | aSig ) == 0 ) {
3664 invalid:
3665 float_raise( float_flag_invalid STATUS_VAR);
3666 z.low = floatx80_default_nan_low;
3667 z.high = floatx80_default_nan_high;
3668 return z;
3669 }
3670 float_raise( float_flag_divbyzero STATUS_VAR);
3671 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3672 }
3673 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3674 }
3675 if ( aExp == 0 ) {
3676 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3677 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3678 }
3679 zExp = aExp - bExp + 0x3FFE;
3680 rem1 = 0;
3681 if ( bSig <= aSig ) {
3682 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3683 ++zExp;
3684 }
3685 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3686 mul64To128( bSig, zSig0, &term0, &term1 );
3687 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3688 while ( (sbits64) rem0 < 0 ) {
3689 --zSig0;
3690 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3691 }
3692 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3693 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3694 mul64To128( bSig, zSig1, &term1, &term2 );
3695 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3696 while ( (sbits64) rem1 < 0 ) {
3697 --zSig1;
3698 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3699 }
3700 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3701 }
3702 return
3703 roundAndPackFloatx80(
3704 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3705
3706}
3707
3708/*----------------------------------------------------------------------------
3709| Returns the remainder of the extended double-precision floating-point value
3710| `a' with respect to the corresponding value `b'. The operation is performed
3711| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3712*----------------------------------------------------------------------------*/
3713
3714floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3715{
3716 flag aSign, bSign, zSign;
3717 int32 aExp, bExp, expDiff;
3718 bits64 aSig0, aSig1, bSig;
3719 bits64 q, term0, term1, alternateASig0, alternateASig1;
3720 floatx80 z;
3721
3722 aSig0 = extractFloatx80Frac( a );
3723 aExp = extractFloatx80Exp( a );
3724 aSign = extractFloatx80Sign( a );
3725 bSig = extractFloatx80Frac( b );
3726 bExp = extractFloatx80Exp( b );
3727 bSign = extractFloatx80Sign( b );
3728 if ( aExp == 0x7FFF ) {
3729 if ( (bits64) ( aSig0<<1 )
3730 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3731 return propagateFloatx80NaN( a, b STATUS_VAR );
3732 }
3733 goto invalid;
3734 }
3735 if ( bExp == 0x7FFF ) {
3736 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3737 return a;
3738 }
3739 if ( bExp == 0 ) {
3740 if ( bSig == 0 ) {
3741 invalid:
3742 float_raise( float_flag_invalid STATUS_VAR);
3743 z.low = floatx80_default_nan_low;
3744 z.high = floatx80_default_nan_high;
3745 return z;
3746 }
3747 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3748 }
3749 if ( aExp == 0 ) {
3750 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3751 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3752 }
3753 bSig |= LIT64( 0x8000000000000000 );
3754 zSign = aSign;
3755 expDiff = aExp - bExp;
3756 aSig1 = 0;
3757 if ( expDiff < 0 ) {
3758 if ( expDiff < -1 ) return a;
3759 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3760 expDiff = 0;
3761 }
3762 q = ( bSig <= aSig0 );
3763 if ( q ) aSig0 -= bSig;
3764 expDiff -= 64;
3765 while ( 0 < expDiff ) {
3766 q = estimateDiv128To64( aSig0, aSig1, bSig );
3767 q = ( 2 < q ) ? q - 2 : 0;
3768 mul64To128( bSig, q, &term0, &term1 );
3769 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3770 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3771 expDiff -= 62;
3772 }
3773 expDiff += 64;
3774 if ( 0 < expDiff ) {
3775 q = estimateDiv128To64( aSig0, aSig1, bSig );
3776 q = ( 2 < q ) ? q - 2 : 0;
3777 q >>= 64 - expDiff;
3778 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3779 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3780 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3781 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3782 ++q;
3783 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3784 }
3785 }
3786 else {
3787 term1 = 0;
3788 term0 = bSig;
3789 }
3790 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3791 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3792 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3793 && ( q & 1 ) )
3794 ) {
3795 aSig0 = alternateASig0;
3796 aSig1 = alternateASig1;
3797 zSign = ! zSign;
3798 }
3799 return
3800 normalizeRoundAndPackFloatx80(
3801 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3802
3803}
3804
3805/*----------------------------------------------------------------------------
3806| Returns the square root of the extended double-precision floating-point
3807| value `a'. The operation is performed according to the IEC/IEEE Standard
3808| for Binary Floating-Point Arithmetic.
3809*----------------------------------------------------------------------------*/
3810
3811floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3812{
3813 flag aSign;
3814 int32 aExp, zExp;
3815 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3816 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3817 floatx80 z;
3818
3819 aSig0 = extractFloatx80Frac( a );
3820 aExp = extractFloatx80Exp( a );
3821 aSign = extractFloatx80Sign( a );
3822 if ( aExp == 0x7FFF ) {
3823 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3824 if ( ! aSign ) return a;
3825 goto invalid;
3826 }
3827 if ( aSign ) {
3828 if ( ( aExp | aSig0 ) == 0 ) return a;
3829 invalid:
3830 float_raise( float_flag_invalid STATUS_VAR);
3831 z.low = floatx80_default_nan_low;
3832 z.high = floatx80_default_nan_high;
3833 return z;
3834 }
3835 if ( aExp == 0 ) {
3836 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3837 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3838 }
3839 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3840 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3841 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3842 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3843 doubleZSig0 = zSig0<<1;
3844 mul64To128( zSig0, zSig0, &term0, &term1 );
3845 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3846 while ( (sbits64) rem0 < 0 ) {
3847 --zSig0;
3848 doubleZSig0 -= 2;
3849 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3850 }
3851 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3852 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3853 if ( zSig1 == 0 ) zSig1 = 1;
3854 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3855 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3856 mul64To128( zSig1, zSig1, &term2, &term3 );
3857 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3858 while ( (sbits64) rem1 < 0 ) {
3859 --zSig1;
3860 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3861 term3 |= 1;
3862 term2 |= doubleZSig0;
3863 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3864 }
3865 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3866 }
3867 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3868 zSig0 |= doubleZSig0;
3869 return
3870 roundAndPackFloatx80(
3871 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3872
3873}
3874
3875/*----------------------------------------------------------------------------
3876| Returns 1 if the extended double-precision floating-point value `a' is
3877| equal to the corresponding value `b', and 0 otherwise. The comparison is
3878| performed according to the IEC/IEEE Standard for Binary Floating-Point
3879| Arithmetic.
3880*----------------------------------------------------------------------------*/
3881
3882flag floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3883{
3884
3885 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3886 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3887 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3888 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3889 ) {
3890 if ( floatx80_is_signaling_nan( a )
3891 || floatx80_is_signaling_nan( b ) ) {
3892 float_raise( float_flag_invalid STATUS_VAR);
3893 }
3894 return 0;
3895 }
3896 return
3897 ( a.low == b.low )
3898 && ( ( a.high == b.high )
3899 || ( ( a.low == 0 )
3900 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3901 );
3902
3903}
3904
3905/*----------------------------------------------------------------------------
3906| Returns 1 if the extended double-precision floating-point value `a' is
3907| less than or equal to the corresponding value `b', and 0 otherwise. The
3908| comparison is performed according to the IEC/IEEE Standard for Binary
3909| Floating-Point Arithmetic.
3910*----------------------------------------------------------------------------*/
3911
3912flag floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3913{
3914 flag aSign, bSign;
3915
3916 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3917 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3918 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3919 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3920 ) {
3921 float_raise( float_flag_invalid STATUS_VAR);
3922 return 0;
3923 }
3924 aSign = extractFloatx80Sign( a );
3925 bSign = extractFloatx80Sign( b );
3926 if ( aSign != bSign ) {
3927 return
3928 aSign
3929 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3930 == 0 );
3931 }
3932 return
3933 aSign ? le128( b.high, b.low, a.high, a.low )
3934 : le128( a.high, a.low, b.high, b.low );
3935
3936}
3937
3938/*----------------------------------------------------------------------------
3939| Returns 1 if the extended double-precision floating-point value `a' is
3940| less than the corresponding value `b', and 0 otherwise. The comparison
3941| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3942| Arithmetic.
3943*----------------------------------------------------------------------------*/
3944
3945flag floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3946{
3947 flag aSign, bSign;
3948
3949 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3950 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3951 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3952 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3953 ) {
3954 float_raise( float_flag_invalid STATUS_VAR);
3955 return 0;
3956 }
3957 aSign = extractFloatx80Sign( a );
3958 bSign = extractFloatx80Sign( b );
3959 if ( aSign != bSign ) {
3960 return
3961 aSign
3962 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3963 != 0 );
3964 }
3965 return
3966 aSign ? lt128( b.high, b.low, a.high, a.low )
3967 : lt128( a.high, a.low, b.high, b.low );
3968
3969}
3970
3971/*----------------------------------------------------------------------------
3972| Returns 1 if the extended double-precision floating-point value `a' is equal
3973| to the corresponding value `b', and 0 otherwise. The invalid exception is
3974| raised if either operand is a NaN. Otherwise, the comparison is performed
3975| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3976*----------------------------------------------------------------------------*/
3977
3978flag floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
3979{
3980
3981 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3982 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3983 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3984 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3985 ) {
3986 float_raise( float_flag_invalid STATUS_VAR);
3987 return 0;
3988 }
3989 return
3990 ( a.low == b.low )
3991 && ( ( a.high == b.high )
3992 || ( ( a.low == 0 )
3993 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3994 );
3995
3996}
3997
3998/*----------------------------------------------------------------------------
3999| Returns 1 if the extended double-precision floating-point value `a' is less
4000| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4001| do not cause an exception. Otherwise, the comparison is performed according
4002| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4003*----------------------------------------------------------------------------*/
4004
4005flag floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4006{
4007 flag aSign, bSign;
4008
4009 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4010 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4011 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4012 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4013 ) {
4014 if ( floatx80_is_signaling_nan( a )
4015 || floatx80_is_signaling_nan( b ) ) {
4016 float_raise( float_flag_invalid STATUS_VAR);
4017 }
4018 return 0;
4019 }
4020 aSign = extractFloatx80Sign( a );
4021 bSign = extractFloatx80Sign( b );
4022 if ( aSign != bSign ) {
4023 return
4024 aSign
4025 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4026 == 0 );
4027 }
4028 return
4029 aSign ? le128( b.high, b.low, a.high, a.low )
4030 : le128( a.high, a.low, b.high, b.low );
4031
4032}
4033
4034/*----------------------------------------------------------------------------
4035| Returns 1 if the extended double-precision floating-point value `a' is less
4036| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4037| an exception. Otherwise, the comparison is performed according to the
4038| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4039*----------------------------------------------------------------------------*/
4040
4041flag floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4042{
4043 flag aSign, bSign;
4044
4045 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4046 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4047 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4048 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4049 ) {
4050 if ( floatx80_is_signaling_nan( a )
4051 || floatx80_is_signaling_nan( b ) ) {
4052 float_raise( float_flag_invalid STATUS_VAR);
4053 }
4054 return 0;
4055 }
4056 aSign = extractFloatx80Sign( a );
4057 bSign = extractFloatx80Sign( b );
4058 if ( aSign != bSign ) {
4059 return
4060 aSign
4061 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4062 != 0 );
4063 }
4064 return
4065 aSign ? lt128( b.high, b.low, a.high, a.low )
4066 : lt128( a.high, a.low, b.high, b.low );
4067
4068}
4069
4070#endif
4071
4072#ifdef FLOAT128
4073
4074/*----------------------------------------------------------------------------
4075| Returns the result of converting the quadruple-precision floating-point
4076| value `a' to the 32-bit two's complement integer format. The conversion
4077| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4078| Arithmetic---which means in particular that the conversion is rounded
4079| according to the current rounding mode. If `a' is a NaN, the largest
4080| positive integer is returned. Otherwise, if the conversion overflows, the
4081| largest integer with the same sign as `a' is returned.
4082*----------------------------------------------------------------------------*/
4083
4084int32 float128_to_int32( float128 a STATUS_PARAM )
4085{
4086 flag aSign;
4087 int32 aExp, shiftCount;
4088 bits64 aSig0, aSig1;
4089
4090 aSig1 = extractFloat128Frac1( a );
4091 aSig0 = extractFloat128Frac0( a );
4092 aExp = extractFloat128Exp( a );
4093 aSign = extractFloat128Sign( a );
4094 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4095 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4096 aSig0 |= ( aSig1 != 0 );
4097 shiftCount = 0x4028 - aExp;
4098 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4099 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4100
4101}
4102
4103/*----------------------------------------------------------------------------
4104| Returns the result of converting the quadruple-precision floating-point
4105| value `a' to the 32-bit two's complement integer format. The conversion
4106| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4107| Arithmetic, except that the conversion is always rounded toward zero. If
4108| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4109| conversion overflows, the largest integer with the same sign as `a' is
4110| returned.
4111*----------------------------------------------------------------------------*/
4112
4113int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4114{
4115 flag aSign;
4116 int32 aExp, shiftCount;
4117 bits64 aSig0, aSig1, savedASig;
4118 int32 z;
4119
4120 aSig1 = extractFloat128Frac1( a );
4121 aSig0 = extractFloat128Frac0( a );
4122 aExp = extractFloat128Exp( a );
4123 aSign = extractFloat128Sign( a );
4124 aSig0 |= ( aSig1 != 0 );
4125 if ( 0x401E < aExp ) {
4126 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4127 goto invalid;
4128 }
4129 else if ( aExp < 0x3FFF ) {
4130 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4131 return 0;
4132 }
4133 aSig0 |= LIT64( 0x0001000000000000 );
4134 shiftCount = 0x402F - aExp;
4135 savedASig = aSig0;
4136 aSig0 >>= shiftCount;
4137 z = aSig0;
4138 if ( aSign ) z = - z;
4139 if ( ( z < 0 ) ^ aSign ) {
4140 invalid:
4141 float_raise( float_flag_invalid STATUS_VAR);
4142 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4143 }
4144 if ( ( aSig0<<shiftCount ) != savedASig ) {
4145 STATUS(float_exception_flags) |= float_flag_inexact;
4146 }
4147 return z;
4148
4149}
4150
4151/*----------------------------------------------------------------------------
4152| Returns the result of converting the quadruple-precision floating-point
4153| value `a' to the 64-bit two's complement integer format. The conversion
4154| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4155| Arithmetic---which means in particular that the conversion is rounded
4156| according to the current rounding mode. If `a' is a NaN, the largest
4157| positive integer is returned. Otherwise, if the conversion overflows, the
4158| largest integer with the same sign as `a' is returned.
4159*----------------------------------------------------------------------------*/
4160
4161int64 float128_to_int64( float128 a STATUS_PARAM )
4162{
4163 flag aSign;
4164 int32 aExp, shiftCount;
4165 bits64 aSig0, aSig1;
4166
4167 aSig1 = extractFloat128Frac1( a );
4168 aSig0 = extractFloat128Frac0( a );
4169 aExp = extractFloat128Exp( a );
4170 aSign = extractFloat128Sign( a );
4171 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4172 shiftCount = 0x402F - aExp;
4173 if ( shiftCount <= 0 ) {
4174 if ( 0x403E < aExp ) {
4175 float_raise( float_flag_invalid STATUS_VAR);
4176 if ( ! aSign
4177 || ( ( aExp == 0x7FFF )
4178 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4179 )
4180 ) {
4181 return LIT64( 0x7FFFFFFFFFFFFFFF );
4182 }
4183 return (sbits64) LIT64( 0x8000000000000000 );
4184 }
4185 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4186 }
4187 else {
4188 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4189 }
4190 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4191
4192}
4193
4194/*----------------------------------------------------------------------------
4195| Returns the result of converting the quadruple-precision floating-point
4196| value `a' to the 64-bit two's complement integer format. The conversion
4197| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4198| Arithmetic, except that the conversion is always rounded toward zero.
4199| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4200| the conversion overflows, the largest integer with the same sign as `a' is
4201| returned.
4202*----------------------------------------------------------------------------*/
4203
4204int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4205{
4206 flag aSign;
4207 int32 aExp, shiftCount;
4208 bits64 aSig0, aSig1;
4209 int64 z;
4210
4211 aSig1 = extractFloat128Frac1( a );
4212 aSig0 = extractFloat128Frac0( a );
4213 aExp = extractFloat128Exp( a );
4214 aSign = extractFloat128Sign( a );
4215 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4216 shiftCount = aExp - 0x402F;
4217 if ( 0 < shiftCount ) {
4218 if ( 0x403E <= aExp ) {
4219 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4220 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4221 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4222 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4223 }
4224 else {
4225 float_raise( float_flag_invalid STATUS_VAR);
4226 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4227 return LIT64( 0x7FFFFFFFFFFFFFFF );
4228 }
4229 }
4230 return (sbits64) LIT64( 0x8000000000000000 );
4231 }
4232 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4233 if ( (bits64) ( aSig1<<shiftCount ) ) {
4234 STATUS(float_exception_flags) |= float_flag_inexact;
4235 }
4236 }
4237 else {
4238 if ( aExp < 0x3FFF ) {
4239 if ( aExp | aSig0 | aSig1 ) {
4240 STATUS(float_exception_flags) |= float_flag_inexact;
4241 }
4242 return 0;
4243 }
4244 z = aSig0>>( - shiftCount );
4245 if ( aSig1
4246 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4247 STATUS(float_exception_flags) |= float_flag_inexact;
4248 }
4249 }
4250 if ( aSign ) z = - z;
4251 return z;
4252
4253}
4254
4255/*----------------------------------------------------------------------------
4256| Returns the result of converting the quadruple-precision floating-point
4257| value `a' to the single-precision floating-point format. The conversion
4258| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4259| Arithmetic.
4260*----------------------------------------------------------------------------*/
4261
4262float32 float128_to_float32( float128 a STATUS_PARAM )
4263{
4264 flag aSign;
4265 int32 aExp;
4266 bits64 aSig0, aSig1;
4267 bits32 zSig;
4268
4269 aSig1 = extractFloat128Frac1( a );
4270 aSig0 = extractFloat128Frac0( a );
4271 aExp = extractFloat128Exp( a );
4272 aSign = extractFloat128Sign( a );
4273 if ( aExp == 0x7FFF ) {
4274 if ( aSig0 | aSig1 ) {
4275 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4276 }
4277 return packFloat32( aSign, 0xFF, 0 );
4278 }
4279 aSig0 |= ( aSig1 != 0 );
4280 shift64RightJamming( aSig0, 18, &aSig0 );
4281 zSig = aSig0;
4282 if ( aExp || zSig ) {
4283 zSig |= 0x40000000;
4284 aExp -= 0x3F81;
4285 }
4286 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4287
4288}
4289
4290/*----------------------------------------------------------------------------
4291| Returns the result of converting the quadruple-precision floating-point
4292| value `a' to the double-precision floating-point format. The conversion
4293| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4294| Arithmetic.
4295*----------------------------------------------------------------------------*/
4296
4297float64 float128_to_float64( float128 a STATUS_PARAM )
4298{
4299 flag aSign;
4300 int32 aExp;
4301 bits64 aSig0, aSig1;
4302
4303 aSig1 = extractFloat128Frac1( a );
4304 aSig0 = extractFloat128Frac0( a );
4305 aExp = extractFloat128Exp( a );
4306 aSign = extractFloat128Sign( a );
4307 if ( aExp == 0x7FFF ) {
4308 if ( aSig0 | aSig1 ) {
4309 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4310 }
4311 return packFloat64( aSign, 0x7FF, 0 );
4312 }
4313 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4314 aSig0 |= ( aSig1 != 0 );
4315 if ( aExp || aSig0 ) {
4316 aSig0 |= LIT64( 0x4000000000000000 );
4317 aExp -= 0x3C01;
4318 }
4319 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4320
4321}
4322
4323#ifdef FLOATX80
4324
4325/*----------------------------------------------------------------------------
4326| Returns the result of converting the quadruple-precision floating-point
4327| value `a' to the extended double-precision floating-point format. The
4328| conversion is performed according to the IEC/IEEE Standard for Binary
4329| Floating-Point Arithmetic.
4330*----------------------------------------------------------------------------*/
4331
4332floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4333{
4334 flag aSign;
4335 int32 aExp;
4336 bits64 aSig0, aSig1;
4337
4338 aSig1 = extractFloat128Frac1( a );
4339 aSig0 = extractFloat128Frac0( a );
4340 aExp = extractFloat128Exp( a );
4341 aSign = extractFloat128Sign( a );
4342 if ( aExp == 0x7FFF ) {
4343 if ( aSig0 | aSig1 ) {
4344 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4345 }
4346 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4347 }
4348 if ( aExp == 0 ) {
4349 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4350 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4351 }
4352 else {
4353 aSig0 |= LIT64( 0x0001000000000000 );
4354 }
4355 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4356 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4357
4358}
4359
4360#endif
4361
4362/*----------------------------------------------------------------------------
4363| Rounds the quadruple-precision floating-point value `a' to an integer, and
4364| returns the result as a quadruple-precision floating-point value. The
4365| operation is performed according to the IEC/IEEE Standard for Binary
4366| Floating-Point Arithmetic.
4367*----------------------------------------------------------------------------*/
4368
4369float128 float128_round_to_int( float128 a STATUS_PARAM )
4370{
4371 flag aSign;
4372 int32 aExp;
4373 bits64 lastBitMask, roundBitsMask;
4374 int8 roundingMode;
4375 float128 z;
4376
4377 aExp = extractFloat128Exp( a );
4378 if ( 0x402F <= aExp ) {
4379 if ( 0x406F <= aExp ) {
4380 if ( ( aExp == 0x7FFF )
4381 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4382 ) {
4383 return propagateFloat128NaN( a, a STATUS_VAR );
4384 }
4385 return a;
4386 }
4387 lastBitMask = 1;
4388 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4389 roundBitsMask = lastBitMask - 1;
4390 z = a;
4391 roundingMode = STATUS(float_rounding_mode);
4392 if ( roundingMode == float_round_nearest_even ) {
4393 if ( lastBitMask ) {
4394 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4395 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4396 }
4397 else {
4398 if ( (sbits64) z.low < 0 ) {
4399 ++z.high;
4400 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4401 }
4402 }
4403 }
4404 else if ( roundingMode != float_round_to_zero ) {
4405 if ( extractFloat128Sign( z )
4406 ^ ( roundingMode == float_round_up ) ) {
4407 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4408 }
4409 }
4410 z.low &= ~ roundBitsMask;
4411 }
4412 else {
4413 if ( aExp < 0x3FFF ) {
4414 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4415 STATUS(float_exception_flags) |= float_flag_inexact;
4416 aSign = extractFloat128Sign( a );
4417 switch ( STATUS(float_rounding_mode) ) {
4418 case float_round_nearest_even:
4419 if ( ( aExp == 0x3FFE )
4420 && ( extractFloat128Frac0( a )
4421 | extractFloat128Frac1( a ) )
4422 ) {
4423 return packFloat128( aSign, 0x3FFF, 0, 0 );
4424 }
4425 break;
4426 case float_round_down:
4427 return
4428 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4429 : packFloat128( 0, 0, 0, 0 );
4430 case float_round_up:
4431 return
4432 aSign ? packFloat128( 1, 0, 0, 0 )
4433 : packFloat128( 0, 0x3FFF, 0, 0 );
4434 }
4435 return packFloat128( aSign, 0, 0, 0 );
4436 }
4437 lastBitMask = 1;
4438 lastBitMask <<= 0x402F - aExp;
4439 roundBitsMask = lastBitMask - 1;
4440 z.low = 0;
4441 z.high = a.high;
4442 roundingMode = STATUS(float_rounding_mode);
4443 if ( roundingMode == float_round_nearest_even ) {
4444 z.high += lastBitMask>>1;
4445 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4446 z.high &= ~ lastBitMask;
4447 }
4448 }
4449 else if ( roundingMode != float_round_to_zero ) {
4450 if ( extractFloat128Sign( z )
4451 ^ ( roundingMode == float_round_up ) ) {
4452 z.high |= ( a.low != 0 );
4453 z.high += roundBitsMask;
4454 }
4455 }
4456 z.high &= ~ roundBitsMask;
4457 }
4458 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4459 STATUS(float_exception_flags) |= float_flag_inexact;
4460 }
4461 return z;
4462
4463}
4464
4465/*----------------------------------------------------------------------------
4466| Returns the result of adding the absolute values of the quadruple-precision
4467| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4468| before being returned. `zSign' is ignored if the result is a NaN.
4469| The addition is performed according to the IEC/IEEE Standard for Binary
4470| Floating-Point Arithmetic.
4471*----------------------------------------------------------------------------*/
4472
4473static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4474{
4475 int32 aExp, bExp, zExp;
4476 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4477 int32 expDiff;
4478
4479 aSig1 = extractFloat128Frac1( a );
4480 aSig0 = extractFloat128Frac0( a );
4481 aExp = extractFloat128Exp( a );
4482 bSig1 = extractFloat128Frac1( b );
4483 bSig0 = extractFloat128Frac0( b );
4484 bExp = extractFloat128Exp( b );
4485 expDiff = aExp - bExp;
4486 if ( 0 < expDiff ) {
4487 if ( aExp == 0x7FFF ) {
4488 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4489 return a;
4490 }
4491 if ( bExp == 0 ) {
4492 --expDiff;
4493 }
4494 else {
4495 bSig0 |= LIT64( 0x0001000000000000 );
4496 }
4497 shift128ExtraRightJamming(
4498 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4499 zExp = aExp;
4500 }
4501 else if ( expDiff < 0 ) {
4502 if ( bExp == 0x7FFF ) {
4503 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4504 return packFloat128( zSign, 0x7FFF, 0, 0 );
4505 }
4506 if ( aExp == 0 ) {
4507 ++expDiff;
4508 }
4509 else {
4510 aSig0 |= LIT64( 0x0001000000000000 );
4511 }
4512 shift128ExtraRightJamming(
4513 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4514 zExp = bExp;
4515 }
4516 else {
4517 if ( aExp == 0x7FFF ) {
4518 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4519 return propagateFloat128NaN( a, b STATUS_VAR );
4520 }
4521 return a;
4522 }
4523 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4524 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4525 zSig2 = 0;
4526 zSig0 |= LIT64( 0x0002000000000000 );
4527 zExp = aExp;
4528 goto shiftRight1;
4529 }
4530 aSig0 |= LIT64( 0x0001000000000000 );
4531 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4532 --zExp;
4533 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4534 ++zExp;
4535 shiftRight1:
4536 shift128ExtraRightJamming(
4537 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4538 roundAndPack:
4539 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4540
4541}
4542
4543/*----------------------------------------------------------------------------
4544| Returns the result of subtracting the absolute values of the quadruple-
4545| precision floating-point values `a' and `b'. If `zSign' is 1, the
4546| difference is negated before being returned. `zSign' is ignored if the
4547| result is a NaN. The subtraction is performed according to the IEC/IEEE
4548| Standard for Binary Floating-Point Arithmetic.
4549*----------------------------------------------------------------------------*/
4550
4551static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4552{
4553 int32 aExp, bExp, zExp;
4554 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4555 int32 expDiff;
4556 float128 z;
4557
4558 aSig1 = extractFloat128Frac1( a );
4559 aSig0 = extractFloat128Frac0( a );
4560 aExp = extractFloat128Exp( a );
4561 bSig1 = extractFloat128Frac1( b );
4562 bSig0 = extractFloat128Frac0( b );
4563 bExp = extractFloat128Exp( b );
4564 expDiff = aExp - bExp;
4565 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4566 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4567 if ( 0 < expDiff ) goto aExpBigger;
4568 if ( expDiff < 0 ) goto bExpBigger;
4569 if ( aExp == 0x7FFF ) {
4570 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4571 return propagateFloat128NaN( a, b STATUS_VAR );
4572 }
4573 float_raise( float_flag_invalid STATUS_VAR);
4574 z.low = float128_default_nan_low;
4575 z.high = float128_default_nan_high;
4576 return z;
4577 }
4578 if ( aExp == 0 ) {
4579 aExp = 1;
4580 bExp = 1;
4581 }
4582 if ( bSig0 < aSig0 ) goto aBigger;
4583 if ( aSig0 < bSig0 ) goto bBigger;
4584 if ( bSig1 < aSig1 ) goto aBigger;
4585 if ( aSig1 < bSig1 ) goto bBigger;
4586 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4587 bExpBigger:
4588 if ( bExp == 0x7FFF ) {
4589 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4590 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4591 }
4592 if ( aExp == 0 ) {
4593 ++expDiff;
4594 }
4595 else {
4596 aSig0 |= LIT64( 0x4000000000000000 );
4597 }
4598 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4599 bSig0 |= LIT64( 0x4000000000000000 );
4600 bBigger:
4601 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4602 zExp = bExp;
4603 zSign ^= 1;
4604 goto normalizeRoundAndPack;
4605 aExpBigger:
4606 if ( aExp == 0x7FFF ) {
4607 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4608 return a;
4609 }
4610 if ( bExp == 0 ) {
4611 --expDiff;
4612 }
4613 else {
4614 bSig0 |= LIT64( 0x4000000000000000 );
4615 }
4616 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4617 aSig0 |= LIT64( 0x4000000000000000 );
4618 aBigger:
4619 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4620 zExp = aExp;
4621 normalizeRoundAndPack:
4622 --zExp;
4623 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4624
4625}
4626
4627/*----------------------------------------------------------------------------
4628| Returns the result of adding the quadruple-precision floating-point values
4629| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4630| for Binary Floating-Point Arithmetic.
4631*----------------------------------------------------------------------------*/
4632
4633float128 float128_add( float128 a, float128 b STATUS_PARAM )
4634{
4635 flag aSign, bSign;
4636
4637 aSign = extractFloat128Sign( a );
4638 bSign = extractFloat128Sign( b );
4639 if ( aSign == bSign ) {
4640 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4641 }
4642 else {
4643 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4644 }
4645
4646}
4647
4648/*----------------------------------------------------------------------------
4649| Returns the result of subtracting the quadruple-precision floating-point
4650| values `a' and `b'. The operation is performed according to the IEC/IEEE
4651| Standard for Binary Floating-Point Arithmetic.
4652*----------------------------------------------------------------------------*/
4653
4654float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4655{
4656 flag aSign, bSign;
4657
4658 aSign = extractFloat128Sign( a );
4659 bSign = extractFloat128Sign( b );
4660 if ( aSign == bSign ) {
4661 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4662 }
4663 else {
4664 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4665 }
4666
4667}
4668
4669/*----------------------------------------------------------------------------
4670| Returns the result of multiplying the quadruple-precision floating-point
4671| values `a' and `b'. The operation is performed according to the IEC/IEEE
4672| Standard for Binary Floating-Point Arithmetic.
4673*----------------------------------------------------------------------------*/
4674
4675float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4676{
4677 flag aSign, bSign, zSign;
4678 int32 aExp, bExp, zExp;
4679 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4680 float128 z;
4681
4682 aSig1 = extractFloat128Frac1( a );
4683 aSig0 = extractFloat128Frac0( a );
4684 aExp = extractFloat128Exp( a );
4685 aSign = extractFloat128Sign( a );
4686 bSig1 = extractFloat128Frac1( b );
4687 bSig0 = extractFloat128Frac0( b );
4688 bExp = extractFloat128Exp( b );
4689 bSign = extractFloat128Sign( b );
4690 zSign = aSign ^ bSign;
4691 if ( aExp == 0x7FFF ) {
4692 if ( ( aSig0 | aSig1 )
4693 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4694 return propagateFloat128NaN( a, b STATUS_VAR );
4695 }
4696 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4697 return packFloat128( zSign, 0x7FFF, 0, 0 );
4698 }
4699 if ( bExp == 0x7FFF ) {
4700 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4701 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4702 invalid:
4703 float_raise( float_flag_invalid STATUS_VAR);
4704 z.low = float128_default_nan_low;
4705 z.high = float128_default_nan_high;
4706 return z;
4707 }
4708 return packFloat128( zSign, 0x7FFF, 0, 0 );
4709 }
4710 if ( aExp == 0 ) {
4711 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4712 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4713 }
4714 if ( bExp == 0 ) {
4715 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4716 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4717 }
4718 zExp = aExp + bExp - 0x4000;
4719 aSig0 |= LIT64( 0x0001000000000000 );
4720 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4721 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4722 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4723 zSig2 |= ( zSig3 != 0 );
4724 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4725 shift128ExtraRightJamming(
4726 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4727 ++zExp;
4728 }
4729 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4730
4731}
4732
4733/*----------------------------------------------------------------------------
4734| Returns the result of dividing the quadruple-precision floating-point value
4735| `a' by the corresponding value `b'. The operation is performed according to
4736| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4737*----------------------------------------------------------------------------*/
4738
4739float128 float128_div( float128 a, float128 b STATUS_PARAM )
4740{
4741 flag aSign, bSign, zSign;
4742 int32 aExp, bExp, zExp;
4743 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4744 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4745 float128 z;
4746
4747 aSig1 = extractFloat128Frac1( a );
4748 aSig0 = extractFloat128Frac0( a );
4749 aExp = extractFloat128Exp( a );
4750 aSign = extractFloat128Sign( a );
4751 bSig1 = extractFloat128Frac1( b );
4752 bSig0 = extractFloat128Frac0( b );
4753 bExp = extractFloat128Exp( b );
4754 bSign = extractFloat128Sign( b );
4755 zSign = aSign ^ bSign;
4756 if ( aExp == 0x7FFF ) {
4757 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4758 if ( bExp == 0x7FFF ) {
4759 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4760 goto invalid;
4761 }
4762 return packFloat128( zSign, 0x7FFF, 0, 0 );
4763 }
4764 if ( bExp == 0x7FFF ) {
4765 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4766 return packFloat128( zSign, 0, 0, 0 );
4767 }
4768 if ( bExp == 0 ) {
4769 if ( ( bSig0 | bSig1 ) == 0 ) {
4770 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4771 invalid:
4772 float_raise( float_flag_invalid STATUS_VAR);
4773 z.low = float128_default_nan_low;
4774 z.high = float128_default_nan_high;
4775 return z;
4776 }
4777 float_raise( float_flag_divbyzero STATUS_VAR);
4778 return packFloat128( zSign, 0x7FFF, 0, 0 );
4779 }
4780 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4781 }
4782 if ( aExp == 0 ) {
4783 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4784 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4785 }
4786 zExp = aExp - bExp + 0x3FFD;
4787 shortShift128Left(
4788 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4789 shortShift128Left(
4790 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4791 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4792 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4793 ++zExp;
4794 }
4795 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4796 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4797 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4798 while ( (sbits64) rem0 < 0 ) {
4799 --zSig0;
4800 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4801 }
4802 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4803 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4804 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4805 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4806 while ( (sbits64) rem1 < 0 ) {
4807 --zSig1;
4808 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4809 }
4810 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4811 }
4812 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4813 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4814
4815}
4816
4817/*----------------------------------------------------------------------------
4818| Returns the remainder of the quadruple-precision floating-point value `a'
4819| with respect to the corresponding value `b'. The operation is performed
4820| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4821*----------------------------------------------------------------------------*/
4822
4823float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4824{
4825 flag aSign, bSign, zSign;
4826 int32 aExp, bExp, expDiff;
4827 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4828 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4829 sbits64 sigMean0;
4830 float128 z;
4831
4832 aSig1 = extractFloat128Frac1( a );
4833 aSig0 = extractFloat128Frac0( a );
4834 aExp = extractFloat128Exp( a );
4835 aSign = extractFloat128Sign( a );
4836 bSig1 = extractFloat128Frac1( b );
4837 bSig0 = extractFloat128Frac0( b );
4838 bExp = extractFloat128Exp( b );
4839 bSign = extractFloat128Sign( b );
4840 if ( aExp == 0x7FFF ) {
4841 if ( ( aSig0 | aSig1 )
4842 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4843 return propagateFloat128NaN( a, b STATUS_VAR );
4844 }
4845 goto invalid;
4846 }
4847 if ( bExp == 0x7FFF ) {
4848 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4849 return a;
4850 }
4851 if ( bExp == 0 ) {
4852 if ( ( bSig0 | bSig1 ) == 0 ) {
4853 invalid:
4854 float_raise( float_flag_invalid STATUS_VAR);
4855 z.low = float128_default_nan_low;
4856 z.high = float128_default_nan_high;
4857 return z;
4858 }
4859 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4860 }
4861 if ( aExp == 0 ) {
4862 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4863 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4864 }
4865 expDiff = aExp - bExp;
4866 if ( expDiff < -1 ) return a;
4867 shortShift128Left(
4868 aSig0 | LIT64( 0x0001000000000000 ),
4869 aSig1,
4870 15 - ( expDiff < 0 ),
4871 &aSig0,
4872 &aSig1
4873 );
4874 shortShift128Left(
4875 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4876 q = le128( bSig0, bSig1, aSig0, aSig1 );
4877 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4878 expDiff -= 64;
4879 while ( 0 < expDiff ) {
4880 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4881 q = ( 4 < q ) ? q - 4 : 0;
4882 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4883 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4884 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4885 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4886 expDiff -= 61;
4887 }
4888 if ( -64 < expDiff ) {
4889 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4890 q = ( 4 < q ) ? q - 4 : 0;
4891 q >>= - expDiff;
4892 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4893 expDiff += 52;
4894 if ( expDiff < 0 ) {
4895 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4896 }
4897 else {
4898 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4899 }
4900 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4901 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4902 }
4903 else {
4904 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4905 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4906 }
4907 do {
4908 alternateASig0 = aSig0;
4909 alternateASig1 = aSig1;
4910 ++q;
4911 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4912 } while ( 0 <= (sbits64) aSig0 );
4913 add128(
4914 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4915 if ( ( sigMean0 < 0 )
4916 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4917 aSig0 = alternateASig0;
4918 aSig1 = alternateASig1;
4919 }
4920 zSign = ( (sbits64) aSig0 < 0 );
4921 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4922 return
4923 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4924
4925}
4926
4927/*----------------------------------------------------------------------------
4928| Returns the square root of the quadruple-precision floating-point value `a'.
4929| The operation is performed according to the IEC/IEEE Standard for Binary
4930| Floating-Point Arithmetic.
4931*----------------------------------------------------------------------------*/
4932
4933float128 float128_sqrt( float128 a STATUS_PARAM )
4934{
4935 flag aSign;
4936 int32 aExp, zExp;
4937 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4938 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4939 float128 z;
4940
4941 aSig1 = extractFloat128Frac1( a );
4942 aSig0 = extractFloat128Frac0( a );
4943 aExp = extractFloat128Exp( a );
4944 aSign = extractFloat128Sign( a );
4945 if ( aExp == 0x7FFF ) {
4946 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4947 if ( ! aSign ) return a;
4948 goto invalid;
4949 }
4950 if ( aSign ) {
4951 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4952 invalid:
4953 float_raise( float_flag_invalid STATUS_VAR);
4954 z.low = float128_default_nan_low;
4955 z.high = float128_default_nan_high;
4956 return z;
4957 }
4958 if ( aExp == 0 ) {
4959 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4960 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4961 }
4962 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4963 aSig0 |= LIT64( 0x0001000000000000 );
4964 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4965 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4966 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4967 doubleZSig0 = zSig0<<1;
4968 mul64To128( zSig0, zSig0, &term0, &term1 );
4969 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4970 while ( (sbits64) rem0 < 0 ) {
4971 --zSig0;
4972 doubleZSig0 -= 2;
4973 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4974 }
4975 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4976 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4977 if ( zSig1 == 0 ) zSig1 = 1;
4978 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4979 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4980 mul64To128( zSig1, zSig1, &term2, &term3 );
4981 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4982 while ( (sbits64) rem1 < 0 ) {
4983 --zSig1;
4984 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4985 term3 |= 1;
4986 term2 |= doubleZSig0;
4987 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4988 }
4989 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4990 }
4991 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4992 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4993
4994}
4995
4996/*----------------------------------------------------------------------------
4997| Returns 1 if the quadruple-precision floating-point value `a' is equal to
4998| the corresponding value `b', and 0 otherwise. The comparison is performed
4999| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5000*----------------------------------------------------------------------------*/
5001
5002flag float128_eq( float128 a, float128 b STATUS_PARAM )
5003{
5004
5005 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5006 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5007 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5008 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5009 ) {
5010 if ( float128_is_signaling_nan( a )
5011 || float128_is_signaling_nan( b ) ) {
5012 float_raise( float_flag_invalid STATUS_VAR);
5013 }
5014 return 0;
5015 }
5016 return
5017 ( a.low == b.low )
5018 && ( ( a.high == b.high )
5019 || ( ( a.low == 0 )
5020 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5021 );
5022
5023}
5024
5025/*----------------------------------------------------------------------------
5026| Returns 1 if the quadruple-precision floating-point value `a' is less than
5027| or equal to the corresponding value `b', and 0 otherwise. The comparison
5028| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5029| Arithmetic.
5030*----------------------------------------------------------------------------*/
5031
5032flag float128_le( float128 a, float128 b STATUS_PARAM )
5033{
5034 flag aSign, bSign;
5035
5036 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5037 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5038 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5039 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5040 ) {
5041 float_raise( float_flag_invalid STATUS_VAR);
5042 return 0;
5043 }
5044 aSign = extractFloat128Sign( a );
5045 bSign = extractFloat128Sign( b );
5046 if ( aSign != bSign ) {
5047 return
5048 aSign
5049 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5050 == 0 );
5051 }
5052 return
5053 aSign ? le128( b.high, b.low, a.high, a.low )
5054 : le128( a.high, a.low, b.high, b.low );
5055
5056}
5057
5058/*----------------------------------------------------------------------------
5059| Returns 1 if the quadruple-precision floating-point value `a' is less than
5060| the corresponding value `b', and 0 otherwise. The comparison is performed
5061| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5062*----------------------------------------------------------------------------*/
5063
5064flag float128_lt( float128 a, float128 b STATUS_PARAM )
5065{
5066 flag aSign, bSign;
5067
5068 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5069 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5070 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5071 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5072 ) {
5073 float_raise( float_flag_invalid STATUS_VAR);
5074 return 0;
5075 }
5076 aSign = extractFloat128Sign( a );
5077 bSign = extractFloat128Sign( b );
5078 if ( aSign != bSign ) {
5079 return
5080 aSign
5081 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5082 != 0 );
5083 }
5084 return
5085 aSign ? lt128( b.high, b.low, a.high, a.low )
5086 : lt128( a.high, a.low, b.high, b.low );
5087
5088}
5089
5090/*----------------------------------------------------------------------------
5091| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5092| the corresponding value `b', and 0 otherwise. The invalid exception is
5093| raised if either operand is a NaN. Otherwise, the comparison is performed
5094| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5095*----------------------------------------------------------------------------*/
5096
5097flag float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5098{
5099
5100 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5101 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5102 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5103 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5104 ) {
5105 float_raise( float_flag_invalid STATUS_VAR);
5106 return 0;
5107 }
5108 return
5109 ( a.low == b.low )
5110 && ( ( a.high == b.high )
5111 || ( ( a.low == 0 )
5112 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5113 );
5114
5115}
5116
5117/*----------------------------------------------------------------------------
5118| Returns 1 if the quadruple-precision floating-point value `a' is less than
5119| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5120| cause an exception. Otherwise, the comparison is performed according to the
5121| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5122*----------------------------------------------------------------------------*/
5123
5124flag float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5125{
5126 flag aSign, bSign;
5127
5128 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5129 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5130 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5131 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5132 ) {
5133 if ( float128_is_signaling_nan( a )
5134 || float128_is_signaling_nan( b ) ) {
5135 float_raise( float_flag_invalid STATUS_VAR);
5136 }
5137 return 0;
5138 }
5139 aSign = extractFloat128Sign( a );
5140 bSign = extractFloat128Sign( b );
5141 if ( aSign != bSign ) {
5142 return
5143 aSign
5144 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5145 == 0 );
5146 }
5147 return
5148 aSign ? le128( b.high, b.low, a.high, a.low )
5149 : le128( a.high, a.low, b.high, b.low );
5150
5151}
5152
5153/*----------------------------------------------------------------------------
5154| Returns 1 if the quadruple-precision floating-point value `a' is less than
5155| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5156| exception. Otherwise, the comparison is performed according to the IEC/IEEE
5157| Standard for Binary Floating-Point Arithmetic.
5158*----------------------------------------------------------------------------*/
5159
5160flag float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5161{
5162 flag aSign, bSign;
5163
5164 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5165 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5166 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5167 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5168 ) {
5169 if ( float128_is_signaling_nan( a )
5170 || float128_is_signaling_nan( b ) ) {
5171 float_raise( float_flag_invalid STATUS_VAR);
5172 }
5173 return 0;
5174 }
5175 aSign = extractFloat128Sign( a );
5176 bSign = extractFloat128Sign( b );
5177 if ( aSign != bSign ) {
5178 return
5179 aSign
5180 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5181 != 0 );
5182 }
5183 return
5184 aSign ? lt128( b.high, b.low, a.high, a.low )
5185 : lt128( a.high, a.low, b.high, b.low );
5186
5187}
5188
5189#endif
5190
1d6bda35
FB
5191/* misc functions */
5192float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5193{
5194 return int64_to_float32(a STATUS_VAR);
5195}
5196
5197float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5198{
5199 return int64_to_float64(a STATUS_VAR);
5200}
5201
5202unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5203{
5204 int64_t v;
5205 unsigned int res;
5206
5207 v = float32_to_int64(a STATUS_VAR);
5208 if (v < 0) {
5209 res = 0;
5210 float_raise( float_flag_invalid STATUS_VAR);
5211 } else if (v > 0xffffffff) {
5212 res = 0xffffffff;
5213 float_raise( float_flag_invalid STATUS_VAR);
5214 } else {
5215 res = v;
5216 }
5217 return res;
5218}
5219
5220unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5221{
5222 int64_t v;
5223 unsigned int res;
5224
5225 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5226 if (v < 0) {
5227 res = 0;
5228 float_raise( float_flag_invalid STATUS_VAR);
5229 } else if (v > 0xffffffff) {
5230 res = 0xffffffff;
5231 float_raise( float_flag_invalid STATUS_VAR);
5232 } else {
5233 res = v;
5234 }
5235 return res;
5236}
5237
5238unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5239{
5240 int64_t v;
5241 unsigned int res;
5242
5243 v = float64_to_int64(a STATUS_VAR);
5244 if (v < 0) {
5245 res = 0;
5246 float_raise( float_flag_invalid STATUS_VAR);
5247 } else if (v > 0xffffffff) {
5248 res = 0xffffffff;
5249 float_raise( float_flag_invalid STATUS_VAR);
5250 } else {
5251 res = v;
5252 }
5253 return res;
5254}
5255
5256unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5257{
5258 int64_t v;
5259 unsigned int res;
5260
5261 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5262 if (v < 0) {
5263 res = 0;
5264 float_raise( float_flag_invalid STATUS_VAR);
5265 } else if (v > 0xffffffff) {
5266 res = 0xffffffff;
5267 float_raise( float_flag_invalid STATUS_VAR);
5268 } else {
5269 res = v;
5270 }
5271 return res;
5272}
5273
5274#define COMPARE(s, nan_exp) \
5275INLINE char float ## s ## _compare_internal( float ## s a, float ## s b, \
5276 int is_quiet STATUS_PARAM ) \
5277{ \
5278 flag aSign, bSign; \
5279 \
5280 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5281 extractFloat ## s ## Frac( a ) ) || \
5282 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5283 extractFloat ## s ## Frac( b ) )) { \
5284 if (!is_quiet || \
5285 float ## s ## _is_signaling_nan( a ) || \
5286 float ## s ## _is_signaling_nan( b ) ) { \
5287 float_raise( float_flag_invalid STATUS_VAR); \
5288 } \
5289 return float_relation_unordered; \
5290 } \
5291 aSign = extractFloat ## s ## Sign( a ); \
5292 bSign = extractFloat ## s ## Sign( b ); \
5293 if ( aSign != bSign ) { \
5294 if ( (bits ## s) ( ( a | b )<<1 ) == 0 ) { \
5295 /* zero case */ \
5296 return float_relation_equal; \
5297 } else { \
5298 return 1 - (2 * aSign); \
5299 } \
5300 } else { \
5301 if (a == b) { \
5302 return float_relation_equal; \
5303 } else { \
5304 return 1 - 2 * (aSign ^ ( a < b )); \
5305 } \
5306 } \
5307} \
5308 \
5309char float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5310{ \
5311 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5312} \
5313 \
5314char float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5315{ \
5316 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5317}
5318
5319COMPARE(32, 0xff)
5320COMPARE(64, 0x7ff)