]> git.proxmox.com Git - qemu.git/blame - fpu/softfloat.c
monitor: fix build breakage for !CONFIG_VNC
[qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
158142c2
FB
6
7/*============================================================================
8
9This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10Package, Release 2b.
11
12Written by John R. Hauser. This work was made possible in part by the
13International Computer Science Institute, located at Suite 600, 1947 Center
14Street, Berkeley, California 94704. Funding was partially provided by the
15National Science Foundation under grant MIP-9311980. The original version
16of this code was written as part of a project to build a fixed-point vector
17processor in collaboration with the University of California at Berkeley,
18overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20arithmetic/SoftFloat.html'.
21
22THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31Derivative works are acceptable, even for commercial purposes, so long as
32(1) the source code for the derivative work includes prominent notice that
33the work is derivative, and (2) the source code includes prominent notice with
34these four paragraphs for those parts of this code that are retained.
35
36=============================================================================*/
37
38#include "softfloat.h"
39
40/*----------------------------------------------------------------------------
41| Primitive arithmetic functions, including multi-word arithmetic, and
42| division and square root approximations. (Can be specialized to target if
43| desired.)
44*----------------------------------------------------------------------------*/
45#include "softfloat-macros.h"
46
47/*----------------------------------------------------------------------------
48| Functions and definitions to determine: (1) whether tininess for underflow
49| is detected before or after rounding by default, (2) what (if anything)
50| happens when exceptions are raised, (3) how signaling NaNs are distinguished
51| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52| are propagated from function inputs to output. These details are target-
53| specific.
54*----------------------------------------------------------------------------*/
55#include "softfloat-specialize.h"
56
57void set_float_rounding_mode(int val STATUS_PARAM)
58{
59 STATUS(float_rounding_mode) = val;
60}
61
1d6bda35
FB
62void set_float_exception_flags(int val STATUS_PARAM)
63{
64 STATUS(float_exception_flags) = val;
65}
66
158142c2
FB
67void set_floatx80_rounding_precision(int val STATUS_PARAM)
68{
69 STATUS(floatx80_rounding_precision) = val;
70}
158142c2 71
bb4d4bb3
PM
72/*----------------------------------------------------------------------------
73| Returns the fraction bits of the half-precision floating-point value `a'.
74*----------------------------------------------------------------------------*/
75
76INLINE uint32_t extractFloat16Frac(float16 a)
77{
78 return float16_val(a) & 0x3ff;
79}
80
81/*----------------------------------------------------------------------------
82| Returns the exponent bits of the half-precision floating-point value `a'.
83*----------------------------------------------------------------------------*/
84
85INLINE int16 extractFloat16Exp(float16 a)
86{
87 return (float16_val(a) >> 10) & 0x1f;
88}
89
90/*----------------------------------------------------------------------------
91| Returns the sign bit of the single-precision floating-point value `a'.
92*----------------------------------------------------------------------------*/
93
94INLINE flag extractFloat16Sign(float16 a)
95{
96 return float16_val(a)>>15;
97}
98
158142c2
FB
99/*----------------------------------------------------------------------------
100| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
101| and 7, and returns the properly rounded 32-bit integer corresponding to the
102| input. If `zSign' is 1, the input is negated before being converted to an
103| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
104| is simply rounded to an integer, with the inexact exception raised if the
105| input cannot be represented exactly as an integer. However, if the fixed-
106| point input is too large, the invalid exception is raised and the largest
107| positive or negative integer is returned.
108*----------------------------------------------------------------------------*/
109
bb98fe42 110static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
158142c2
FB
111{
112 int8 roundingMode;
113 flag roundNearestEven;
114 int8 roundIncrement, roundBits;
115 int32 z;
116
117 roundingMode = STATUS(float_rounding_mode);
118 roundNearestEven = ( roundingMode == float_round_nearest_even );
119 roundIncrement = 0x40;
120 if ( ! roundNearestEven ) {
121 if ( roundingMode == float_round_to_zero ) {
122 roundIncrement = 0;
123 }
124 else {
125 roundIncrement = 0x7F;
126 if ( zSign ) {
127 if ( roundingMode == float_round_up ) roundIncrement = 0;
128 }
129 else {
130 if ( roundingMode == float_round_down ) roundIncrement = 0;
131 }
132 }
133 }
134 roundBits = absZ & 0x7F;
135 absZ = ( absZ + roundIncrement )>>7;
136 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
137 z = absZ;
138 if ( zSign ) z = - z;
139 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
140 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 141 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
142 }
143 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
144 return z;
145
146}
147
148/*----------------------------------------------------------------------------
149| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
150| `absZ1', with binary point between bits 63 and 64 (between the input words),
151| and returns the properly rounded 64-bit integer corresponding to the input.
152| If `zSign' is 1, the input is negated before being converted to an integer.
153| Ordinarily, the fixed-point input is simply rounded to an integer, with
154| the inexact exception raised if the input cannot be represented exactly as
155| an integer. However, if the fixed-point input is too large, the invalid
156| exception is raised and the largest positive or negative integer is
157| returned.
158*----------------------------------------------------------------------------*/
159
bb98fe42 160static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
158142c2
FB
161{
162 int8 roundingMode;
163 flag roundNearestEven, increment;
164 int64 z;
165
166 roundingMode = STATUS(float_rounding_mode);
167 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 168 increment = ( (int64_t) absZ1 < 0 );
158142c2
FB
169 if ( ! roundNearestEven ) {
170 if ( roundingMode == float_round_to_zero ) {
171 increment = 0;
172 }
173 else {
174 if ( zSign ) {
175 increment = ( roundingMode == float_round_down ) && absZ1;
176 }
177 else {
178 increment = ( roundingMode == float_round_up ) && absZ1;
179 }
180 }
181 }
182 if ( increment ) {
183 ++absZ0;
184 if ( absZ0 == 0 ) goto overflow;
bb98fe42 185 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
186 }
187 z = absZ0;
188 if ( zSign ) z = - z;
189 if ( z && ( ( z < 0 ) ^ zSign ) ) {
190 overflow:
191 float_raise( float_flag_invalid STATUS_VAR);
192 return
bb98fe42 193 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
194 : LIT64( 0x7FFFFFFFFFFFFFFF );
195 }
196 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
197 return z;
198
199}
200
201/*----------------------------------------------------------------------------
202| Returns the fraction bits of the single-precision floating-point value `a'.
203*----------------------------------------------------------------------------*/
204
bb98fe42 205INLINE uint32_t extractFloat32Frac( float32 a )
158142c2
FB
206{
207
f090c9d4 208 return float32_val(a) & 0x007FFFFF;
158142c2
FB
209
210}
211
212/*----------------------------------------------------------------------------
213| Returns the exponent bits of the single-precision floating-point value `a'.
214*----------------------------------------------------------------------------*/
215
216INLINE int16 extractFloat32Exp( float32 a )
217{
218
f090c9d4 219 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
220
221}
222
223/*----------------------------------------------------------------------------
224| Returns the sign bit of the single-precision floating-point value `a'.
225*----------------------------------------------------------------------------*/
226
227INLINE flag extractFloat32Sign( float32 a )
228{
229
f090c9d4 230 return float32_val(a)>>31;
158142c2
FB
231
232}
233
37d18660
PM
234/*----------------------------------------------------------------------------
235| If `a' is denormal and we are in flush-to-zero mode then set the
236| input-denormal exception and return zero. Otherwise just return the value.
237*----------------------------------------------------------------------------*/
238static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
239{
240 if (STATUS(flush_inputs_to_zero)) {
241 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
242 float_raise(float_flag_input_denormal STATUS_VAR);
243 return make_float32(float32_val(a) & 0x80000000);
244 }
245 }
246 return a;
247}
248
158142c2
FB
249/*----------------------------------------------------------------------------
250| Normalizes the subnormal single-precision floating-point value represented
251| by the denormalized significand `aSig'. The normalized exponent and
252| significand are stored at the locations pointed to by `zExpPtr' and
253| `zSigPtr', respectively.
254*----------------------------------------------------------------------------*/
255
256static void
bb98fe42 257 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
158142c2
FB
258{
259 int8 shiftCount;
260
261 shiftCount = countLeadingZeros32( aSig ) - 8;
262 *zSigPtr = aSig<<shiftCount;
263 *zExpPtr = 1 - shiftCount;
264
265}
266
267/*----------------------------------------------------------------------------
268| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
269| single-precision floating-point value, returning the result. After being
270| shifted into the proper positions, the three fields are simply added
271| together to form the result. This means that any integer portion of `zSig'
272| will be added into the exponent. Since a properly normalized significand
273| will have an integer portion equal to 1, the `zExp' input should be 1 less
274| than the desired result exponent whenever `zSig' is a complete, normalized
275| significand.
276*----------------------------------------------------------------------------*/
277
bb98fe42 278INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
158142c2
FB
279{
280
f090c9d4 281 return make_float32(
bb98fe42 282 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
283
284}
285
286/*----------------------------------------------------------------------------
287| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
288| and significand `zSig', and returns the proper single-precision floating-
289| point value corresponding to the abstract input. Ordinarily, the abstract
290| value is simply rounded and packed into the single-precision format, with
291| the inexact exception raised if the abstract input cannot be represented
292| exactly. However, if the abstract value is too large, the overflow and
293| inexact exceptions are raised and an infinity or maximal finite value is
294| returned. If the abstract value is too small, the input value is rounded to
295| a subnormal number, and the underflow and inexact exceptions are raised if
296| the abstract input cannot be represented exactly as a subnormal single-
297| precision floating-point number.
298| The input significand `zSig' has its binary point between bits 30
299| and 29, which is 7 bits to the left of the usual location. This shifted
300| significand must be normalized or smaller. If `zSig' is not normalized,
301| `zExp' must be 0; in that case, the result returned is a subnormal number,
302| and it must not require rounding. In the usual case that `zSig' is
303| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
304| The handling of underflow and overflow follows the IEC/IEEE Standard for
305| Binary Floating-Point Arithmetic.
306*----------------------------------------------------------------------------*/
307
bb98fe42 308static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
309{
310 int8 roundingMode;
311 flag roundNearestEven;
312 int8 roundIncrement, roundBits;
313 flag isTiny;
314
315 roundingMode = STATUS(float_rounding_mode);
316 roundNearestEven = ( roundingMode == float_round_nearest_even );
317 roundIncrement = 0x40;
318 if ( ! roundNearestEven ) {
319 if ( roundingMode == float_round_to_zero ) {
320 roundIncrement = 0;
321 }
322 else {
323 roundIncrement = 0x7F;
324 if ( zSign ) {
325 if ( roundingMode == float_round_up ) roundIncrement = 0;
326 }
327 else {
328 if ( roundingMode == float_round_down ) roundIncrement = 0;
329 }
330 }
331 }
332 roundBits = zSig & 0x7F;
bb98fe42 333 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
334 if ( ( 0xFD < zExp )
335 || ( ( zExp == 0xFD )
bb98fe42 336 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
337 ) {
338 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 339 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
340 }
341 if ( zExp < 0 ) {
e6afc87f
PM
342 if (STATUS(flush_to_zero)) {
343 float_raise(float_flag_output_denormal STATUS_VAR);
344 return packFloat32(zSign, 0, 0);
345 }
158142c2
FB
346 isTiny =
347 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
348 || ( zExp < -1 )
349 || ( zSig + roundIncrement < 0x80000000 );
350 shift32RightJamming( zSig, - zExp, &zSig );
351 zExp = 0;
352 roundBits = zSig & 0x7F;
353 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
354 }
355 }
356 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
357 zSig = ( zSig + roundIncrement )>>7;
358 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
359 if ( zSig == 0 ) zExp = 0;
360 return packFloat32( zSign, zExp, zSig );
361
362}
363
364/*----------------------------------------------------------------------------
365| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
366| and significand `zSig', and returns the proper single-precision floating-
367| point value corresponding to the abstract input. This routine is just like
368| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
369| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
370| floating-point exponent.
371*----------------------------------------------------------------------------*/
372
373static float32
bb98fe42 374 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
375{
376 int8 shiftCount;
377
378 shiftCount = countLeadingZeros32( zSig ) - 1;
379 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
380
381}
382
383/*----------------------------------------------------------------------------
384| Returns the fraction bits of the double-precision floating-point value `a'.
385*----------------------------------------------------------------------------*/
386
bb98fe42 387INLINE uint64_t extractFloat64Frac( float64 a )
158142c2
FB
388{
389
f090c9d4 390 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
391
392}
393
394/*----------------------------------------------------------------------------
395| Returns the exponent bits of the double-precision floating-point value `a'.
396*----------------------------------------------------------------------------*/
397
398INLINE int16 extractFloat64Exp( float64 a )
399{
400
f090c9d4 401 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
402
403}
404
405/*----------------------------------------------------------------------------
406| Returns the sign bit of the double-precision floating-point value `a'.
407*----------------------------------------------------------------------------*/
408
409INLINE flag extractFloat64Sign( float64 a )
410{
411
f090c9d4 412 return float64_val(a)>>63;
158142c2
FB
413
414}
415
37d18660
PM
416/*----------------------------------------------------------------------------
417| If `a' is denormal and we are in flush-to-zero mode then set the
418| input-denormal exception and return zero. Otherwise just return the value.
419*----------------------------------------------------------------------------*/
420static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
421{
422 if (STATUS(flush_inputs_to_zero)) {
423 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
424 float_raise(float_flag_input_denormal STATUS_VAR);
425 return make_float64(float64_val(a) & (1ULL << 63));
426 }
427 }
428 return a;
429}
430
158142c2
FB
431/*----------------------------------------------------------------------------
432| Normalizes the subnormal double-precision floating-point value represented
433| by the denormalized significand `aSig'. The normalized exponent and
434| significand are stored at the locations pointed to by `zExpPtr' and
435| `zSigPtr', respectively.
436*----------------------------------------------------------------------------*/
437
438static void
bb98fe42 439 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
440{
441 int8 shiftCount;
442
443 shiftCount = countLeadingZeros64( aSig ) - 11;
444 *zSigPtr = aSig<<shiftCount;
445 *zExpPtr = 1 - shiftCount;
446
447}
448
449/*----------------------------------------------------------------------------
450| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
451| double-precision floating-point value, returning the result. After being
452| shifted into the proper positions, the three fields are simply added
453| together to form the result. This means that any integer portion of `zSig'
454| will be added into the exponent. Since a properly normalized significand
455| will have an integer portion equal to 1, the `zExp' input should be 1 less
456| than the desired result exponent whenever `zSig' is a complete, normalized
457| significand.
458*----------------------------------------------------------------------------*/
459
bb98fe42 460INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
158142c2
FB
461{
462
f090c9d4 463 return make_float64(
bb98fe42 464 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
465
466}
467
468/*----------------------------------------------------------------------------
469| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
470| and significand `zSig', and returns the proper double-precision floating-
471| point value corresponding to the abstract input. Ordinarily, the abstract
472| value is simply rounded and packed into the double-precision format, with
473| the inexact exception raised if the abstract input cannot be represented
474| exactly. However, if the abstract value is too large, the overflow and
475| inexact exceptions are raised and an infinity or maximal finite value is
476| returned. If the abstract value is too small, the input value is rounded
477| to a subnormal number, and the underflow and inexact exceptions are raised
478| if the abstract input cannot be represented exactly as a subnormal double-
479| precision floating-point number.
480| The input significand `zSig' has its binary point between bits 62
481| and 61, which is 10 bits to the left of the usual location. This shifted
482| significand must be normalized or smaller. If `zSig' is not normalized,
483| `zExp' must be 0; in that case, the result returned is a subnormal number,
484| and it must not require rounding. In the usual case that `zSig' is
485| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
486| The handling of underflow and overflow follows the IEC/IEEE Standard for
487| Binary Floating-Point Arithmetic.
488*----------------------------------------------------------------------------*/
489
bb98fe42 490static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
491{
492 int8 roundingMode;
493 flag roundNearestEven;
494 int16 roundIncrement, roundBits;
495 flag isTiny;
496
497 roundingMode = STATUS(float_rounding_mode);
498 roundNearestEven = ( roundingMode == float_round_nearest_even );
499 roundIncrement = 0x200;
500 if ( ! roundNearestEven ) {
501 if ( roundingMode == float_round_to_zero ) {
502 roundIncrement = 0;
503 }
504 else {
505 roundIncrement = 0x3FF;
506 if ( zSign ) {
507 if ( roundingMode == float_round_up ) roundIncrement = 0;
508 }
509 else {
510 if ( roundingMode == float_round_down ) roundIncrement = 0;
511 }
512 }
513 }
514 roundBits = zSig & 0x3FF;
bb98fe42 515 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
516 if ( ( 0x7FD < zExp )
517 || ( ( zExp == 0x7FD )
bb98fe42 518 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
519 ) {
520 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 521 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
522 }
523 if ( zExp < 0 ) {
e6afc87f
PM
524 if (STATUS(flush_to_zero)) {
525 float_raise(float_flag_output_denormal STATUS_VAR);
526 return packFloat64(zSign, 0, 0);
527 }
158142c2
FB
528 isTiny =
529 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
530 || ( zExp < -1 )
531 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
532 shift64RightJamming( zSig, - zExp, &zSig );
533 zExp = 0;
534 roundBits = zSig & 0x3FF;
535 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
536 }
537 }
538 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
539 zSig = ( zSig + roundIncrement )>>10;
540 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
541 if ( zSig == 0 ) zExp = 0;
542 return packFloat64( zSign, zExp, zSig );
543
544}
545
546/*----------------------------------------------------------------------------
547| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
548| and significand `zSig', and returns the proper double-precision floating-
549| point value corresponding to the abstract input. This routine is just like
550| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
551| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
552| floating-point exponent.
553*----------------------------------------------------------------------------*/
554
555static float64
bb98fe42 556 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
557{
558 int8 shiftCount;
559
560 shiftCount = countLeadingZeros64( zSig ) - 1;
561 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
562
563}
564
158142c2
FB
565/*----------------------------------------------------------------------------
566| Returns the fraction bits of the extended double-precision floating-point
567| value `a'.
568*----------------------------------------------------------------------------*/
569
bb98fe42 570INLINE uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
571{
572
573 return a.low;
574
575}
576
577/*----------------------------------------------------------------------------
578| Returns the exponent bits of the extended double-precision floating-point
579| value `a'.
580*----------------------------------------------------------------------------*/
581
582INLINE int32 extractFloatx80Exp( floatx80 a )
583{
584
585 return a.high & 0x7FFF;
586
587}
588
589/*----------------------------------------------------------------------------
590| Returns the sign bit of the extended double-precision floating-point value
591| `a'.
592*----------------------------------------------------------------------------*/
593
594INLINE flag extractFloatx80Sign( floatx80 a )
595{
596
597 return a.high>>15;
598
599}
600
601/*----------------------------------------------------------------------------
602| Normalizes the subnormal extended double-precision floating-point value
603| represented by the denormalized significand `aSig'. The normalized exponent
604| and significand are stored at the locations pointed to by `zExpPtr' and
605| `zSigPtr', respectively.
606*----------------------------------------------------------------------------*/
607
608static void
bb98fe42 609 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
610{
611 int8 shiftCount;
612
613 shiftCount = countLeadingZeros64( aSig );
614 *zSigPtr = aSig<<shiftCount;
615 *zExpPtr = 1 - shiftCount;
616
617}
618
619/*----------------------------------------------------------------------------
620| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
621| extended double-precision floating-point value, returning the result.
622*----------------------------------------------------------------------------*/
623
bb98fe42 624INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
158142c2
FB
625{
626 floatx80 z;
627
628 z.low = zSig;
bb98fe42 629 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
630 return z;
631
632}
633
634/*----------------------------------------------------------------------------
635| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
636| and extended significand formed by the concatenation of `zSig0' and `zSig1',
637| and returns the proper extended double-precision floating-point value
638| corresponding to the abstract input. Ordinarily, the abstract value is
639| rounded and packed into the extended double-precision format, with the
640| inexact exception raised if the abstract input cannot be represented
641| exactly. However, if the abstract value is too large, the overflow and
642| inexact exceptions are raised and an infinity or maximal finite value is
643| returned. If the abstract value is too small, the input value is rounded to
644| a subnormal number, and the underflow and inexact exceptions are raised if
645| the abstract input cannot be represented exactly as a subnormal extended
646| double-precision floating-point number.
647| If `roundingPrecision' is 32 or 64, the result is rounded to the same
648| number of bits as single or double precision, respectively. Otherwise, the
649| result is rounded to the full precision of the extended double-precision
650| format.
651| The input significand must be normalized or smaller. If the input
652| significand is not normalized, `zExp' must be 0; in that case, the result
653| returned is a subnormal number, and it must not require rounding. The
654| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
655| Floating-Point Arithmetic.
656*----------------------------------------------------------------------------*/
657
658static floatx80
659 roundAndPackFloatx80(
bb98fe42 660 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
661 STATUS_PARAM)
662{
663 int8 roundingMode;
664 flag roundNearestEven, increment, isTiny;
665 int64 roundIncrement, roundMask, roundBits;
666
667 roundingMode = STATUS(float_rounding_mode);
668 roundNearestEven = ( roundingMode == float_round_nearest_even );
669 if ( roundingPrecision == 80 ) goto precision80;
670 if ( roundingPrecision == 64 ) {
671 roundIncrement = LIT64( 0x0000000000000400 );
672 roundMask = LIT64( 0x00000000000007FF );
673 }
674 else if ( roundingPrecision == 32 ) {
675 roundIncrement = LIT64( 0x0000008000000000 );
676 roundMask = LIT64( 0x000000FFFFFFFFFF );
677 }
678 else {
679 goto precision80;
680 }
681 zSig0 |= ( zSig1 != 0 );
682 if ( ! roundNearestEven ) {
683 if ( roundingMode == float_round_to_zero ) {
684 roundIncrement = 0;
685 }
686 else {
687 roundIncrement = roundMask;
688 if ( zSign ) {
689 if ( roundingMode == float_round_up ) roundIncrement = 0;
690 }
691 else {
692 if ( roundingMode == float_round_down ) roundIncrement = 0;
693 }
694 }
695 }
696 roundBits = zSig0 & roundMask;
bb98fe42 697 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
698 if ( ( 0x7FFE < zExp )
699 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
700 ) {
701 goto overflow;
702 }
703 if ( zExp <= 0 ) {
e6afc87f
PM
704 if (STATUS(flush_to_zero)) {
705 float_raise(float_flag_output_denormal STATUS_VAR);
706 return packFloatx80(zSign, 0, 0);
707 }
158142c2
FB
708 isTiny =
709 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
710 || ( zExp < 0 )
711 || ( zSig0 <= zSig0 + roundIncrement );
712 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
713 zExp = 0;
714 roundBits = zSig0 & roundMask;
715 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
716 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
717 zSig0 += roundIncrement;
bb98fe42 718 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
719 roundIncrement = roundMask + 1;
720 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
721 roundMask |= roundIncrement;
722 }
723 zSig0 &= ~ roundMask;
724 return packFloatx80( zSign, zExp, zSig0 );
725 }
726 }
727 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
728 zSig0 += roundIncrement;
729 if ( zSig0 < roundIncrement ) {
730 ++zExp;
731 zSig0 = LIT64( 0x8000000000000000 );
732 }
733 roundIncrement = roundMask + 1;
734 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
735 roundMask |= roundIncrement;
736 }
737 zSig0 &= ~ roundMask;
738 if ( zSig0 == 0 ) zExp = 0;
739 return packFloatx80( zSign, zExp, zSig0 );
740 precision80:
bb98fe42 741 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
742 if ( ! roundNearestEven ) {
743 if ( roundingMode == float_round_to_zero ) {
744 increment = 0;
745 }
746 else {
747 if ( zSign ) {
748 increment = ( roundingMode == float_round_down ) && zSig1;
749 }
750 else {
751 increment = ( roundingMode == float_round_up ) && zSig1;
752 }
753 }
754 }
bb98fe42 755 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
756 if ( ( 0x7FFE < zExp )
757 || ( ( zExp == 0x7FFE )
758 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
759 && increment
760 )
761 ) {
762 roundMask = 0;
763 overflow:
764 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
765 if ( ( roundingMode == float_round_to_zero )
766 || ( zSign && ( roundingMode == float_round_up ) )
767 || ( ! zSign && ( roundingMode == float_round_down ) )
768 ) {
769 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
770 }
771 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
772 }
773 if ( zExp <= 0 ) {
774 isTiny =
775 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
776 || ( zExp < 0 )
777 || ! increment
778 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
779 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
780 zExp = 0;
781 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
782 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
783 if ( roundNearestEven ) {
bb98fe42 784 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
785 }
786 else {
787 if ( zSign ) {
788 increment = ( roundingMode == float_round_down ) && zSig1;
789 }
790 else {
791 increment = ( roundingMode == float_round_up ) && zSig1;
792 }
793 }
794 if ( increment ) {
795 ++zSig0;
796 zSig0 &=
bb98fe42
AF
797 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
798 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
799 }
800 return packFloatx80( zSign, zExp, zSig0 );
801 }
802 }
803 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
804 if ( increment ) {
805 ++zSig0;
806 if ( zSig0 == 0 ) {
807 ++zExp;
808 zSig0 = LIT64( 0x8000000000000000 );
809 }
810 else {
bb98fe42 811 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
812 }
813 }
814 else {
815 if ( zSig0 == 0 ) zExp = 0;
816 }
817 return packFloatx80( zSign, zExp, zSig0 );
818
819}
820
821/*----------------------------------------------------------------------------
822| Takes an abstract floating-point value having sign `zSign', exponent
823| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
824| and returns the proper extended double-precision floating-point value
825| corresponding to the abstract input. This routine is just like
826| `roundAndPackFloatx80' except that the input significand does not have to be
827| normalized.
828*----------------------------------------------------------------------------*/
829
830static floatx80
831 normalizeRoundAndPackFloatx80(
bb98fe42 832 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
833 STATUS_PARAM)
834{
835 int8 shiftCount;
836
837 if ( zSig0 == 0 ) {
838 zSig0 = zSig1;
839 zSig1 = 0;
840 zExp -= 64;
841 }
842 shiftCount = countLeadingZeros64( zSig0 );
843 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
844 zExp -= shiftCount;
845 return
846 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
847
848}
849
158142c2
FB
850/*----------------------------------------------------------------------------
851| Returns the least-significant 64 fraction bits of the quadruple-precision
852| floating-point value `a'.
853*----------------------------------------------------------------------------*/
854
bb98fe42 855INLINE uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
856{
857
858 return a.low;
859
860}
861
862/*----------------------------------------------------------------------------
863| Returns the most-significant 48 fraction bits of the quadruple-precision
864| floating-point value `a'.
865*----------------------------------------------------------------------------*/
866
bb98fe42 867INLINE uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
868{
869
870 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
871
872}
873
874/*----------------------------------------------------------------------------
875| Returns the exponent bits of the quadruple-precision floating-point value
876| `a'.
877*----------------------------------------------------------------------------*/
878
879INLINE int32 extractFloat128Exp( float128 a )
880{
881
882 return ( a.high>>48 ) & 0x7FFF;
883
884}
885
886/*----------------------------------------------------------------------------
887| Returns the sign bit of the quadruple-precision floating-point value `a'.
888*----------------------------------------------------------------------------*/
889
890INLINE flag extractFloat128Sign( float128 a )
891{
892
893 return a.high>>63;
894
895}
896
897/*----------------------------------------------------------------------------
898| Normalizes the subnormal quadruple-precision floating-point value
899| represented by the denormalized significand formed by the concatenation of
900| `aSig0' and `aSig1'. The normalized exponent is stored at the location
901| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
902| significand are stored at the location pointed to by `zSig0Ptr', and the
903| least significant 64 bits of the normalized significand are stored at the
904| location pointed to by `zSig1Ptr'.
905*----------------------------------------------------------------------------*/
906
907static void
908 normalizeFloat128Subnormal(
bb98fe42
AF
909 uint64_t aSig0,
910 uint64_t aSig1,
158142c2 911 int32 *zExpPtr,
bb98fe42
AF
912 uint64_t *zSig0Ptr,
913 uint64_t *zSig1Ptr
158142c2
FB
914 )
915{
916 int8 shiftCount;
917
918 if ( aSig0 == 0 ) {
919 shiftCount = countLeadingZeros64( aSig1 ) - 15;
920 if ( shiftCount < 0 ) {
921 *zSig0Ptr = aSig1>>( - shiftCount );
922 *zSig1Ptr = aSig1<<( shiftCount & 63 );
923 }
924 else {
925 *zSig0Ptr = aSig1<<shiftCount;
926 *zSig1Ptr = 0;
927 }
928 *zExpPtr = - shiftCount - 63;
929 }
930 else {
931 shiftCount = countLeadingZeros64( aSig0 ) - 15;
932 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
933 *zExpPtr = 1 - shiftCount;
934 }
935
936}
937
938/*----------------------------------------------------------------------------
939| Packs the sign `zSign', the exponent `zExp', and the significand formed
940| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
941| floating-point value, returning the result. After being shifted into the
942| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
943| added together to form the most significant 32 bits of the result. This
944| means that any integer portion of `zSig0' will be added into the exponent.
945| Since a properly normalized significand will have an integer portion equal
946| to 1, the `zExp' input should be 1 less than the desired result exponent
947| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
948| significand.
949*----------------------------------------------------------------------------*/
950
951INLINE float128
bb98fe42 952 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
953{
954 float128 z;
955
956 z.low = zSig1;
bb98fe42 957 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
958 return z;
959
960}
961
962/*----------------------------------------------------------------------------
963| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
964| and extended significand formed by the concatenation of `zSig0', `zSig1',
965| and `zSig2', and returns the proper quadruple-precision floating-point value
966| corresponding to the abstract input. Ordinarily, the abstract value is
967| simply rounded and packed into the quadruple-precision format, with the
968| inexact exception raised if the abstract input cannot be represented
969| exactly. However, if the abstract value is too large, the overflow and
970| inexact exceptions are raised and an infinity or maximal finite value is
971| returned. If the abstract value is too small, the input value is rounded to
972| a subnormal number, and the underflow and inexact exceptions are raised if
973| the abstract input cannot be represented exactly as a subnormal quadruple-
974| precision floating-point number.
975| The input significand must be normalized or smaller. If the input
976| significand is not normalized, `zExp' must be 0; in that case, the result
977| returned is a subnormal number, and it must not require rounding. In the
978| usual case that the input significand is normalized, `zExp' must be 1 less
979| than the ``true'' floating-point exponent. The handling of underflow and
980| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
981*----------------------------------------------------------------------------*/
982
983static float128
984 roundAndPackFloat128(
bb98fe42 985 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
158142c2
FB
986{
987 int8 roundingMode;
988 flag roundNearestEven, increment, isTiny;
989
990 roundingMode = STATUS(float_rounding_mode);
991 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 992 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
993 if ( ! roundNearestEven ) {
994 if ( roundingMode == float_round_to_zero ) {
995 increment = 0;
996 }
997 else {
998 if ( zSign ) {
999 increment = ( roundingMode == float_round_down ) && zSig2;
1000 }
1001 else {
1002 increment = ( roundingMode == float_round_up ) && zSig2;
1003 }
1004 }
1005 }
bb98fe42 1006 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1007 if ( ( 0x7FFD < zExp )
1008 || ( ( zExp == 0x7FFD )
1009 && eq128(
1010 LIT64( 0x0001FFFFFFFFFFFF ),
1011 LIT64( 0xFFFFFFFFFFFFFFFF ),
1012 zSig0,
1013 zSig1
1014 )
1015 && increment
1016 )
1017 ) {
1018 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1019 if ( ( roundingMode == float_round_to_zero )
1020 || ( zSign && ( roundingMode == float_round_up ) )
1021 || ( ! zSign && ( roundingMode == float_round_down ) )
1022 ) {
1023 return
1024 packFloat128(
1025 zSign,
1026 0x7FFE,
1027 LIT64( 0x0000FFFFFFFFFFFF ),
1028 LIT64( 0xFFFFFFFFFFFFFFFF )
1029 );
1030 }
1031 return packFloat128( zSign, 0x7FFF, 0, 0 );
1032 }
1033 if ( zExp < 0 ) {
e6afc87f
PM
1034 if (STATUS(flush_to_zero)) {
1035 float_raise(float_flag_output_denormal STATUS_VAR);
1036 return packFloat128(zSign, 0, 0, 0);
1037 }
158142c2
FB
1038 isTiny =
1039 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1040 || ( zExp < -1 )
1041 || ! increment
1042 || lt128(
1043 zSig0,
1044 zSig1,
1045 LIT64( 0x0001FFFFFFFFFFFF ),
1046 LIT64( 0xFFFFFFFFFFFFFFFF )
1047 );
1048 shift128ExtraRightJamming(
1049 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1050 zExp = 0;
1051 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1052 if ( roundNearestEven ) {
bb98fe42 1053 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
1054 }
1055 else {
1056 if ( zSign ) {
1057 increment = ( roundingMode == float_round_down ) && zSig2;
1058 }
1059 else {
1060 increment = ( roundingMode == float_round_up ) && zSig2;
1061 }
1062 }
1063 }
1064 }
1065 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1066 if ( increment ) {
1067 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1068 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1069 }
1070 else {
1071 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1072 }
1073 return packFloat128( zSign, zExp, zSig0, zSig1 );
1074
1075}
1076
1077/*----------------------------------------------------------------------------
1078| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1079| and significand formed by the concatenation of `zSig0' and `zSig1', and
1080| returns the proper quadruple-precision floating-point value corresponding
1081| to the abstract input. This routine is just like `roundAndPackFloat128'
1082| except that the input significand has fewer bits and does not have to be
1083| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1084| point exponent.
1085*----------------------------------------------------------------------------*/
1086
1087static float128
1088 normalizeRoundAndPackFloat128(
bb98fe42 1089 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
158142c2
FB
1090{
1091 int8 shiftCount;
bb98fe42 1092 uint64_t zSig2;
158142c2
FB
1093
1094 if ( zSig0 == 0 ) {
1095 zSig0 = zSig1;
1096 zSig1 = 0;
1097 zExp -= 64;
1098 }
1099 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1100 if ( 0 <= shiftCount ) {
1101 zSig2 = 0;
1102 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1103 }
1104 else {
1105 shift128ExtraRightJamming(
1106 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1107 }
1108 zExp -= shiftCount;
1109 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1110
1111}
1112
158142c2
FB
1113/*----------------------------------------------------------------------------
1114| Returns the result of converting the 32-bit two's complement integer `a'
1115| to the single-precision floating-point format. The conversion is performed
1116| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1117*----------------------------------------------------------------------------*/
1118
1119float32 int32_to_float32( int32 a STATUS_PARAM )
1120{
1121 flag zSign;
1122
f090c9d4 1123 if ( a == 0 ) return float32_zero;
bb98fe42 1124 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2
FB
1125 zSign = ( a < 0 );
1126 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1127
1128}
1129
1130/*----------------------------------------------------------------------------
1131| Returns the result of converting the 32-bit two's complement integer `a'
1132| to the double-precision floating-point format. The conversion is performed
1133| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1134*----------------------------------------------------------------------------*/
1135
1136float64 int32_to_float64( int32 a STATUS_PARAM )
1137{
1138 flag zSign;
1139 uint32 absA;
1140 int8 shiftCount;
bb98fe42 1141 uint64_t zSig;
158142c2 1142
f090c9d4 1143 if ( a == 0 ) return float64_zero;
158142c2
FB
1144 zSign = ( a < 0 );
1145 absA = zSign ? - a : a;
1146 shiftCount = countLeadingZeros32( absA ) + 21;
1147 zSig = absA;
1148 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1149
1150}
1151
158142c2
FB
1152/*----------------------------------------------------------------------------
1153| Returns the result of converting the 32-bit two's complement integer `a'
1154| to the extended double-precision floating-point format. The conversion
1155| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156| Arithmetic.
1157*----------------------------------------------------------------------------*/
1158
1159floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160{
1161 flag zSign;
1162 uint32 absA;
1163 int8 shiftCount;
bb98fe42 1164 uint64_t zSig;
158142c2
FB
1165
1166 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167 zSign = ( a < 0 );
1168 absA = zSign ? - a : a;
1169 shiftCount = countLeadingZeros32( absA ) + 32;
1170 zSig = absA;
1171 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172
1173}
1174
158142c2
FB
1175/*----------------------------------------------------------------------------
1176| Returns the result of converting the 32-bit two's complement integer `a' to
1177| the quadruple-precision floating-point format. The conversion is performed
1178| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1179*----------------------------------------------------------------------------*/
1180
1181float128 int32_to_float128( int32 a STATUS_PARAM )
1182{
1183 flag zSign;
1184 uint32 absA;
1185 int8 shiftCount;
bb98fe42 1186 uint64_t zSig0;
158142c2
FB
1187
1188 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1189 zSign = ( a < 0 );
1190 absA = zSign ? - a : a;
1191 shiftCount = countLeadingZeros32( absA ) + 17;
1192 zSig0 = absA;
1193 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1194
1195}
1196
158142c2
FB
1197/*----------------------------------------------------------------------------
1198| Returns the result of converting the 64-bit two's complement integer `a'
1199| to the single-precision floating-point format. The conversion is performed
1200| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1201*----------------------------------------------------------------------------*/
1202
1203float32 int64_to_float32( int64 a STATUS_PARAM )
1204{
1205 flag zSign;
1206 uint64 absA;
1207 int8 shiftCount;
1208
f090c9d4 1209 if ( a == 0 ) return float32_zero;
158142c2
FB
1210 zSign = ( a < 0 );
1211 absA = zSign ? - a : a;
1212 shiftCount = countLeadingZeros64( absA ) - 40;
1213 if ( 0 <= shiftCount ) {
1214 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1215 }
1216 else {
1217 shiftCount += 7;
1218 if ( shiftCount < 0 ) {
1219 shift64RightJamming( absA, - shiftCount, &absA );
1220 }
1221 else {
1222 absA <<= shiftCount;
1223 }
1224 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1225 }
1226
1227}
1228
3430b0be 1229float32 uint64_to_float32( uint64 a STATUS_PARAM )
75d62a58
JM
1230{
1231 int8 shiftCount;
1232
f090c9d4 1233 if ( a == 0 ) return float32_zero;
75d62a58
JM
1234 shiftCount = countLeadingZeros64( a ) - 40;
1235 if ( 0 <= shiftCount ) {
1236 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1237 }
1238 else {
1239 shiftCount += 7;
1240 if ( shiftCount < 0 ) {
1241 shift64RightJamming( a, - shiftCount, &a );
1242 }
1243 else {
1244 a <<= shiftCount;
1245 }
1246 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1247 }
1248}
1249
158142c2
FB
1250/*----------------------------------------------------------------------------
1251| Returns the result of converting the 64-bit two's complement integer `a'
1252| to the double-precision floating-point format. The conversion is performed
1253| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1254*----------------------------------------------------------------------------*/
1255
1256float64 int64_to_float64( int64 a STATUS_PARAM )
1257{
1258 flag zSign;
1259
f090c9d4 1260 if ( a == 0 ) return float64_zero;
bb98fe42 1261 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1262 return packFloat64( 1, 0x43E, 0 );
1263 }
1264 zSign = ( a < 0 );
1265 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1266
1267}
1268
75d62a58
JM
1269float64 uint64_to_float64( uint64 a STATUS_PARAM )
1270{
f090c9d4 1271 if ( a == 0 ) return float64_zero;
75d62a58
JM
1272 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1273
1274}
1275
158142c2
FB
1276/*----------------------------------------------------------------------------
1277| Returns the result of converting the 64-bit two's complement integer `a'
1278| to the extended double-precision floating-point format. The conversion
1279| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1280| Arithmetic.
1281*----------------------------------------------------------------------------*/
1282
1283floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1284{
1285 flag zSign;
1286 uint64 absA;
1287 int8 shiftCount;
1288
1289 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1290 zSign = ( a < 0 );
1291 absA = zSign ? - a : a;
1292 shiftCount = countLeadingZeros64( absA );
1293 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1294
1295}
1296
158142c2
FB
1297/*----------------------------------------------------------------------------
1298| Returns the result of converting the 64-bit two's complement integer `a' to
1299| the quadruple-precision floating-point format. The conversion is performed
1300| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1301*----------------------------------------------------------------------------*/
1302
1303float128 int64_to_float128( int64 a STATUS_PARAM )
1304{
1305 flag zSign;
1306 uint64 absA;
1307 int8 shiftCount;
1308 int32 zExp;
bb98fe42 1309 uint64_t zSig0, zSig1;
158142c2
FB
1310
1311 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1312 zSign = ( a < 0 );
1313 absA = zSign ? - a : a;
1314 shiftCount = countLeadingZeros64( absA ) + 49;
1315 zExp = 0x406E - shiftCount;
1316 if ( 64 <= shiftCount ) {
1317 zSig1 = 0;
1318 zSig0 = absA;
1319 shiftCount -= 64;
1320 }
1321 else {
1322 zSig1 = absA;
1323 zSig0 = 0;
1324 }
1325 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1326 return packFloat128( zSign, zExp, zSig0, zSig1 );
1327
1328}
1329
158142c2
FB
1330/*----------------------------------------------------------------------------
1331| Returns the result of converting the single-precision floating-point value
1332| `a' to the 32-bit two's complement integer format. The conversion is
1333| performed according to the IEC/IEEE Standard for Binary Floating-Point
1334| Arithmetic---which means in particular that the conversion is rounded
1335| according to the current rounding mode. If `a' is a NaN, the largest
1336| positive integer is returned. Otherwise, if the conversion overflows, the
1337| largest integer with the same sign as `a' is returned.
1338*----------------------------------------------------------------------------*/
1339
1340int32 float32_to_int32( float32 a STATUS_PARAM )
1341{
1342 flag aSign;
1343 int16 aExp, shiftCount;
bb98fe42
AF
1344 uint32_t aSig;
1345 uint64_t aSig64;
158142c2 1346
37d18660 1347 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1348 aSig = extractFloat32Frac( a );
1349 aExp = extractFloat32Exp( a );
1350 aSign = extractFloat32Sign( a );
1351 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1352 if ( aExp ) aSig |= 0x00800000;
1353 shiftCount = 0xAF - aExp;
1354 aSig64 = aSig;
1355 aSig64 <<= 32;
1356 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1357 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1358
1359}
1360
1361/*----------------------------------------------------------------------------
1362| Returns the result of converting the single-precision floating-point value
1363| `a' to the 32-bit two's complement integer format. The conversion is
1364| performed according to the IEC/IEEE Standard for Binary Floating-Point
1365| Arithmetic, except that the conversion is always rounded toward zero.
1366| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1367| the conversion overflows, the largest integer with the same sign as `a' is
1368| returned.
1369*----------------------------------------------------------------------------*/
1370
1371int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1372{
1373 flag aSign;
1374 int16 aExp, shiftCount;
bb98fe42 1375 uint32_t aSig;
158142c2 1376 int32 z;
37d18660 1377 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1378
1379 aSig = extractFloat32Frac( a );
1380 aExp = extractFloat32Exp( a );
1381 aSign = extractFloat32Sign( a );
1382 shiftCount = aExp - 0x9E;
1383 if ( 0 <= shiftCount ) {
f090c9d4 1384 if ( float32_val(a) != 0xCF000000 ) {
158142c2
FB
1385 float_raise( float_flag_invalid STATUS_VAR);
1386 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1387 }
bb98fe42 1388 return (int32_t) 0x80000000;
158142c2
FB
1389 }
1390 else if ( aExp <= 0x7E ) {
1391 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1392 return 0;
1393 }
1394 aSig = ( aSig | 0x00800000 )<<8;
1395 z = aSig>>( - shiftCount );
bb98fe42 1396 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
158142c2
FB
1397 STATUS(float_exception_flags) |= float_flag_inexact;
1398 }
1399 if ( aSign ) z = - z;
1400 return z;
1401
1402}
1403
cbcef455
PM
1404/*----------------------------------------------------------------------------
1405| Returns the result of converting the single-precision floating-point value
1406| `a' to the 16-bit two's complement integer format. The conversion is
1407| performed according to the IEC/IEEE Standard for Binary Floating-Point
1408| Arithmetic, except that the conversion is always rounded toward zero.
1409| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1410| the conversion overflows, the largest integer with the same sign as `a' is
1411| returned.
1412*----------------------------------------------------------------------------*/
1413
1414int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1415{
1416 flag aSign;
1417 int16 aExp, shiftCount;
bb98fe42 1418 uint32_t aSig;
cbcef455
PM
1419 int32 z;
1420
1421 aSig = extractFloat32Frac( a );
1422 aExp = extractFloat32Exp( a );
1423 aSign = extractFloat32Sign( a );
1424 shiftCount = aExp - 0x8E;
1425 if ( 0 <= shiftCount ) {
1426 if ( float32_val(a) != 0xC7000000 ) {
1427 float_raise( float_flag_invalid STATUS_VAR);
1428 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1429 return 0x7FFF;
1430 }
1431 }
bb98fe42 1432 return (int32_t) 0xffff8000;
cbcef455
PM
1433 }
1434 else if ( aExp <= 0x7E ) {
1435 if ( aExp | aSig ) {
1436 STATUS(float_exception_flags) |= float_flag_inexact;
1437 }
1438 return 0;
1439 }
1440 shiftCount -= 0x10;
1441 aSig = ( aSig | 0x00800000 )<<8;
1442 z = aSig>>( - shiftCount );
bb98fe42 1443 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
cbcef455
PM
1444 STATUS(float_exception_flags) |= float_flag_inexact;
1445 }
1446 if ( aSign ) {
1447 z = - z;
1448 }
1449 return z;
1450
1451}
1452
158142c2
FB
1453/*----------------------------------------------------------------------------
1454| Returns the result of converting the single-precision floating-point value
1455| `a' to the 64-bit two's complement integer format. The conversion is
1456| performed according to the IEC/IEEE Standard for Binary Floating-Point
1457| Arithmetic---which means in particular that the conversion is rounded
1458| according to the current rounding mode. If `a' is a NaN, the largest
1459| positive integer is returned. Otherwise, if the conversion overflows, the
1460| largest integer with the same sign as `a' is returned.
1461*----------------------------------------------------------------------------*/
1462
1463int64 float32_to_int64( float32 a STATUS_PARAM )
1464{
1465 flag aSign;
1466 int16 aExp, shiftCount;
bb98fe42
AF
1467 uint32_t aSig;
1468 uint64_t aSig64, aSigExtra;
37d18660 1469 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1470
1471 aSig = extractFloat32Frac( a );
1472 aExp = extractFloat32Exp( a );
1473 aSign = extractFloat32Sign( a );
1474 shiftCount = 0xBE - aExp;
1475 if ( shiftCount < 0 ) {
1476 float_raise( float_flag_invalid STATUS_VAR);
1477 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1478 return LIT64( 0x7FFFFFFFFFFFFFFF );
1479 }
bb98fe42 1480 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1481 }
1482 if ( aExp ) aSig |= 0x00800000;
1483 aSig64 = aSig;
1484 aSig64 <<= 40;
1485 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1486 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1487
1488}
1489
1490/*----------------------------------------------------------------------------
1491| Returns the result of converting the single-precision floating-point value
1492| `a' to the 64-bit two's complement integer format. The conversion is
1493| performed according to the IEC/IEEE Standard for Binary Floating-Point
1494| Arithmetic, except that the conversion is always rounded toward zero. If
1495| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1496| conversion overflows, the largest integer with the same sign as `a' is
1497| returned.
1498*----------------------------------------------------------------------------*/
1499
1500int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1501{
1502 flag aSign;
1503 int16 aExp, shiftCount;
bb98fe42
AF
1504 uint32_t aSig;
1505 uint64_t aSig64;
158142c2 1506 int64 z;
37d18660 1507 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1508
1509 aSig = extractFloat32Frac( a );
1510 aExp = extractFloat32Exp( a );
1511 aSign = extractFloat32Sign( a );
1512 shiftCount = aExp - 0xBE;
1513 if ( 0 <= shiftCount ) {
f090c9d4 1514 if ( float32_val(a) != 0xDF000000 ) {
158142c2
FB
1515 float_raise( float_flag_invalid STATUS_VAR);
1516 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1517 return LIT64( 0x7FFFFFFFFFFFFFFF );
1518 }
1519 }
bb98fe42 1520 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1521 }
1522 else if ( aExp <= 0x7E ) {
1523 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1524 return 0;
1525 }
1526 aSig64 = aSig | 0x00800000;
1527 aSig64 <<= 40;
1528 z = aSig64>>( - shiftCount );
bb98fe42 1529 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
158142c2
FB
1530 STATUS(float_exception_flags) |= float_flag_inexact;
1531 }
1532 if ( aSign ) z = - z;
1533 return z;
1534
1535}
1536
1537/*----------------------------------------------------------------------------
1538| Returns the result of converting the single-precision floating-point value
1539| `a' to the double-precision floating-point format. The conversion is
1540| performed according to the IEC/IEEE Standard for Binary Floating-Point
1541| Arithmetic.
1542*----------------------------------------------------------------------------*/
1543
1544float64 float32_to_float64( float32 a STATUS_PARAM )
1545{
1546 flag aSign;
1547 int16 aExp;
bb98fe42 1548 uint32_t aSig;
37d18660 1549 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1550
1551 aSig = extractFloat32Frac( a );
1552 aExp = extractFloat32Exp( a );
1553 aSign = extractFloat32Sign( a );
1554 if ( aExp == 0xFF ) {
bcd4d9af 1555 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1556 return packFloat64( aSign, 0x7FF, 0 );
1557 }
1558 if ( aExp == 0 ) {
1559 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1560 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1561 --aExp;
1562 }
bb98fe42 1563 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1564
1565}
1566
158142c2
FB
1567/*----------------------------------------------------------------------------
1568| Returns the result of converting the single-precision floating-point value
1569| `a' to the extended double-precision floating-point format. The conversion
1570| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1571| Arithmetic.
1572*----------------------------------------------------------------------------*/
1573
1574floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1575{
1576 flag aSign;
1577 int16 aExp;
bb98fe42 1578 uint32_t aSig;
158142c2 1579
37d18660 1580 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1581 aSig = extractFloat32Frac( a );
1582 aExp = extractFloat32Exp( a );
1583 aSign = extractFloat32Sign( a );
1584 if ( aExp == 0xFF ) {
bcd4d9af 1585 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1586 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1587 }
1588 if ( aExp == 0 ) {
1589 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1590 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1591 }
1592 aSig |= 0x00800000;
bb98fe42 1593 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1594
1595}
1596
158142c2
FB
1597/*----------------------------------------------------------------------------
1598| Returns the result of converting the single-precision floating-point value
1599| `a' to the double-precision floating-point format. The conversion is
1600| performed according to the IEC/IEEE Standard for Binary Floating-Point
1601| Arithmetic.
1602*----------------------------------------------------------------------------*/
1603
1604float128 float32_to_float128( float32 a STATUS_PARAM )
1605{
1606 flag aSign;
1607 int16 aExp;
bb98fe42 1608 uint32_t aSig;
158142c2 1609
37d18660 1610 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1611 aSig = extractFloat32Frac( a );
1612 aExp = extractFloat32Exp( a );
1613 aSign = extractFloat32Sign( a );
1614 if ( aExp == 0xFF ) {
bcd4d9af 1615 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1616 return packFloat128( aSign, 0x7FFF, 0, 0 );
1617 }
1618 if ( aExp == 0 ) {
1619 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1620 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1621 --aExp;
1622 }
bb98fe42 1623 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1624
1625}
1626
158142c2
FB
1627/*----------------------------------------------------------------------------
1628| Rounds the single-precision floating-point value `a' to an integer, and
1629| returns the result as a single-precision floating-point value. The
1630| operation is performed according to the IEC/IEEE Standard for Binary
1631| Floating-Point Arithmetic.
1632*----------------------------------------------------------------------------*/
1633
1634float32 float32_round_to_int( float32 a STATUS_PARAM)
1635{
1636 flag aSign;
1637 int16 aExp;
bb98fe42 1638 uint32_t lastBitMask, roundBitsMask;
158142c2 1639 int8 roundingMode;
bb98fe42 1640 uint32_t z;
37d18660 1641 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1642
1643 aExp = extractFloat32Exp( a );
1644 if ( 0x96 <= aExp ) {
1645 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1646 return propagateFloat32NaN( a, a STATUS_VAR );
1647 }
1648 return a;
1649 }
1650 if ( aExp <= 0x7E ) {
bb98fe42 1651 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
158142c2
FB
1652 STATUS(float_exception_flags) |= float_flag_inexact;
1653 aSign = extractFloat32Sign( a );
1654 switch ( STATUS(float_rounding_mode) ) {
1655 case float_round_nearest_even:
1656 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1657 return packFloat32( aSign, 0x7F, 0 );
1658 }
1659 break;
1660 case float_round_down:
f090c9d4 1661 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1662 case float_round_up:
f090c9d4 1663 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1664 }
1665 return packFloat32( aSign, 0, 0 );
1666 }
1667 lastBitMask = 1;
1668 lastBitMask <<= 0x96 - aExp;
1669 roundBitsMask = lastBitMask - 1;
f090c9d4 1670 z = float32_val(a);
158142c2
FB
1671 roundingMode = STATUS(float_rounding_mode);
1672 if ( roundingMode == float_round_nearest_even ) {
1673 z += lastBitMask>>1;
1674 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1675 }
1676 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 1677 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
1678 z += roundBitsMask;
1679 }
1680 }
1681 z &= ~ roundBitsMask;
f090c9d4
PB
1682 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1683 return make_float32(z);
158142c2
FB
1684
1685}
1686
1687/*----------------------------------------------------------------------------
1688| Returns the result of adding the absolute values of the single-precision
1689| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1690| before being returned. `zSign' is ignored if the result is a NaN.
1691| The addition is performed according to the IEC/IEEE Standard for Binary
1692| Floating-Point Arithmetic.
1693*----------------------------------------------------------------------------*/
1694
1695static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1696{
1697 int16 aExp, bExp, zExp;
bb98fe42 1698 uint32_t aSig, bSig, zSig;
158142c2
FB
1699 int16 expDiff;
1700
1701 aSig = extractFloat32Frac( a );
1702 aExp = extractFloat32Exp( a );
1703 bSig = extractFloat32Frac( b );
1704 bExp = extractFloat32Exp( b );
1705 expDiff = aExp - bExp;
1706 aSig <<= 6;
1707 bSig <<= 6;
1708 if ( 0 < expDiff ) {
1709 if ( aExp == 0xFF ) {
1710 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1711 return a;
1712 }
1713 if ( bExp == 0 ) {
1714 --expDiff;
1715 }
1716 else {
1717 bSig |= 0x20000000;
1718 }
1719 shift32RightJamming( bSig, expDiff, &bSig );
1720 zExp = aExp;
1721 }
1722 else if ( expDiff < 0 ) {
1723 if ( bExp == 0xFF ) {
1724 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1725 return packFloat32( zSign, 0xFF, 0 );
1726 }
1727 if ( aExp == 0 ) {
1728 ++expDiff;
1729 }
1730 else {
1731 aSig |= 0x20000000;
1732 }
1733 shift32RightJamming( aSig, - expDiff, &aSig );
1734 zExp = bExp;
1735 }
1736 else {
1737 if ( aExp == 0xFF ) {
1738 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1739 return a;
1740 }
fe76d976 1741 if ( aExp == 0 ) {
e6afc87f
PM
1742 if (STATUS(flush_to_zero)) {
1743 if (aSig | bSig) {
1744 float_raise(float_flag_output_denormal STATUS_VAR);
1745 }
1746 return packFloat32(zSign, 0, 0);
1747 }
fe76d976
PB
1748 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1749 }
158142c2
FB
1750 zSig = 0x40000000 + aSig + bSig;
1751 zExp = aExp;
1752 goto roundAndPack;
1753 }
1754 aSig |= 0x20000000;
1755 zSig = ( aSig + bSig )<<1;
1756 --zExp;
bb98fe42 1757 if ( (int32_t) zSig < 0 ) {
158142c2
FB
1758 zSig = aSig + bSig;
1759 ++zExp;
1760 }
1761 roundAndPack:
1762 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1763
1764}
1765
1766/*----------------------------------------------------------------------------
1767| Returns the result of subtracting the absolute values of the single-
1768| precision floating-point values `a' and `b'. If `zSign' is 1, the
1769| difference is negated before being returned. `zSign' is ignored if the
1770| result is a NaN. The subtraction is performed according to the IEC/IEEE
1771| Standard for Binary Floating-Point Arithmetic.
1772*----------------------------------------------------------------------------*/
1773
1774static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1775{
1776 int16 aExp, bExp, zExp;
bb98fe42 1777 uint32_t aSig, bSig, zSig;
158142c2
FB
1778 int16 expDiff;
1779
1780 aSig = extractFloat32Frac( a );
1781 aExp = extractFloat32Exp( a );
1782 bSig = extractFloat32Frac( b );
1783 bExp = extractFloat32Exp( b );
1784 expDiff = aExp - bExp;
1785 aSig <<= 7;
1786 bSig <<= 7;
1787 if ( 0 < expDiff ) goto aExpBigger;
1788 if ( expDiff < 0 ) goto bExpBigger;
1789 if ( aExp == 0xFF ) {
1790 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1791 float_raise( float_flag_invalid STATUS_VAR);
1792 return float32_default_nan;
1793 }
1794 if ( aExp == 0 ) {
1795 aExp = 1;
1796 bExp = 1;
1797 }
1798 if ( bSig < aSig ) goto aBigger;
1799 if ( aSig < bSig ) goto bBigger;
1800 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1801 bExpBigger:
1802 if ( bExp == 0xFF ) {
1803 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1804 return packFloat32( zSign ^ 1, 0xFF, 0 );
1805 }
1806 if ( aExp == 0 ) {
1807 ++expDiff;
1808 }
1809 else {
1810 aSig |= 0x40000000;
1811 }
1812 shift32RightJamming( aSig, - expDiff, &aSig );
1813 bSig |= 0x40000000;
1814 bBigger:
1815 zSig = bSig - aSig;
1816 zExp = bExp;
1817 zSign ^= 1;
1818 goto normalizeRoundAndPack;
1819 aExpBigger:
1820 if ( aExp == 0xFF ) {
1821 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1822 return a;
1823 }
1824 if ( bExp == 0 ) {
1825 --expDiff;
1826 }
1827 else {
1828 bSig |= 0x40000000;
1829 }
1830 shift32RightJamming( bSig, expDiff, &bSig );
1831 aSig |= 0x40000000;
1832 aBigger:
1833 zSig = aSig - bSig;
1834 zExp = aExp;
1835 normalizeRoundAndPack:
1836 --zExp;
1837 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1838
1839}
1840
1841/*----------------------------------------------------------------------------
1842| Returns the result of adding the single-precision floating-point values `a'
1843| and `b'. The operation is performed according to the IEC/IEEE Standard for
1844| Binary Floating-Point Arithmetic.
1845*----------------------------------------------------------------------------*/
1846
1847float32 float32_add( float32 a, float32 b STATUS_PARAM )
1848{
1849 flag aSign, bSign;
37d18660
PM
1850 a = float32_squash_input_denormal(a STATUS_VAR);
1851 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1852
1853 aSign = extractFloat32Sign( a );
1854 bSign = extractFloat32Sign( b );
1855 if ( aSign == bSign ) {
1856 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1857 }
1858 else {
1859 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1860 }
1861
1862}
1863
1864/*----------------------------------------------------------------------------
1865| Returns the result of subtracting the single-precision floating-point values
1866| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1867| for Binary Floating-Point Arithmetic.
1868*----------------------------------------------------------------------------*/
1869
1870float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1871{
1872 flag aSign, bSign;
37d18660
PM
1873 a = float32_squash_input_denormal(a STATUS_VAR);
1874 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1875
1876 aSign = extractFloat32Sign( a );
1877 bSign = extractFloat32Sign( b );
1878 if ( aSign == bSign ) {
1879 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1880 }
1881 else {
1882 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1883 }
1884
1885}
1886
1887/*----------------------------------------------------------------------------
1888| Returns the result of multiplying the single-precision floating-point values
1889| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1890| for Binary Floating-Point Arithmetic.
1891*----------------------------------------------------------------------------*/
1892
1893float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1894{
1895 flag aSign, bSign, zSign;
1896 int16 aExp, bExp, zExp;
bb98fe42
AF
1897 uint32_t aSig, bSig;
1898 uint64_t zSig64;
1899 uint32_t zSig;
158142c2 1900
37d18660
PM
1901 a = float32_squash_input_denormal(a STATUS_VAR);
1902 b = float32_squash_input_denormal(b STATUS_VAR);
1903
158142c2
FB
1904 aSig = extractFloat32Frac( a );
1905 aExp = extractFloat32Exp( a );
1906 aSign = extractFloat32Sign( a );
1907 bSig = extractFloat32Frac( b );
1908 bExp = extractFloat32Exp( b );
1909 bSign = extractFloat32Sign( b );
1910 zSign = aSign ^ bSign;
1911 if ( aExp == 0xFF ) {
1912 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1913 return propagateFloat32NaN( a, b STATUS_VAR );
1914 }
1915 if ( ( bExp | bSig ) == 0 ) {
1916 float_raise( float_flag_invalid STATUS_VAR);
1917 return float32_default_nan;
1918 }
1919 return packFloat32( zSign, 0xFF, 0 );
1920 }
1921 if ( bExp == 0xFF ) {
1922 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1923 if ( ( aExp | aSig ) == 0 ) {
1924 float_raise( float_flag_invalid STATUS_VAR);
1925 return float32_default_nan;
1926 }
1927 return packFloat32( zSign, 0xFF, 0 );
1928 }
1929 if ( aExp == 0 ) {
1930 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1931 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1932 }
1933 if ( bExp == 0 ) {
1934 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1935 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1936 }
1937 zExp = aExp + bExp - 0x7F;
1938 aSig = ( aSig | 0x00800000 )<<7;
1939 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 1940 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 1941 zSig = zSig64;
bb98fe42 1942 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
1943 zSig <<= 1;
1944 --zExp;
1945 }
1946 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1947
1948}
1949
1950/*----------------------------------------------------------------------------
1951| Returns the result of dividing the single-precision floating-point value `a'
1952| by the corresponding value `b'. The operation is performed according to the
1953| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1954*----------------------------------------------------------------------------*/
1955
1956float32 float32_div( float32 a, float32 b STATUS_PARAM )
1957{
1958 flag aSign, bSign, zSign;
1959 int16 aExp, bExp, zExp;
bb98fe42 1960 uint32_t aSig, bSig, zSig;
37d18660
PM
1961 a = float32_squash_input_denormal(a STATUS_VAR);
1962 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1963
1964 aSig = extractFloat32Frac( a );
1965 aExp = extractFloat32Exp( a );
1966 aSign = extractFloat32Sign( a );
1967 bSig = extractFloat32Frac( b );
1968 bExp = extractFloat32Exp( b );
1969 bSign = extractFloat32Sign( b );
1970 zSign = aSign ^ bSign;
1971 if ( aExp == 0xFF ) {
1972 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1973 if ( bExp == 0xFF ) {
1974 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1975 float_raise( float_flag_invalid STATUS_VAR);
1976 return float32_default_nan;
1977 }
1978 return packFloat32( zSign, 0xFF, 0 );
1979 }
1980 if ( bExp == 0xFF ) {
1981 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1982 return packFloat32( zSign, 0, 0 );
1983 }
1984 if ( bExp == 0 ) {
1985 if ( bSig == 0 ) {
1986 if ( ( aExp | aSig ) == 0 ) {
1987 float_raise( float_flag_invalid STATUS_VAR);
1988 return float32_default_nan;
1989 }
1990 float_raise( float_flag_divbyzero STATUS_VAR);
1991 return packFloat32( zSign, 0xFF, 0 );
1992 }
1993 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1994 }
1995 if ( aExp == 0 ) {
1996 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1997 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1998 }
1999 zExp = aExp - bExp + 0x7D;
2000 aSig = ( aSig | 0x00800000 )<<7;
2001 bSig = ( bSig | 0x00800000 )<<8;
2002 if ( bSig <= ( aSig + aSig ) ) {
2003 aSig >>= 1;
2004 ++zExp;
2005 }
bb98fe42 2006 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2007 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2008 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2
FB
2009 }
2010 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2011
2012}
2013
2014/*----------------------------------------------------------------------------
2015| Returns the remainder of the single-precision floating-point value `a'
2016| with respect to the corresponding value `b'. The operation is performed
2017| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2018*----------------------------------------------------------------------------*/
2019
2020float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2021{
ed086f3d 2022 flag aSign, zSign;
158142c2 2023 int16 aExp, bExp, expDiff;
bb98fe42
AF
2024 uint32_t aSig, bSig;
2025 uint32_t q;
2026 uint64_t aSig64, bSig64, q64;
2027 uint32_t alternateASig;
2028 int32_t sigMean;
37d18660
PM
2029 a = float32_squash_input_denormal(a STATUS_VAR);
2030 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2031
2032 aSig = extractFloat32Frac( a );
2033 aExp = extractFloat32Exp( a );
2034 aSign = extractFloat32Sign( a );
2035 bSig = extractFloat32Frac( b );
2036 bExp = extractFloat32Exp( b );
158142c2
FB
2037 if ( aExp == 0xFF ) {
2038 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2039 return propagateFloat32NaN( a, b STATUS_VAR );
2040 }
2041 float_raise( float_flag_invalid STATUS_VAR);
2042 return float32_default_nan;
2043 }
2044 if ( bExp == 0xFF ) {
2045 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2046 return a;
2047 }
2048 if ( bExp == 0 ) {
2049 if ( bSig == 0 ) {
2050 float_raise( float_flag_invalid STATUS_VAR);
2051 return float32_default_nan;
2052 }
2053 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2054 }
2055 if ( aExp == 0 ) {
2056 if ( aSig == 0 ) return a;
2057 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2058 }
2059 expDiff = aExp - bExp;
2060 aSig |= 0x00800000;
2061 bSig |= 0x00800000;
2062 if ( expDiff < 32 ) {
2063 aSig <<= 8;
2064 bSig <<= 8;
2065 if ( expDiff < 0 ) {
2066 if ( expDiff < -1 ) return a;
2067 aSig >>= 1;
2068 }
2069 q = ( bSig <= aSig );
2070 if ( q ) aSig -= bSig;
2071 if ( 0 < expDiff ) {
bb98fe42 2072 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2073 q >>= 32 - expDiff;
2074 bSig >>= 2;
2075 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2076 }
2077 else {
2078 aSig >>= 2;
2079 bSig >>= 2;
2080 }
2081 }
2082 else {
2083 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2084 aSig64 = ( (uint64_t) aSig )<<40;
2085 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2086 expDiff -= 64;
2087 while ( 0 < expDiff ) {
2088 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2089 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2090 aSig64 = - ( ( bSig * q64 )<<38 );
2091 expDiff -= 62;
2092 }
2093 expDiff += 64;
2094 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2095 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2096 q = q64>>( 64 - expDiff );
2097 bSig <<= 6;
2098 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2099 }
2100 do {
2101 alternateASig = aSig;
2102 ++q;
2103 aSig -= bSig;
bb98fe42 2104 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2105 sigMean = aSig + alternateASig;
2106 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2107 aSig = alternateASig;
2108 }
bb98fe42 2109 zSign = ( (int32_t) aSig < 0 );
158142c2
FB
2110 if ( zSign ) aSig = - aSig;
2111 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2112
2113}
2114
2115/*----------------------------------------------------------------------------
2116| Returns the square root of the single-precision floating-point value `a'.
2117| The operation is performed according to the IEC/IEEE Standard for Binary
2118| Floating-Point Arithmetic.
2119*----------------------------------------------------------------------------*/
2120
2121float32 float32_sqrt( float32 a STATUS_PARAM )
2122{
2123 flag aSign;
2124 int16 aExp, zExp;
bb98fe42
AF
2125 uint32_t aSig, zSig;
2126 uint64_t rem, term;
37d18660 2127 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2128
2129 aSig = extractFloat32Frac( a );
2130 aExp = extractFloat32Exp( a );
2131 aSign = extractFloat32Sign( a );
2132 if ( aExp == 0xFF ) {
f090c9d4 2133 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
158142c2
FB
2134 if ( ! aSign ) return a;
2135 float_raise( float_flag_invalid STATUS_VAR);
2136 return float32_default_nan;
2137 }
2138 if ( aSign ) {
2139 if ( ( aExp | aSig ) == 0 ) return a;
2140 float_raise( float_flag_invalid STATUS_VAR);
2141 return float32_default_nan;
2142 }
2143 if ( aExp == 0 ) {
f090c9d4 2144 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2145 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2146 }
2147 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2148 aSig = ( aSig | 0x00800000 )<<8;
2149 zSig = estimateSqrt32( aExp, aSig ) + 2;
2150 if ( ( zSig & 0x7F ) <= 5 ) {
2151 if ( zSig < 2 ) {
2152 zSig = 0x7FFFFFFF;
2153 goto roundAndPack;
2154 }
2155 aSig >>= aExp & 1;
bb98fe42
AF
2156 term = ( (uint64_t) zSig ) * zSig;
2157 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2158 while ( (int64_t) rem < 0 ) {
158142c2 2159 --zSig;
bb98fe42 2160 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2161 }
2162 zSig |= ( rem != 0 );
2163 }
2164 shift32RightJamming( zSig, 1, &zSig );
2165 roundAndPack:
2166 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2167
2168}
2169
8229c991
AJ
2170/*----------------------------------------------------------------------------
2171| Returns the binary exponential of the single-precision floating-point value
2172| `a'. The operation is performed according to the IEC/IEEE Standard for
2173| Binary Floating-Point Arithmetic.
2174|
2175| Uses the following identities:
2176|
2177| 1. -------------------------------------------------------------------------
2178| x x*ln(2)
2179| 2 = e
2180|
2181| 2. -------------------------------------------------------------------------
2182| 2 3 4 5 n
2183| x x x x x x x
2184| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2185| 1! 2! 3! 4! 5! n!
2186*----------------------------------------------------------------------------*/
2187
2188static const float64 float32_exp2_coefficients[15] =
2189{
d5138cf4
PM
2190 const_float64( 0x3ff0000000000000ll ), /* 1 */
2191 const_float64( 0x3fe0000000000000ll ), /* 2 */
2192 const_float64( 0x3fc5555555555555ll ), /* 3 */
2193 const_float64( 0x3fa5555555555555ll ), /* 4 */
2194 const_float64( 0x3f81111111111111ll ), /* 5 */
2195 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2196 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2197 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2198 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2199 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2200 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2201 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2202 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2203 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2204 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2205};
2206
2207float32 float32_exp2( float32 a STATUS_PARAM )
2208{
2209 flag aSign;
2210 int16 aExp;
bb98fe42 2211 uint32_t aSig;
8229c991
AJ
2212 float64 r, x, xn;
2213 int i;
37d18660 2214 a = float32_squash_input_denormal(a STATUS_VAR);
8229c991
AJ
2215
2216 aSig = extractFloat32Frac( a );
2217 aExp = extractFloat32Exp( a );
2218 aSign = extractFloat32Sign( a );
2219
2220 if ( aExp == 0xFF) {
2221 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2222 return (aSign) ? float32_zero : a;
2223 }
2224 if (aExp == 0) {
2225 if (aSig == 0) return float32_one;
2226 }
2227
2228 float_raise( float_flag_inexact STATUS_VAR);
2229
2230 /* ******************************* */
2231 /* using float64 for approximation */
2232 /* ******************************* */
2233 x = float32_to_float64(a STATUS_VAR);
2234 x = float64_mul(x, float64_ln2 STATUS_VAR);
2235
2236 xn = x;
2237 r = float64_one;
2238 for (i = 0 ; i < 15 ; i++) {
2239 float64 f;
2240
2241 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2242 r = float64_add(r, f STATUS_VAR);
2243
2244 xn = float64_mul(xn, x STATUS_VAR);
2245 }
2246
2247 return float64_to_float32(r, status);
2248}
2249
374dfc33
AJ
2250/*----------------------------------------------------------------------------
2251| Returns the binary log of the single-precision floating-point value `a'.
2252| The operation is performed according to the IEC/IEEE Standard for Binary
2253| Floating-Point Arithmetic.
2254*----------------------------------------------------------------------------*/
2255float32 float32_log2( float32 a STATUS_PARAM )
2256{
2257 flag aSign, zSign;
2258 int16 aExp;
bb98fe42 2259 uint32_t aSig, zSig, i;
374dfc33 2260
37d18660 2261 a = float32_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
2262 aSig = extractFloat32Frac( a );
2263 aExp = extractFloat32Exp( a );
2264 aSign = extractFloat32Sign( a );
2265
2266 if ( aExp == 0 ) {
2267 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2268 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2269 }
2270 if ( aSign ) {
2271 float_raise( float_flag_invalid STATUS_VAR);
2272 return float32_default_nan;
2273 }
2274 if ( aExp == 0xFF ) {
2275 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2276 return a;
2277 }
2278
2279 aExp -= 0x7F;
2280 aSig |= 0x00800000;
2281 zSign = aExp < 0;
2282 zSig = aExp << 23;
2283
2284 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2285 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2286 if ( aSig & 0x01000000 ) {
2287 aSig >>= 1;
2288 zSig |= i;
2289 }
2290 }
2291
2292 if ( zSign )
2293 zSig = -zSig;
2294
2295 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2296}
2297
158142c2
FB
2298/*----------------------------------------------------------------------------
2299| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2300| the corresponding value `b', and 0 otherwise. The invalid exception is
2301| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2302| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2303*----------------------------------------------------------------------------*/
2304
b689362d 2305int float32_eq( float32 a, float32 b STATUS_PARAM )
158142c2 2306{
b689362d 2307 uint32_t av, bv;
37d18660
PM
2308 a = float32_squash_input_denormal(a STATUS_VAR);
2309 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2310
2311 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2312 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2313 ) {
b689362d 2314 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
2315 return 0;
2316 }
b689362d
AJ
2317 av = float32_val(a);
2318 bv = float32_val(b);
2319 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2320}
2321
2322/*----------------------------------------------------------------------------
2323| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2324| or equal to the corresponding value `b', and 0 otherwise. The invalid
2325| exception is raised if either operand is a NaN. The comparison is performed
2326| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2327*----------------------------------------------------------------------------*/
2328
750afe93 2329int float32_le( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2330{
2331 flag aSign, bSign;
bb98fe42 2332 uint32_t av, bv;
37d18660
PM
2333 a = float32_squash_input_denormal(a STATUS_VAR);
2334 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2335
2336 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2337 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2338 ) {
2339 float_raise( float_flag_invalid STATUS_VAR);
2340 return 0;
2341 }
2342 aSign = extractFloat32Sign( a );
2343 bSign = extractFloat32Sign( b );
f090c9d4
PB
2344 av = float32_val(a);
2345 bv = float32_val(b);
bb98fe42 2346 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2347 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2348
2349}
2350
2351/*----------------------------------------------------------------------------
2352| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2353| the corresponding value `b', and 0 otherwise. The invalid exception is
2354| raised if either operand is a NaN. The comparison is performed according
2355| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2356*----------------------------------------------------------------------------*/
2357
750afe93 2358int float32_lt( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2359{
2360 flag aSign, bSign;
bb98fe42 2361 uint32_t av, bv;
37d18660
PM
2362 a = float32_squash_input_denormal(a STATUS_VAR);
2363 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2364
2365 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2366 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2367 ) {
2368 float_raise( float_flag_invalid STATUS_VAR);
2369 return 0;
2370 }
2371 aSign = extractFloat32Sign( a );
2372 bSign = extractFloat32Sign( b );
f090c9d4
PB
2373 av = float32_val(a);
2374 bv = float32_val(b);
bb98fe42 2375 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2376 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2377
2378}
2379
67b7861d
AJ
2380/*----------------------------------------------------------------------------
2381| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2382| be compared, and 0 otherwise. The invalid exception is raised if either
2383| operand is a NaN. The comparison is performed according to the IEC/IEEE
2384| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2385*----------------------------------------------------------------------------*/
2386
2387int float32_unordered( float32 a, float32 b STATUS_PARAM )
2388{
2389 a = float32_squash_input_denormal(a STATUS_VAR);
2390 b = float32_squash_input_denormal(b STATUS_VAR);
2391
2392 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2393 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2394 ) {
2395 float_raise( float_flag_invalid STATUS_VAR);
2396 return 1;
2397 }
2398 return 0;
2399}
b689362d 2400
158142c2
FB
2401/*----------------------------------------------------------------------------
2402| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2403| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2404| exception. The comparison is performed according to the IEC/IEEE Standard
2405| for Binary Floating-Point Arithmetic.
158142c2
FB
2406*----------------------------------------------------------------------------*/
2407
b689362d 2408int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
158142c2 2409{
37d18660
PM
2410 a = float32_squash_input_denormal(a STATUS_VAR);
2411 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2412
2413 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2414 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2415 ) {
b689362d
AJ
2416 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2417 float_raise( float_flag_invalid STATUS_VAR);
2418 }
158142c2
FB
2419 return 0;
2420 }
b689362d
AJ
2421 return ( float32_val(a) == float32_val(b) ) ||
2422 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2423}
2424
2425/*----------------------------------------------------------------------------
2426| Returns 1 if the single-precision floating-point value `a' is less than or
2427| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2428| cause an exception. Otherwise, the comparison is performed according to the
2429| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2430*----------------------------------------------------------------------------*/
2431
750afe93 2432int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2433{
2434 flag aSign, bSign;
bb98fe42 2435 uint32_t av, bv;
37d18660
PM
2436 a = float32_squash_input_denormal(a STATUS_VAR);
2437 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2438
2439 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2440 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2441 ) {
2442 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2443 float_raise( float_flag_invalid STATUS_VAR);
2444 }
2445 return 0;
2446 }
2447 aSign = extractFloat32Sign( a );
2448 bSign = extractFloat32Sign( b );
f090c9d4
PB
2449 av = float32_val(a);
2450 bv = float32_val(b);
bb98fe42 2451 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2452 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2453
2454}
2455
2456/*----------------------------------------------------------------------------
2457| Returns 1 if the single-precision floating-point value `a' is less than
2458| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2459| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2460| Standard for Binary Floating-Point Arithmetic.
2461*----------------------------------------------------------------------------*/
2462
750afe93 2463int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2464{
2465 flag aSign, bSign;
bb98fe42 2466 uint32_t av, bv;
37d18660
PM
2467 a = float32_squash_input_denormal(a STATUS_VAR);
2468 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2469
2470 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2471 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2472 ) {
2473 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2474 float_raise( float_flag_invalid STATUS_VAR);
2475 }
2476 return 0;
2477 }
2478 aSign = extractFloat32Sign( a );
2479 bSign = extractFloat32Sign( b );
f090c9d4
PB
2480 av = float32_val(a);
2481 bv = float32_val(b);
bb98fe42 2482 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2483 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2484
2485}
2486
67b7861d
AJ
2487/*----------------------------------------------------------------------------
2488| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2489| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2490| comparison is performed according to the IEC/IEEE Standard for Binary
2491| Floating-Point Arithmetic.
2492*----------------------------------------------------------------------------*/
2493
2494int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2495{
2496 a = float32_squash_input_denormal(a STATUS_VAR);
2497 b = float32_squash_input_denormal(b STATUS_VAR);
2498
2499 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2500 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2501 ) {
2502 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2503 float_raise( float_flag_invalid STATUS_VAR);
2504 }
2505 return 1;
2506 }
2507 return 0;
2508}
2509
158142c2
FB
2510/*----------------------------------------------------------------------------
2511| Returns the result of converting the double-precision floating-point value
2512| `a' to the 32-bit two's complement integer format. The conversion is
2513| performed according to the IEC/IEEE Standard for Binary Floating-Point
2514| Arithmetic---which means in particular that the conversion is rounded
2515| according to the current rounding mode. If `a' is a NaN, the largest
2516| positive integer is returned. Otherwise, if the conversion overflows, the
2517| largest integer with the same sign as `a' is returned.
2518*----------------------------------------------------------------------------*/
2519
2520int32 float64_to_int32( float64 a STATUS_PARAM )
2521{
2522 flag aSign;
2523 int16 aExp, shiftCount;
bb98fe42 2524 uint64_t aSig;
37d18660 2525 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2526
2527 aSig = extractFloat64Frac( a );
2528 aExp = extractFloat64Exp( a );
2529 aSign = extractFloat64Sign( a );
2530 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2531 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2532 shiftCount = 0x42C - aExp;
2533 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2534 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2535
2536}
2537
2538/*----------------------------------------------------------------------------
2539| Returns the result of converting the double-precision floating-point value
2540| `a' to the 32-bit two's complement integer format. The conversion is
2541| performed according to the IEC/IEEE Standard for Binary Floating-Point
2542| Arithmetic, except that the conversion is always rounded toward zero.
2543| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2544| the conversion overflows, the largest integer with the same sign as `a' is
2545| returned.
2546*----------------------------------------------------------------------------*/
2547
2548int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2549{
2550 flag aSign;
2551 int16 aExp, shiftCount;
bb98fe42 2552 uint64_t aSig, savedASig;
158142c2 2553 int32 z;
37d18660 2554 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2555
2556 aSig = extractFloat64Frac( a );
2557 aExp = extractFloat64Exp( a );
2558 aSign = extractFloat64Sign( a );
2559 if ( 0x41E < aExp ) {
2560 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2561 goto invalid;
2562 }
2563 else if ( aExp < 0x3FF ) {
2564 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2565 return 0;
2566 }
2567 aSig |= LIT64( 0x0010000000000000 );
2568 shiftCount = 0x433 - aExp;
2569 savedASig = aSig;
2570 aSig >>= shiftCount;
2571 z = aSig;
2572 if ( aSign ) z = - z;
2573 if ( ( z < 0 ) ^ aSign ) {
2574 invalid:
2575 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2576 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
2577 }
2578 if ( ( aSig<<shiftCount ) != savedASig ) {
2579 STATUS(float_exception_flags) |= float_flag_inexact;
2580 }
2581 return z;
2582
2583}
2584
cbcef455
PM
2585/*----------------------------------------------------------------------------
2586| Returns the result of converting the double-precision floating-point value
2587| `a' to the 16-bit two's complement integer format. The conversion is
2588| performed according to the IEC/IEEE Standard for Binary Floating-Point
2589| Arithmetic, except that the conversion is always rounded toward zero.
2590| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2591| the conversion overflows, the largest integer with the same sign as `a' is
2592| returned.
2593*----------------------------------------------------------------------------*/
2594
2595int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2596{
2597 flag aSign;
2598 int16 aExp, shiftCount;
bb98fe42 2599 uint64_t aSig, savedASig;
cbcef455
PM
2600 int32 z;
2601
2602 aSig = extractFloat64Frac( a );
2603 aExp = extractFloat64Exp( a );
2604 aSign = extractFloat64Sign( a );
2605 if ( 0x40E < aExp ) {
2606 if ( ( aExp == 0x7FF ) && aSig ) {
2607 aSign = 0;
2608 }
2609 goto invalid;
2610 }
2611 else if ( aExp < 0x3FF ) {
2612 if ( aExp || aSig ) {
2613 STATUS(float_exception_flags) |= float_flag_inexact;
2614 }
2615 return 0;
2616 }
2617 aSig |= LIT64( 0x0010000000000000 );
2618 shiftCount = 0x433 - aExp;
2619 savedASig = aSig;
2620 aSig >>= shiftCount;
2621 z = aSig;
2622 if ( aSign ) {
2623 z = - z;
2624 }
2625 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2626 invalid:
2627 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2628 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
2629 }
2630 if ( ( aSig<<shiftCount ) != savedASig ) {
2631 STATUS(float_exception_flags) |= float_flag_inexact;
2632 }
2633 return z;
2634}
2635
158142c2
FB
2636/*----------------------------------------------------------------------------
2637| Returns the result of converting the double-precision floating-point value
2638| `a' to the 64-bit two's complement integer format. The conversion is
2639| performed according to the IEC/IEEE Standard for Binary Floating-Point
2640| Arithmetic---which means in particular that the conversion is rounded
2641| according to the current rounding mode. If `a' is a NaN, the largest
2642| positive integer is returned. Otherwise, if the conversion overflows, the
2643| largest integer with the same sign as `a' is returned.
2644*----------------------------------------------------------------------------*/
2645
2646int64 float64_to_int64( float64 a STATUS_PARAM )
2647{
2648 flag aSign;
2649 int16 aExp, shiftCount;
bb98fe42 2650 uint64_t aSig, aSigExtra;
37d18660 2651 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2652
2653 aSig = extractFloat64Frac( a );
2654 aExp = extractFloat64Exp( a );
2655 aSign = extractFloat64Sign( a );
2656 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2657 shiftCount = 0x433 - aExp;
2658 if ( shiftCount <= 0 ) {
2659 if ( 0x43E < aExp ) {
2660 float_raise( float_flag_invalid STATUS_VAR);
2661 if ( ! aSign
2662 || ( ( aExp == 0x7FF )
2663 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2664 ) {
2665 return LIT64( 0x7FFFFFFFFFFFFFFF );
2666 }
bb98fe42 2667 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2668 }
2669 aSigExtra = 0;
2670 aSig <<= - shiftCount;
2671 }
2672 else {
2673 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2674 }
2675 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2676
2677}
2678
2679/*----------------------------------------------------------------------------
2680| Returns the result of converting the double-precision floating-point value
2681| `a' to the 64-bit two's complement integer format. The conversion is
2682| performed according to the IEC/IEEE Standard for Binary Floating-Point
2683| Arithmetic, except that the conversion is always rounded toward zero.
2684| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2685| the conversion overflows, the largest integer with the same sign as `a' is
2686| returned.
2687*----------------------------------------------------------------------------*/
2688
2689int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2690{
2691 flag aSign;
2692 int16 aExp, shiftCount;
bb98fe42 2693 uint64_t aSig;
158142c2 2694 int64 z;
37d18660 2695 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2696
2697 aSig = extractFloat64Frac( a );
2698 aExp = extractFloat64Exp( a );
2699 aSign = extractFloat64Sign( a );
2700 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2701 shiftCount = aExp - 0x433;
2702 if ( 0 <= shiftCount ) {
2703 if ( 0x43E <= aExp ) {
f090c9d4 2704 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
158142c2
FB
2705 float_raise( float_flag_invalid STATUS_VAR);
2706 if ( ! aSign
2707 || ( ( aExp == 0x7FF )
2708 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2709 ) {
2710 return LIT64( 0x7FFFFFFFFFFFFFFF );
2711 }
2712 }
bb98fe42 2713 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2714 }
2715 z = aSig<<shiftCount;
2716 }
2717 else {
2718 if ( aExp < 0x3FE ) {
2719 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2720 return 0;
2721 }
2722 z = aSig>>( - shiftCount );
bb98fe42 2723 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
2724 STATUS(float_exception_flags) |= float_flag_inexact;
2725 }
2726 }
2727 if ( aSign ) z = - z;
2728 return z;
2729
2730}
2731
2732/*----------------------------------------------------------------------------
2733| Returns the result of converting the double-precision floating-point value
2734| `a' to the single-precision floating-point format. The conversion is
2735| performed according to the IEC/IEEE Standard for Binary Floating-Point
2736| Arithmetic.
2737*----------------------------------------------------------------------------*/
2738
2739float32 float64_to_float32( float64 a STATUS_PARAM )
2740{
2741 flag aSign;
2742 int16 aExp;
bb98fe42
AF
2743 uint64_t aSig;
2744 uint32_t zSig;
37d18660 2745 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2746
2747 aSig = extractFloat64Frac( a );
2748 aExp = extractFloat64Exp( a );
2749 aSign = extractFloat64Sign( a );
2750 if ( aExp == 0x7FF ) {
bcd4d9af 2751 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
2752 return packFloat32( aSign, 0xFF, 0 );
2753 }
2754 shift64RightJamming( aSig, 22, &aSig );
2755 zSig = aSig;
2756 if ( aExp || zSig ) {
2757 zSig |= 0x40000000;
2758 aExp -= 0x381;
2759 }
2760 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2761
2762}
2763
60011498
PB
2764
2765/*----------------------------------------------------------------------------
2766| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2767| half-precision floating-point value, returning the result. After being
2768| shifted into the proper positions, the three fields are simply added
2769| together to form the result. This means that any integer portion of `zSig'
2770| will be added into the exponent. Since a properly normalized significand
2771| will have an integer portion equal to 1, the `zExp' input should be 1 less
2772| than the desired result exponent whenever `zSig' is a complete, normalized
2773| significand.
2774*----------------------------------------------------------------------------*/
bb98fe42 2775static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
60011498 2776{
bb4d4bb3 2777 return make_float16(
bb98fe42 2778 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
2779}
2780
2781/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2782 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3
PM
2783
2784float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
60011498
PB
2785{
2786 flag aSign;
2787 int16 aExp;
bb98fe42 2788 uint32_t aSig;
60011498 2789
bb4d4bb3
PM
2790 aSign = extractFloat16Sign(a);
2791 aExp = extractFloat16Exp(a);
2792 aSig = extractFloat16Frac(a);
60011498
PB
2793
2794 if (aExp == 0x1f && ieee) {
2795 if (aSig) {
f591e1be 2796 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498
PB
2797 }
2798 return packFloat32(aSign, 0xff, aSig << 13);
2799 }
2800 if (aExp == 0) {
2801 int8 shiftCount;
2802
2803 if (aSig == 0) {
2804 return packFloat32(aSign, 0, 0);
2805 }
2806
2807 shiftCount = countLeadingZeros32( aSig ) - 21;
2808 aSig = aSig << shiftCount;
2809 aExp = -shiftCount;
2810 }
2811 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2812}
2813
bb4d4bb3 2814float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
60011498
PB
2815{
2816 flag aSign;
2817 int16 aExp;
bb98fe42
AF
2818 uint32_t aSig;
2819 uint32_t mask;
2820 uint32_t increment;
60011498 2821 int8 roundingMode;
37d18660 2822 a = float32_squash_input_denormal(a STATUS_VAR);
60011498
PB
2823
2824 aSig = extractFloat32Frac( a );
2825 aExp = extractFloat32Exp( a );
2826 aSign = extractFloat32Sign( a );
2827 if ( aExp == 0xFF ) {
2828 if (aSig) {
600e30d2
PM
2829 /* Input is a NaN */
2830 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2831 if (!ieee) {
2832 return packFloat16(aSign, 0, 0);
2833 }
2834 return r;
60011498 2835 }
600e30d2
PM
2836 /* Infinity */
2837 if (!ieee) {
2838 float_raise(float_flag_invalid STATUS_VAR);
2839 return packFloat16(aSign, 0x1f, 0x3ff);
2840 }
2841 return packFloat16(aSign, 0x1f, 0);
60011498 2842 }
600e30d2 2843 if (aExp == 0 && aSig == 0) {
60011498
PB
2844 return packFloat16(aSign, 0, 0);
2845 }
2846 /* Decimal point between bits 22 and 23. */
2847 aSig |= 0x00800000;
2848 aExp -= 0x7f;
2849 if (aExp < -14) {
600e30d2
PM
2850 mask = 0x00ffffff;
2851 if (aExp >= -24) {
2852 mask >>= 25 + aExp;
60011498
PB
2853 }
2854 } else {
2855 mask = 0x00001fff;
2856 }
2857 if (aSig & mask) {
2858 float_raise( float_flag_underflow STATUS_VAR );
2859 roundingMode = STATUS(float_rounding_mode);
2860 switch (roundingMode) {
2861 case float_round_nearest_even:
2862 increment = (mask + 1) >> 1;
2863 if ((aSig & mask) == increment) {
2864 increment = aSig & (increment << 1);
2865 }
2866 break;
2867 case float_round_up:
2868 increment = aSign ? 0 : mask;
2869 break;
2870 case float_round_down:
2871 increment = aSign ? mask : 0;
2872 break;
2873 default: /* round_to_zero */
2874 increment = 0;
2875 break;
2876 }
2877 aSig += increment;
2878 if (aSig >= 0x01000000) {
2879 aSig >>= 1;
2880 aExp++;
2881 }
2882 } else if (aExp < -14
2883 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2884 float_raise( float_flag_underflow STATUS_VAR);
2885 }
2886
2887 if (ieee) {
2888 if (aExp > 15) {
2889 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2890 return packFloat16(aSign, 0x1f, 0);
2891 }
2892 } else {
2893 if (aExp > 16) {
600e30d2 2894 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
60011498
PB
2895 return packFloat16(aSign, 0x1f, 0x3ff);
2896 }
2897 }
2898 if (aExp < -24) {
2899 return packFloat16(aSign, 0, 0);
2900 }
2901 if (aExp < -14) {
2902 aSig >>= -14 - aExp;
2903 aExp = -14;
2904 }
2905 return packFloat16(aSign, aExp + 14, aSig >> 13);
2906}
2907
158142c2
FB
2908/*----------------------------------------------------------------------------
2909| Returns the result of converting the double-precision floating-point value
2910| `a' to the extended double-precision floating-point format. The conversion
2911| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2912| Arithmetic.
2913*----------------------------------------------------------------------------*/
2914
2915floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2916{
2917 flag aSign;
2918 int16 aExp;
bb98fe42 2919 uint64_t aSig;
158142c2 2920
37d18660 2921 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2922 aSig = extractFloat64Frac( a );
2923 aExp = extractFloat64Exp( a );
2924 aSign = extractFloat64Sign( a );
2925 if ( aExp == 0x7FF ) {
bcd4d9af 2926 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
2927 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2928 }
2929 if ( aExp == 0 ) {
2930 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2931 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2932 }
2933 return
2934 packFloatx80(
2935 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2936
2937}
2938
158142c2
FB
2939/*----------------------------------------------------------------------------
2940| Returns the result of converting the double-precision floating-point value
2941| `a' to the quadruple-precision floating-point format. The conversion is
2942| performed according to the IEC/IEEE Standard for Binary Floating-Point
2943| Arithmetic.
2944*----------------------------------------------------------------------------*/
2945
2946float128 float64_to_float128( float64 a STATUS_PARAM )
2947{
2948 flag aSign;
2949 int16 aExp;
bb98fe42 2950 uint64_t aSig, zSig0, zSig1;
158142c2 2951
37d18660 2952 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2953 aSig = extractFloat64Frac( a );
2954 aExp = extractFloat64Exp( a );
2955 aSign = extractFloat64Sign( a );
2956 if ( aExp == 0x7FF ) {
bcd4d9af 2957 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
2958 return packFloat128( aSign, 0x7FFF, 0, 0 );
2959 }
2960 if ( aExp == 0 ) {
2961 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2962 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2963 --aExp;
2964 }
2965 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2966 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2967
2968}
2969
158142c2
FB
2970/*----------------------------------------------------------------------------
2971| Rounds the double-precision floating-point value `a' to an integer, and
2972| returns the result as a double-precision floating-point value. The
2973| operation is performed according to the IEC/IEEE Standard for Binary
2974| Floating-Point Arithmetic.
2975*----------------------------------------------------------------------------*/
2976
2977float64 float64_round_to_int( float64 a STATUS_PARAM )
2978{
2979 flag aSign;
2980 int16 aExp;
bb98fe42 2981 uint64_t lastBitMask, roundBitsMask;
158142c2 2982 int8 roundingMode;
bb98fe42 2983 uint64_t z;
37d18660 2984 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2985
2986 aExp = extractFloat64Exp( a );
2987 if ( 0x433 <= aExp ) {
2988 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2989 return propagateFloat64NaN( a, a STATUS_VAR );
2990 }
2991 return a;
2992 }
2993 if ( aExp < 0x3FF ) {
bb98fe42 2994 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
158142c2
FB
2995 STATUS(float_exception_flags) |= float_flag_inexact;
2996 aSign = extractFloat64Sign( a );
2997 switch ( STATUS(float_rounding_mode) ) {
2998 case float_round_nearest_even:
2999 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3000 return packFloat64( aSign, 0x3FF, 0 );
3001 }
3002 break;
3003 case float_round_down:
f090c9d4 3004 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3005 case float_round_up:
f090c9d4
PB
3006 return make_float64(
3007 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3008 }
3009 return packFloat64( aSign, 0, 0 );
3010 }
3011 lastBitMask = 1;
3012 lastBitMask <<= 0x433 - aExp;
3013 roundBitsMask = lastBitMask - 1;
f090c9d4 3014 z = float64_val(a);
158142c2
FB
3015 roundingMode = STATUS(float_rounding_mode);
3016 if ( roundingMode == float_round_nearest_even ) {
3017 z += lastBitMask>>1;
3018 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3019 }
3020 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 3021 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
3022 z += roundBitsMask;
3023 }
3024 }
3025 z &= ~ roundBitsMask;
f090c9d4
PB
3026 if ( z != float64_val(a) )
3027 STATUS(float_exception_flags) |= float_flag_inexact;
3028 return make_float64(z);
158142c2
FB
3029
3030}
3031
e6e5906b
PB
3032float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3033{
3034 int oldmode;
3035 float64 res;
3036 oldmode = STATUS(float_rounding_mode);
3037 STATUS(float_rounding_mode) = float_round_to_zero;
3038 res = float64_round_to_int(a STATUS_VAR);
3039 STATUS(float_rounding_mode) = oldmode;
3040 return res;
3041}
3042
158142c2
FB
3043/*----------------------------------------------------------------------------
3044| Returns the result of adding the absolute values of the double-precision
3045| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3046| before being returned. `zSign' is ignored if the result is a NaN.
3047| The addition is performed according to the IEC/IEEE Standard for Binary
3048| Floating-Point Arithmetic.
3049*----------------------------------------------------------------------------*/
3050
3051static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3052{
3053 int16 aExp, bExp, zExp;
bb98fe42 3054 uint64_t aSig, bSig, zSig;
158142c2
FB
3055 int16 expDiff;
3056
3057 aSig = extractFloat64Frac( a );
3058 aExp = extractFloat64Exp( a );
3059 bSig = extractFloat64Frac( b );
3060 bExp = extractFloat64Exp( b );
3061 expDiff = aExp - bExp;
3062 aSig <<= 9;
3063 bSig <<= 9;
3064 if ( 0 < expDiff ) {
3065 if ( aExp == 0x7FF ) {
3066 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3067 return a;
3068 }
3069 if ( bExp == 0 ) {
3070 --expDiff;
3071 }
3072 else {
3073 bSig |= LIT64( 0x2000000000000000 );
3074 }
3075 shift64RightJamming( bSig, expDiff, &bSig );
3076 zExp = aExp;
3077 }
3078 else if ( expDiff < 0 ) {
3079 if ( bExp == 0x7FF ) {
3080 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3081 return packFloat64( zSign, 0x7FF, 0 );
3082 }
3083 if ( aExp == 0 ) {
3084 ++expDiff;
3085 }
3086 else {
3087 aSig |= LIT64( 0x2000000000000000 );
3088 }
3089 shift64RightJamming( aSig, - expDiff, &aSig );
3090 zExp = bExp;
3091 }
3092 else {
3093 if ( aExp == 0x7FF ) {
3094 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3095 return a;
3096 }
fe76d976 3097 if ( aExp == 0 ) {
e6afc87f
PM
3098 if (STATUS(flush_to_zero)) {
3099 if (aSig | bSig) {
3100 float_raise(float_flag_output_denormal STATUS_VAR);
3101 }
3102 return packFloat64(zSign, 0, 0);
3103 }
fe76d976
PB
3104 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3105 }
158142c2
FB
3106 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3107 zExp = aExp;
3108 goto roundAndPack;
3109 }
3110 aSig |= LIT64( 0x2000000000000000 );
3111 zSig = ( aSig + bSig )<<1;
3112 --zExp;
bb98fe42 3113 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3114 zSig = aSig + bSig;
3115 ++zExp;
3116 }
3117 roundAndPack:
3118 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3119
3120}
3121
3122/*----------------------------------------------------------------------------
3123| Returns the result of subtracting the absolute values of the double-
3124| precision floating-point values `a' and `b'. If `zSign' is 1, the
3125| difference is negated before being returned. `zSign' is ignored if the
3126| result is a NaN. The subtraction is performed according to the IEC/IEEE
3127| Standard for Binary Floating-Point Arithmetic.
3128*----------------------------------------------------------------------------*/
3129
3130static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3131{
3132 int16 aExp, bExp, zExp;
bb98fe42 3133 uint64_t aSig, bSig, zSig;
158142c2
FB
3134 int16 expDiff;
3135
3136 aSig = extractFloat64Frac( a );
3137 aExp = extractFloat64Exp( a );
3138 bSig = extractFloat64Frac( b );
3139 bExp = extractFloat64Exp( b );
3140 expDiff = aExp - bExp;
3141 aSig <<= 10;
3142 bSig <<= 10;
3143 if ( 0 < expDiff ) goto aExpBigger;
3144 if ( expDiff < 0 ) goto bExpBigger;
3145 if ( aExp == 0x7FF ) {
3146 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3147 float_raise( float_flag_invalid STATUS_VAR);
3148 return float64_default_nan;
3149 }
3150 if ( aExp == 0 ) {
3151 aExp = 1;
3152 bExp = 1;
3153 }
3154 if ( bSig < aSig ) goto aBigger;
3155 if ( aSig < bSig ) goto bBigger;
3156 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3157 bExpBigger:
3158 if ( bExp == 0x7FF ) {
3159 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3160 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3161 }
3162 if ( aExp == 0 ) {
3163 ++expDiff;
3164 }
3165 else {
3166 aSig |= LIT64( 0x4000000000000000 );
3167 }
3168 shift64RightJamming( aSig, - expDiff, &aSig );
3169 bSig |= LIT64( 0x4000000000000000 );
3170 bBigger:
3171 zSig = bSig - aSig;
3172 zExp = bExp;
3173 zSign ^= 1;
3174 goto normalizeRoundAndPack;
3175 aExpBigger:
3176 if ( aExp == 0x7FF ) {
3177 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3178 return a;
3179 }
3180 if ( bExp == 0 ) {
3181 --expDiff;
3182 }
3183 else {
3184 bSig |= LIT64( 0x4000000000000000 );
3185 }
3186 shift64RightJamming( bSig, expDiff, &bSig );
3187 aSig |= LIT64( 0x4000000000000000 );
3188 aBigger:
3189 zSig = aSig - bSig;
3190 zExp = aExp;
3191 normalizeRoundAndPack:
3192 --zExp;
3193 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3194
3195}
3196
3197/*----------------------------------------------------------------------------
3198| Returns the result of adding the double-precision floating-point values `a'
3199| and `b'. The operation is performed according to the IEC/IEEE Standard for
3200| Binary Floating-Point Arithmetic.
3201*----------------------------------------------------------------------------*/
3202
3203float64 float64_add( float64 a, float64 b STATUS_PARAM )
3204{
3205 flag aSign, bSign;
37d18660
PM
3206 a = float64_squash_input_denormal(a STATUS_VAR);
3207 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3208
3209 aSign = extractFloat64Sign( a );
3210 bSign = extractFloat64Sign( b );
3211 if ( aSign == bSign ) {
3212 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3213 }
3214 else {
3215 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3216 }
3217
3218}
3219
3220/*----------------------------------------------------------------------------
3221| Returns the result of subtracting the double-precision floating-point values
3222| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3223| for Binary Floating-Point Arithmetic.
3224*----------------------------------------------------------------------------*/
3225
3226float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3227{
3228 flag aSign, bSign;
37d18660
PM
3229 a = float64_squash_input_denormal(a STATUS_VAR);
3230 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3231
3232 aSign = extractFloat64Sign( a );
3233 bSign = extractFloat64Sign( b );
3234 if ( aSign == bSign ) {
3235 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3236 }
3237 else {
3238 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3239 }
3240
3241}
3242
3243/*----------------------------------------------------------------------------
3244| Returns the result of multiplying the double-precision floating-point values
3245| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3246| for Binary Floating-Point Arithmetic.
3247*----------------------------------------------------------------------------*/
3248
3249float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3250{
3251 flag aSign, bSign, zSign;
3252 int16 aExp, bExp, zExp;
bb98fe42 3253 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3254
37d18660
PM
3255 a = float64_squash_input_denormal(a STATUS_VAR);
3256 b = float64_squash_input_denormal(b STATUS_VAR);
3257
158142c2
FB
3258 aSig = extractFloat64Frac( a );
3259 aExp = extractFloat64Exp( a );
3260 aSign = extractFloat64Sign( a );
3261 bSig = extractFloat64Frac( b );
3262 bExp = extractFloat64Exp( b );
3263 bSign = extractFloat64Sign( b );
3264 zSign = aSign ^ bSign;
3265 if ( aExp == 0x7FF ) {
3266 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3267 return propagateFloat64NaN( a, b STATUS_VAR );
3268 }
3269 if ( ( bExp | bSig ) == 0 ) {
3270 float_raise( float_flag_invalid STATUS_VAR);
3271 return float64_default_nan;
3272 }
3273 return packFloat64( zSign, 0x7FF, 0 );
3274 }
3275 if ( bExp == 0x7FF ) {
3276 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3277 if ( ( aExp | aSig ) == 0 ) {
3278 float_raise( float_flag_invalid STATUS_VAR);
3279 return float64_default_nan;
3280 }
3281 return packFloat64( zSign, 0x7FF, 0 );
3282 }
3283 if ( aExp == 0 ) {
3284 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3285 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3286 }
3287 if ( bExp == 0 ) {
3288 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3289 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3290 }
3291 zExp = aExp + bExp - 0x3FF;
3292 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3293 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3294 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3295 zSig0 |= ( zSig1 != 0 );
bb98fe42 3296 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
3297 zSig0 <<= 1;
3298 --zExp;
3299 }
3300 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3301
3302}
3303
3304/*----------------------------------------------------------------------------
3305| Returns the result of dividing the double-precision floating-point value `a'
3306| by the corresponding value `b'. The operation is performed according to
3307| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3308*----------------------------------------------------------------------------*/
3309
3310float64 float64_div( float64 a, float64 b STATUS_PARAM )
3311{
3312 flag aSign, bSign, zSign;
3313 int16 aExp, bExp, zExp;
bb98fe42
AF
3314 uint64_t aSig, bSig, zSig;
3315 uint64_t rem0, rem1;
3316 uint64_t term0, term1;
37d18660
PM
3317 a = float64_squash_input_denormal(a STATUS_VAR);
3318 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3319
3320 aSig = extractFloat64Frac( a );
3321 aExp = extractFloat64Exp( a );
3322 aSign = extractFloat64Sign( a );
3323 bSig = extractFloat64Frac( b );
3324 bExp = extractFloat64Exp( b );
3325 bSign = extractFloat64Sign( b );
3326 zSign = aSign ^ bSign;
3327 if ( aExp == 0x7FF ) {
3328 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3329 if ( bExp == 0x7FF ) {
3330 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3331 float_raise( float_flag_invalid STATUS_VAR);
3332 return float64_default_nan;
3333 }
3334 return packFloat64( zSign, 0x7FF, 0 );
3335 }
3336 if ( bExp == 0x7FF ) {
3337 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3338 return packFloat64( zSign, 0, 0 );
3339 }
3340 if ( bExp == 0 ) {
3341 if ( bSig == 0 ) {
3342 if ( ( aExp | aSig ) == 0 ) {
3343 float_raise( float_flag_invalid STATUS_VAR);
3344 return float64_default_nan;
3345 }
3346 float_raise( float_flag_divbyzero STATUS_VAR);
3347 return packFloat64( zSign, 0x7FF, 0 );
3348 }
3349 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3350 }
3351 if ( aExp == 0 ) {
3352 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3353 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3354 }
3355 zExp = aExp - bExp + 0x3FD;
3356 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3357 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3358 if ( bSig <= ( aSig + aSig ) ) {
3359 aSig >>= 1;
3360 ++zExp;
3361 }
3362 zSig = estimateDiv128To64( aSig, 0, bSig );
3363 if ( ( zSig & 0x1FF ) <= 2 ) {
3364 mul64To128( bSig, zSig, &term0, &term1 );
3365 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3366 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3367 --zSig;
3368 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3369 }
3370 zSig |= ( rem1 != 0 );
3371 }
3372 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3373
3374}
3375
3376/*----------------------------------------------------------------------------
3377| Returns the remainder of the double-precision floating-point value `a'
3378| with respect to the corresponding value `b'. The operation is performed
3379| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3380*----------------------------------------------------------------------------*/
3381
3382float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3383{
ed086f3d 3384 flag aSign, zSign;
158142c2 3385 int16 aExp, bExp, expDiff;
bb98fe42
AF
3386 uint64_t aSig, bSig;
3387 uint64_t q, alternateASig;
3388 int64_t sigMean;
158142c2 3389
37d18660
PM
3390 a = float64_squash_input_denormal(a STATUS_VAR);
3391 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3392 aSig = extractFloat64Frac( a );
3393 aExp = extractFloat64Exp( a );
3394 aSign = extractFloat64Sign( a );
3395 bSig = extractFloat64Frac( b );
3396 bExp = extractFloat64Exp( b );
158142c2
FB
3397 if ( aExp == 0x7FF ) {
3398 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3399 return propagateFloat64NaN( a, b STATUS_VAR );
3400 }
3401 float_raise( float_flag_invalid STATUS_VAR);
3402 return float64_default_nan;
3403 }
3404 if ( bExp == 0x7FF ) {
3405 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3406 return a;
3407 }
3408 if ( bExp == 0 ) {
3409 if ( bSig == 0 ) {
3410 float_raise( float_flag_invalid STATUS_VAR);
3411 return float64_default_nan;
3412 }
3413 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3414 }
3415 if ( aExp == 0 ) {
3416 if ( aSig == 0 ) return a;
3417 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3418 }
3419 expDiff = aExp - bExp;
3420 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3421 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3422 if ( expDiff < 0 ) {
3423 if ( expDiff < -1 ) return a;
3424 aSig >>= 1;
3425 }
3426 q = ( bSig <= aSig );
3427 if ( q ) aSig -= bSig;
3428 expDiff -= 64;
3429 while ( 0 < expDiff ) {
3430 q = estimateDiv128To64( aSig, 0, bSig );
3431 q = ( 2 < q ) ? q - 2 : 0;
3432 aSig = - ( ( bSig>>2 ) * q );
3433 expDiff -= 62;
3434 }
3435 expDiff += 64;
3436 if ( 0 < expDiff ) {
3437 q = estimateDiv128To64( aSig, 0, bSig );
3438 q = ( 2 < q ) ? q - 2 : 0;
3439 q >>= 64 - expDiff;
3440 bSig >>= 2;
3441 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3442 }
3443 else {
3444 aSig >>= 2;
3445 bSig >>= 2;
3446 }
3447 do {
3448 alternateASig = aSig;
3449 ++q;
3450 aSig -= bSig;
bb98fe42 3451 } while ( 0 <= (int64_t) aSig );
158142c2
FB
3452 sigMean = aSig + alternateASig;
3453 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3454 aSig = alternateASig;
3455 }
bb98fe42 3456 zSign = ( (int64_t) aSig < 0 );
158142c2
FB
3457 if ( zSign ) aSig = - aSig;
3458 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3459
3460}
3461
3462/*----------------------------------------------------------------------------
3463| Returns the square root of the double-precision floating-point value `a'.
3464| The operation is performed according to the IEC/IEEE Standard for Binary
3465| Floating-Point Arithmetic.
3466*----------------------------------------------------------------------------*/
3467
3468float64 float64_sqrt( float64 a STATUS_PARAM )
3469{
3470 flag aSign;
3471 int16 aExp, zExp;
bb98fe42
AF
3472 uint64_t aSig, zSig, doubleZSig;
3473 uint64_t rem0, rem1, term0, term1;
37d18660 3474 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3475
3476 aSig = extractFloat64Frac( a );
3477 aExp = extractFloat64Exp( a );
3478 aSign = extractFloat64Sign( a );
3479 if ( aExp == 0x7FF ) {
3480 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3481 if ( ! aSign ) return a;
3482 float_raise( float_flag_invalid STATUS_VAR);
3483 return float64_default_nan;
3484 }
3485 if ( aSign ) {
3486 if ( ( aExp | aSig ) == 0 ) return a;
3487 float_raise( float_flag_invalid STATUS_VAR);
3488 return float64_default_nan;
3489 }
3490 if ( aExp == 0 ) {
f090c9d4 3491 if ( aSig == 0 ) return float64_zero;
158142c2
FB
3492 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3493 }
3494 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3495 aSig |= LIT64( 0x0010000000000000 );
3496 zSig = estimateSqrt32( aExp, aSig>>21 );
3497 aSig <<= 9 - ( aExp & 1 );
3498 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3499 if ( ( zSig & 0x1FF ) <= 5 ) {
3500 doubleZSig = zSig<<1;
3501 mul64To128( zSig, zSig, &term0, &term1 );
3502 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3503 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3504 --zSig;
3505 doubleZSig -= 2;
3506 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3507 }
3508 zSig |= ( ( rem0 | rem1 ) != 0 );
3509 }
3510 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3511
3512}
3513
374dfc33
AJ
3514/*----------------------------------------------------------------------------
3515| Returns the binary log of the double-precision floating-point value `a'.
3516| The operation is performed according to the IEC/IEEE Standard for Binary
3517| Floating-Point Arithmetic.
3518*----------------------------------------------------------------------------*/
3519float64 float64_log2( float64 a STATUS_PARAM )
3520{
3521 flag aSign, zSign;
3522 int16 aExp;
bb98fe42 3523 uint64_t aSig, aSig0, aSig1, zSig, i;
37d18660 3524 a = float64_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
3525
3526 aSig = extractFloat64Frac( a );
3527 aExp = extractFloat64Exp( a );
3528 aSign = extractFloat64Sign( a );
3529
3530 if ( aExp == 0 ) {
3531 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3532 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3533 }
3534 if ( aSign ) {
3535 float_raise( float_flag_invalid STATUS_VAR);
3536 return float64_default_nan;
3537 }
3538 if ( aExp == 0x7FF ) {
3539 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3540 return a;
3541 }
3542
3543 aExp -= 0x3FF;
3544 aSig |= LIT64( 0x0010000000000000 );
3545 zSign = aExp < 0;
bb98fe42 3546 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
3547 for (i = 1LL << 51; i > 0; i >>= 1) {
3548 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3549 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3550 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3551 aSig >>= 1;
3552 zSig |= i;
3553 }
3554 }
3555
3556 if ( zSign )
3557 zSig = -zSig;
3558 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3559}
3560
158142c2
FB
3561/*----------------------------------------------------------------------------
3562| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
3563| corresponding value `b', and 0 otherwise. The invalid exception is raised
3564| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
3565| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3566*----------------------------------------------------------------------------*/
3567
b689362d 3568int float64_eq( float64 a, float64 b STATUS_PARAM )
158142c2 3569{
bb98fe42 3570 uint64_t av, bv;
37d18660
PM
3571 a = float64_squash_input_denormal(a STATUS_VAR);
3572 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3573
3574 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3575 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3576 ) {
b689362d 3577 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
3578 return 0;
3579 }
f090c9d4 3580 av = float64_val(a);
a1b91bb4 3581 bv = float64_val(b);
bb98fe42 3582 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
3583
3584}
3585
3586/*----------------------------------------------------------------------------
3587| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
3588| equal to the corresponding value `b', and 0 otherwise. The invalid
3589| exception is raised if either operand is a NaN. The comparison is performed
3590| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3591*----------------------------------------------------------------------------*/
3592
750afe93 3593int float64_le( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3594{
3595 flag aSign, bSign;
bb98fe42 3596 uint64_t av, bv;
37d18660
PM
3597 a = float64_squash_input_denormal(a STATUS_VAR);
3598 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3599
3600 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3601 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3602 ) {
3603 float_raise( float_flag_invalid STATUS_VAR);
3604 return 0;
3605 }
3606 aSign = extractFloat64Sign( a );
3607 bSign = extractFloat64Sign( b );
f090c9d4 3608 av = float64_val(a);
a1b91bb4 3609 bv = float64_val(b);
bb98fe42 3610 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3611 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3612
3613}
3614
3615/*----------------------------------------------------------------------------
3616| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
3617| the corresponding value `b', and 0 otherwise. The invalid exception is
3618| raised if either operand is a NaN. The comparison is performed according
3619| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3620*----------------------------------------------------------------------------*/
3621
750afe93 3622int float64_lt( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3623{
3624 flag aSign, bSign;
bb98fe42 3625 uint64_t av, bv;
158142c2 3626
37d18660
PM
3627 a = float64_squash_input_denormal(a STATUS_VAR);
3628 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3629 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3630 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3631 ) {
3632 float_raise( float_flag_invalid STATUS_VAR);
3633 return 0;
3634 }
3635 aSign = extractFloat64Sign( a );
3636 bSign = extractFloat64Sign( b );
f090c9d4 3637 av = float64_val(a);
a1b91bb4 3638 bv = float64_val(b);
bb98fe42 3639 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3640 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3641
3642}
3643
67b7861d
AJ
3644/*----------------------------------------------------------------------------
3645| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
3646| be compared, and 0 otherwise. The invalid exception is raised if either
3647| operand is a NaN. The comparison is performed according to the IEC/IEEE
3648| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
3649*----------------------------------------------------------------------------*/
3650
3651int float64_unordered( float64 a, float64 b STATUS_PARAM )
3652{
3653 a = float64_squash_input_denormal(a STATUS_VAR);
3654 b = float64_squash_input_denormal(b STATUS_VAR);
3655
3656 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3657 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3658 ) {
3659 float_raise( float_flag_invalid STATUS_VAR);
3660 return 1;
3661 }
3662 return 0;
3663}
3664
158142c2
FB
3665/*----------------------------------------------------------------------------
3666| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
3667| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3668| exception.The comparison is performed according to the IEC/IEEE Standard
3669| for Binary Floating-Point Arithmetic.
158142c2
FB
3670*----------------------------------------------------------------------------*/
3671
b689362d 3672int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
158142c2 3673{
bb98fe42 3674 uint64_t av, bv;
37d18660
PM
3675 a = float64_squash_input_denormal(a STATUS_VAR);
3676 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3677
3678 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3679 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3680 ) {
b689362d
AJ
3681 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3682 float_raise( float_flag_invalid STATUS_VAR);
3683 }
158142c2
FB
3684 return 0;
3685 }
f090c9d4 3686 av = float64_val(a);
a1b91bb4 3687 bv = float64_val(b);
bb98fe42 3688 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
3689
3690}
3691
3692/*----------------------------------------------------------------------------
3693| Returns 1 if the double-precision floating-point value `a' is less than or
3694| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3695| cause an exception. Otherwise, the comparison is performed according to the
3696| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3697*----------------------------------------------------------------------------*/
3698
750afe93 3699int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3700{
3701 flag aSign, bSign;
bb98fe42 3702 uint64_t av, bv;
37d18660
PM
3703 a = float64_squash_input_denormal(a STATUS_VAR);
3704 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3705
3706 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3707 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3708 ) {
3709 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3710 float_raise( float_flag_invalid STATUS_VAR);
3711 }
3712 return 0;
3713 }
3714 aSign = extractFloat64Sign( a );
3715 bSign = extractFloat64Sign( b );
f090c9d4 3716 av = float64_val(a);
a1b91bb4 3717 bv = float64_val(b);
bb98fe42 3718 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3719 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3720
3721}
3722
3723/*----------------------------------------------------------------------------
3724| Returns 1 if the double-precision floating-point value `a' is less than
3725| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3726| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3727| Standard for Binary Floating-Point Arithmetic.
3728*----------------------------------------------------------------------------*/
3729
750afe93 3730int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3731{
3732 flag aSign, bSign;
bb98fe42 3733 uint64_t av, bv;
37d18660
PM
3734 a = float64_squash_input_denormal(a STATUS_VAR);
3735 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3736
3737 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3738 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3739 ) {
3740 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3741 float_raise( float_flag_invalid STATUS_VAR);
3742 }
3743 return 0;
3744 }
3745 aSign = extractFloat64Sign( a );
3746 bSign = extractFloat64Sign( b );
f090c9d4 3747 av = float64_val(a);
a1b91bb4 3748 bv = float64_val(b);
bb98fe42 3749 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3750 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3751
3752}
3753
67b7861d
AJ
3754/*----------------------------------------------------------------------------
3755| Returns 1 if the double-precision floating-point values `a' and `b' cannot
3756| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3757| comparison is performed according to the IEC/IEEE Standard for Binary
3758| Floating-Point Arithmetic.
3759*----------------------------------------------------------------------------*/
3760
3761int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3762{
3763 a = float64_squash_input_denormal(a STATUS_VAR);
3764 b = float64_squash_input_denormal(b STATUS_VAR);
3765
3766 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3767 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3768 ) {
3769 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3770 float_raise( float_flag_invalid STATUS_VAR);
3771 }
3772 return 1;
3773 }
3774 return 0;
3775}
3776
158142c2
FB
3777/*----------------------------------------------------------------------------
3778| Returns the result of converting the extended double-precision floating-
3779| point value `a' to the 32-bit two's complement integer format. The
3780| conversion is performed according to the IEC/IEEE Standard for Binary
3781| Floating-Point Arithmetic---which means in particular that the conversion
3782| is rounded according to the current rounding mode. If `a' is a NaN, the
3783| largest positive integer is returned. Otherwise, if the conversion
3784| overflows, the largest integer with the same sign as `a' is returned.
3785*----------------------------------------------------------------------------*/
3786
3787int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3788{
3789 flag aSign;
3790 int32 aExp, shiftCount;
bb98fe42 3791 uint64_t aSig;
158142c2
FB
3792
3793 aSig = extractFloatx80Frac( a );
3794 aExp = extractFloatx80Exp( a );
3795 aSign = extractFloatx80Sign( a );
bb98fe42 3796 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
3797 shiftCount = 0x4037 - aExp;
3798 if ( shiftCount <= 0 ) shiftCount = 1;
3799 shift64RightJamming( aSig, shiftCount, &aSig );
3800 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3801
3802}
3803
3804/*----------------------------------------------------------------------------
3805| Returns the result of converting the extended double-precision floating-
3806| point value `a' to the 32-bit two's complement integer format. The
3807| conversion is performed according to the IEC/IEEE Standard for Binary
3808| Floating-Point Arithmetic, except that the conversion is always rounded
3809| toward zero. If `a' is a NaN, the largest positive integer is returned.
3810| Otherwise, if the conversion overflows, the largest integer with the same
3811| sign as `a' is returned.
3812*----------------------------------------------------------------------------*/
3813
3814int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3815{
3816 flag aSign;
3817 int32 aExp, shiftCount;
bb98fe42 3818 uint64_t aSig, savedASig;
158142c2
FB
3819 int32 z;
3820
3821 aSig = extractFloatx80Frac( a );
3822 aExp = extractFloatx80Exp( a );
3823 aSign = extractFloatx80Sign( a );
3824 if ( 0x401E < aExp ) {
bb98fe42 3825 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
3826 goto invalid;
3827 }
3828 else if ( aExp < 0x3FFF ) {
3829 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3830 return 0;
3831 }
3832 shiftCount = 0x403E - aExp;
3833 savedASig = aSig;
3834 aSig >>= shiftCount;
3835 z = aSig;
3836 if ( aSign ) z = - z;
3837 if ( ( z < 0 ) ^ aSign ) {
3838 invalid:
3839 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 3840 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
3841 }
3842 if ( ( aSig<<shiftCount ) != savedASig ) {
3843 STATUS(float_exception_flags) |= float_flag_inexact;
3844 }
3845 return z;
3846
3847}
3848
3849/*----------------------------------------------------------------------------
3850| Returns the result of converting the extended double-precision floating-
3851| point value `a' to the 64-bit two's complement integer format. The
3852| conversion is performed according to the IEC/IEEE Standard for Binary
3853| Floating-Point Arithmetic---which means in particular that the conversion
3854| is rounded according to the current rounding mode. If `a' is a NaN,
3855| the largest positive integer is returned. Otherwise, if the conversion
3856| overflows, the largest integer with the same sign as `a' is returned.
3857*----------------------------------------------------------------------------*/
3858
3859int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3860{
3861 flag aSign;
3862 int32 aExp, shiftCount;
bb98fe42 3863 uint64_t aSig, aSigExtra;
158142c2
FB
3864
3865 aSig = extractFloatx80Frac( a );
3866 aExp = extractFloatx80Exp( a );
3867 aSign = extractFloatx80Sign( a );
3868 shiftCount = 0x403E - aExp;
3869 if ( shiftCount <= 0 ) {
3870 if ( shiftCount ) {
3871 float_raise( float_flag_invalid STATUS_VAR);
3872 if ( ! aSign
3873 || ( ( aExp == 0x7FFF )
3874 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3875 ) {
3876 return LIT64( 0x7FFFFFFFFFFFFFFF );
3877 }
bb98fe42 3878 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3879 }
3880 aSigExtra = 0;
3881 }
3882 else {
3883 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3884 }
3885 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3886
3887}
3888
3889/*----------------------------------------------------------------------------
3890| Returns the result of converting the extended double-precision floating-
3891| point value `a' to the 64-bit two's complement integer format. The
3892| conversion is performed according to the IEC/IEEE Standard for Binary
3893| Floating-Point Arithmetic, except that the conversion is always rounded
3894| toward zero. If `a' is a NaN, the largest positive integer is returned.
3895| Otherwise, if the conversion overflows, the largest integer with the same
3896| sign as `a' is returned.
3897*----------------------------------------------------------------------------*/
3898
3899int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3900{
3901 flag aSign;
3902 int32 aExp, shiftCount;
bb98fe42 3903 uint64_t aSig;
158142c2
FB
3904 int64 z;
3905
3906 aSig = extractFloatx80Frac( a );
3907 aExp = extractFloatx80Exp( a );
3908 aSign = extractFloatx80Sign( a );
3909 shiftCount = aExp - 0x403E;
3910 if ( 0 <= shiftCount ) {
3911 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3912 if ( ( a.high != 0xC03E ) || aSig ) {
3913 float_raise( float_flag_invalid STATUS_VAR);
3914 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3915 return LIT64( 0x7FFFFFFFFFFFFFFF );
3916 }
3917 }
bb98fe42 3918 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3919 }
3920 else if ( aExp < 0x3FFF ) {
3921 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3922 return 0;
3923 }
3924 z = aSig>>( - shiftCount );
bb98fe42 3925 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
3926 STATUS(float_exception_flags) |= float_flag_inexact;
3927 }
3928 if ( aSign ) z = - z;
3929 return z;
3930
3931}
3932
3933/*----------------------------------------------------------------------------
3934| Returns the result of converting the extended double-precision floating-
3935| point value `a' to the single-precision floating-point format. The
3936| conversion is performed according to the IEC/IEEE Standard for Binary
3937| Floating-Point Arithmetic.
3938*----------------------------------------------------------------------------*/
3939
3940float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3941{
3942 flag aSign;
3943 int32 aExp;
bb98fe42 3944 uint64_t aSig;
158142c2
FB
3945
3946 aSig = extractFloatx80Frac( a );
3947 aExp = extractFloatx80Exp( a );
3948 aSign = extractFloatx80Sign( a );
3949 if ( aExp == 0x7FFF ) {
bb98fe42 3950 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 3951 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3952 }
3953 return packFloat32( aSign, 0xFF, 0 );
3954 }
3955 shift64RightJamming( aSig, 33, &aSig );
3956 if ( aExp || aSig ) aExp -= 0x3F81;
3957 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3958
3959}
3960
3961/*----------------------------------------------------------------------------
3962| Returns the result of converting the extended double-precision floating-
3963| point value `a' to the double-precision floating-point format. The
3964| conversion is performed according to the IEC/IEEE Standard for Binary
3965| Floating-Point Arithmetic.
3966*----------------------------------------------------------------------------*/
3967
3968float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3969{
3970 flag aSign;
3971 int32 aExp;
bb98fe42 3972 uint64_t aSig, zSig;
158142c2
FB
3973
3974 aSig = extractFloatx80Frac( a );
3975 aExp = extractFloatx80Exp( a );
3976 aSign = extractFloatx80Sign( a );
3977 if ( aExp == 0x7FFF ) {
bb98fe42 3978 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 3979 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3980 }
3981 return packFloat64( aSign, 0x7FF, 0 );
3982 }
3983 shift64RightJamming( aSig, 1, &zSig );
3984 if ( aExp || aSig ) aExp -= 0x3C01;
3985 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3986
3987}
3988
158142c2
FB
3989/*----------------------------------------------------------------------------
3990| Returns the result of converting the extended double-precision floating-
3991| point value `a' to the quadruple-precision floating-point format. The
3992| conversion is performed according to the IEC/IEEE Standard for Binary
3993| Floating-Point Arithmetic.
3994*----------------------------------------------------------------------------*/
3995
3996float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3997{
3998 flag aSign;
3999 int16 aExp;
bb98fe42 4000 uint64_t aSig, zSig0, zSig1;
158142c2
FB
4001
4002 aSig = extractFloatx80Frac( a );
4003 aExp = extractFloatx80Exp( a );
4004 aSign = extractFloatx80Sign( a );
bb98fe42 4005 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4006 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4007 }
4008 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4009 return packFloat128( aSign, aExp, zSig0, zSig1 );
4010
4011}
4012
158142c2
FB
4013/*----------------------------------------------------------------------------
4014| Rounds the extended double-precision floating-point value `a' to an integer,
4015| and returns the result as an extended quadruple-precision floating-point
4016| value. The operation is performed according to the IEC/IEEE Standard for
4017| Binary Floating-Point Arithmetic.
4018*----------------------------------------------------------------------------*/
4019
4020floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4021{
4022 flag aSign;
4023 int32 aExp;
bb98fe42 4024 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4025 int8 roundingMode;
4026 floatx80 z;
4027
4028 aExp = extractFloatx80Exp( a );
4029 if ( 0x403E <= aExp ) {
bb98fe42 4030 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
158142c2
FB
4031 return propagateFloatx80NaN( a, a STATUS_VAR );
4032 }
4033 return a;
4034 }
4035 if ( aExp < 0x3FFF ) {
4036 if ( ( aExp == 0 )
bb98fe42 4037 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4038 return a;
4039 }
4040 STATUS(float_exception_flags) |= float_flag_inexact;
4041 aSign = extractFloatx80Sign( a );
4042 switch ( STATUS(float_rounding_mode) ) {
4043 case float_round_nearest_even:
bb98fe42 4044 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4045 ) {
4046 return
4047 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4048 }
4049 break;
4050 case float_round_down:
4051 return
4052 aSign ?
4053 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4054 : packFloatx80( 0, 0, 0 );
4055 case float_round_up:
4056 return
4057 aSign ? packFloatx80( 1, 0, 0 )
4058 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4059 }
4060 return packFloatx80( aSign, 0, 0 );
4061 }
4062 lastBitMask = 1;
4063 lastBitMask <<= 0x403E - aExp;
4064 roundBitsMask = lastBitMask - 1;
4065 z = a;
4066 roundingMode = STATUS(float_rounding_mode);
4067 if ( roundingMode == float_round_nearest_even ) {
4068 z.low += lastBitMask>>1;
4069 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4070 }
4071 else if ( roundingMode != float_round_to_zero ) {
4072 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4073 z.low += roundBitsMask;
4074 }
4075 }
4076 z.low &= ~ roundBitsMask;
4077 if ( z.low == 0 ) {
4078 ++z.high;
4079 z.low = LIT64( 0x8000000000000000 );
4080 }
4081 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4082 return z;
4083
4084}
4085
4086/*----------------------------------------------------------------------------
4087| Returns the result of adding the absolute values of the extended double-
4088| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4089| negated before being returned. `zSign' is ignored if the result is a NaN.
4090| The addition is performed according to the IEC/IEEE Standard for Binary
4091| Floating-Point Arithmetic.
4092*----------------------------------------------------------------------------*/
4093
4094static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4095{
4096 int32 aExp, bExp, zExp;
bb98fe42 4097 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4098 int32 expDiff;
4099
4100 aSig = extractFloatx80Frac( a );
4101 aExp = extractFloatx80Exp( a );
4102 bSig = extractFloatx80Frac( b );
4103 bExp = extractFloatx80Exp( b );
4104 expDiff = aExp - bExp;
4105 if ( 0 < expDiff ) {
4106 if ( aExp == 0x7FFF ) {
bb98fe42 4107 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4108 return a;
4109 }
4110 if ( bExp == 0 ) --expDiff;
4111 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4112 zExp = aExp;
4113 }
4114 else if ( expDiff < 0 ) {
4115 if ( bExp == 0x7FFF ) {
bb98fe42 4116 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4117 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4118 }
4119 if ( aExp == 0 ) ++expDiff;
4120 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4121 zExp = bExp;
4122 }
4123 else {
4124 if ( aExp == 0x7FFF ) {
bb98fe42 4125 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4126 return propagateFloatx80NaN( a, b STATUS_VAR );
4127 }
4128 return a;
4129 }
4130 zSig1 = 0;
4131 zSig0 = aSig + bSig;
4132 if ( aExp == 0 ) {
4133 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4134 goto roundAndPack;
4135 }
4136 zExp = aExp;
4137 goto shiftRight1;
4138 }
4139 zSig0 = aSig + bSig;
bb98fe42 4140 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
4141 shiftRight1:
4142 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4143 zSig0 |= LIT64( 0x8000000000000000 );
4144 ++zExp;
4145 roundAndPack:
4146 return
4147 roundAndPackFloatx80(
4148 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4149
4150}
4151
4152/*----------------------------------------------------------------------------
4153| Returns the result of subtracting the absolute values of the extended
4154| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4155| difference is negated before being returned. `zSign' is ignored if the
4156| result is a NaN. The subtraction is performed according to the IEC/IEEE
4157| Standard for Binary Floating-Point Arithmetic.
4158*----------------------------------------------------------------------------*/
4159
4160static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4161{
4162 int32 aExp, bExp, zExp;
bb98fe42 4163 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4164 int32 expDiff;
4165 floatx80 z;
4166
4167 aSig = extractFloatx80Frac( a );
4168 aExp = extractFloatx80Exp( a );
4169 bSig = extractFloatx80Frac( b );
4170 bExp = extractFloatx80Exp( b );
4171 expDiff = aExp - bExp;
4172 if ( 0 < expDiff ) goto aExpBigger;
4173 if ( expDiff < 0 ) goto bExpBigger;
4174 if ( aExp == 0x7FFF ) {
bb98fe42 4175 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4176 return propagateFloatx80NaN( a, b STATUS_VAR );
4177 }
4178 float_raise( float_flag_invalid STATUS_VAR);
4179 z.low = floatx80_default_nan_low;
4180 z.high = floatx80_default_nan_high;
4181 return z;
4182 }
4183 if ( aExp == 0 ) {
4184 aExp = 1;
4185 bExp = 1;
4186 }
4187 zSig1 = 0;
4188 if ( bSig < aSig ) goto aBigger;
4189 if ( aSig < bSig ) goto bBigger;
4190 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4191 bExpBigger:
4192 if ( bExp == 0x7FFF ) {
bb98fe42 4193 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4194 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4195 }
4196 if ( aExp == 0 ) ++expDiff;
4197 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4198 bBigger:
4199 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4200 zExp = bExp;
4201 zSign ^= 1;
4202 goto normalizeRoundAndPack;
4203 aExpBigger:
4204 if ( aExp == 0x7FFF ) {
bb98fe42 4205 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4206 return a;
4207 }
4208 if ( bExp == 0 ) --expDiff;
4209 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4210 aBigger:
4211 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4212 zExp = aExp;
4213 normalizeRoundAndPack:
4214 return
4215 normalizeRoundAndPackFloatx80(
4216 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4217
4218}
4219
4220/*----------------------------------------------------------------------------
4221| Returns the result of adding the extended double-precision floating-point
4222| values `a' and `b'. The operation is performed according to the IEC/IEEE
4223| Standard for Binary Floating-Point Arithmetic.
4224*----------------------------------------------------------------------------*/
4225
4226floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4227{
4228 flag aSign, bSign;
4229
4230 aSign = extractFloatx80Sign( a );
4231 bSign = extractFloatx80Sign( b );
4232 if ( aSign == bSign ) {
4233 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4234 }
4235 else {
4236 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4237 }
4238
4239}
4240
4241/*----------------------------------------------------------------------------
4242| Returns the result of subtracting the extended double-precision floating-
4243| point values `a' and `b'. The operation is performed according to the
4244| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4245*----------------------------------------------------------------------------*/
4246
4247floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4248{
4249 flag aSign, bSign;
4250
4251 aSign = extractFloatx80Sign( a );
4252 bSign = extractFloatx80Sign( b );
4253 if ( aSign == bSign ) {
4254 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4255 }
4256 else {
4257 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4258 }
4259
4260}
4261
4262/*----------------------------------------------------------------------------
4263| Returns the result of multiplying the extended double-precision floating-
4264| point values `a' and `b'. The operation is performed according to the
4265| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4266*----------------------------------------------------------------------------*/
4267
4268floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4269{
4270 flag aSign, bSign, zSign;
4271 int32 aExp, bExp, zExp;
bb98fe42 4272 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4273 floatx80 z;
4274
4275 aSig = extractFloatx80Frac( a );
4276 aExp = extractFloatx80Exp( a );
4277 aSign = extractFloatx80Sign( a );
4278 bSig = extractFloatx80Frac( b );
4279 bExp = extractFloatx80Exp( b );
4280 bSign = extractFloatx80Sign( b );
4281 zSign = aSign ^ bSign;
4282 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4283 if ( (uint64_t) ( aSig<<1 )
4284 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4285 return propagateFloatx80NaN( a, b STATUS_VAR );
4286 }
4287 if ( ( bExp | bSig ) == 0 ) goto invalid;
4288 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4289 }
4290 if ( bExp == 0x7FFF ) {
bb98fe42 4291 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4292 if ( ( aExp | aSig ) == 0 ) {
4293 invalid:
4294 float_raise( float_flag_invalid STATUS_VAR);
4295 z.low = floatx80_default_nan_low;
4296 z.high = floatx80_default_nan_high;
4297 return z;
4298 }
4299 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4300 }
4301 if ( aExp == 0 ) {
4302 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4303 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4304 }
4305 if ( bExp == 0 ) {
4306 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4307 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4308 }
4309 zExp = aExp + bExp - 0x3FFE;
4310 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 4311 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
4312 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4313 --zExp;
4314 }
4315 return
4316 roundAndPackFloatx80(
4317 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4318
4319}
4320
4321/*----------------------------------------------------------------------------
4322| Returns the result of dividing the extended double-precision floating-point
4323| value `a' by the corresponding value `b'. The operation is performed
4324| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4325*----------------------------------------------------------------------------*/
4326
4327floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4328{
4329 flag aSign, bSign, zSign;
4330 int32 aExp, bExp, zExp;
bb98fe42
AF
4331 uint64_t aSig, bSig, zSig0, zSig1;
4332 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
4333 floatx80 z;
4334
4335 aSig = extractFloatx80Frac( a );
4336 aExp = extractFloatx80Exp( a );
4337 aSign = extractFloatx80Sign( a );
4338 bSig = extractFloatx80Frac( b );
4339 bExp = extractFloatx80Exp( b );
4340 bSign = extractFloatx80Sign( b );
4341 zSign = aSign ^ bSign;
4342 if ( aExp == 0x7FFF ) {
bb98fe42 4343 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2 4344 if ( bExp == 0x7FFF ) {
bb98fe42 4345 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4346 goto invalid;
4347 }
4348 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4349 }
4350 if ( bExp == 0x7FFF ) {
bb98fe42 4351 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4352 return packFloatx80( zSign, 0, 0 );
4353 }
4354 if ( bExp == 0 ) {
4355 if ( bSig == 0 ) {
4356 if ( ( aExp | aSig ) == 0 ) {
4357 invalid:
4358 float_raise( float_flag_invalid STATUS_VAR);
4359 z.low = floatx80_default_nan_low;
4360 z.high = floatx80_default_nan_high;
4361 return z;
4362 }
4363 float_raise( float_flag_divbyzero STATUS_VAR);
4364 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4365 }
4366 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4367 }
4368 if ( aExp == 0 ) {
4369 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4370 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4371 }
4372 zExp = aExp - bExp + 0x3FFE;
4373 rem1 = 0;
4374 if ( bSig <= aSig ) {
4375 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4376 ++zExp;
4377 }
4378 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4379 mul64To128( bSig, zSig0, &term0, &term1 );
4380 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 4381 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4382 --zSig0;
4383 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4384 }
4385 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 4386 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
4387 mul64To128( bSig, zSig1, &term1, &term2 );
4388 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 4389 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
4390 --zSig1;
4391 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4392 }
4393 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4394 }
4395 return
4396 roundAndPackFloatx80(
4397 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4398
4399}
4400
4401/*----------------------------------------------------------------------------
4402| Returns the remainder of the extended double-precision floating-point value
4403| `a' with respect to the corresponding value `b'. The operation is performed
4404| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4405*----------------------------------------------------------------------------*/
4406
4407floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4408{
ed086f3d 4409 flag aSign, zSign;
158142c2 4410 int32 aExp, bExp, expDiff;
bb98fe42
AF
4411 uint64_t aSig0, aSig1, bSig;
4412 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
4413 floatx80 z;
4414
4415 aSig0 = extractFloatx80Frac( a );
4416 aExp = extractFloatx80Exp( a );
4417 aSign = extractFloatx80Sign( a );
4418 bSig = extractFloatx80Frac( b );
4419 bExp = extractFloatx80Exp( b );
158142c2 4420 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4421 if ( (uint64_t) ( aSig0<<1 )
4422 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4423 return propagateFloatx80NaN( a, b STATUS_VAR );
4424 }
4425 goto invalid;
4426 }
4427 if ( bExp == 0x7FFF ) {
bb98fe42 4428 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4429 return a;
4430 }
4431 if ( bExp == 0 ) {
4432 if ( bSig == 0 ) {
4433 invalid:
4434 float_raise( float_flag_invalid STATUS_VAR);
4435 z.low = floatx80_default_nan_low;
4436 z.high = floatx80_default_nan_high;
4437 return z;
4438 }
4439 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4440 }
4441 if ( aExp == 0 ) {
bb98fe42 4442 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
4443 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4444 }
4445 bSig |= LIT64( 0x8000000000000000 );
4446 zSign = aSign;
4447 expDiff = aExp - bExp;
4448 aSig1 = 0;
4449 if ( expDiff < 0 ) {
4450 if ( expDiff < -1 ) return a;
4451 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4452 expDiff = 0;
4453 }
4454 q = ( bSig <= aSig0 );
4455 if ( q ) aSig0 -= bSig;
4456 expDiff -= 64;
4457 while ( 0 < expDiff ) {
4458 q = estimateDiv128To64( aSig0, aSig1, bSig );
4459 q = ( 2 < q ) ? q - 2 : 0;
4460 mul64To128( bSig, q, &term0, &term1 );
4461 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4462 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4463 expDiff -= 62;
4464 }
4465 expDiff += 64;
4466 if ( 0 < expDiff ) {
4467 q = estimateDiv128To64( aSig0, aSig1, bSig );
4468 q = ( 2 < q ) ? q - 2 : 0;
4469 q >>= 64 - expDiff;
4470 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4471 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4472 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4473 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4474 ++q;
4475 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4476 }
4477 }
4478 else {
4479 term1 = 0;
4480 term0 = bSig;
4481 }
4482 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4483 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4484 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4485 && ( q & 1 ) )
4486 ) {
4487 aSig0 = alternateASig0;
4488 aSig1 = alternateASig1;
4489 zSign = ! zSign;
4490 }
4491 return
4492 normalizeRoundAndPackFloatx80(
4493 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4494
4495}
4496
4497/*----------------------------------------------------------------------------
4498| Returns the square root of the extended double-precision floating-point
4499| value `a'. The operation is performed according to the IEC/IEEE Standard
4500| for Binary Floating-Point Arithmetic.
4501*----------------------------------------------------------------------------*/
4502
4503floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4504{
4505 flag aSign;
4506 int32 aExp, zExp;
bb98fe42
AF
4507 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4508 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
4509 floatx80 z;
4510
4511 aSig0 = extractFloatx80Frac( a );
4512 aExp = extractFloatx80Exp( a );
4513 aSign = extractFloatx80Sign( a );
4514 if ( aExp == 0x7FFF ) {
bb98fe42 4515 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
158142c2
FB
4516 if ( ! aSign ) return a;
4517 goto invalid;
4518 }
4519 if ( aSign ) {
4520 if ( ( aExp | aSig0 ) == 0 ) return a;
4521 invalid:
4522 float_raise( float_flag_invalid STATUS_VAR);
4523 z.low = floatx80_default_nan_low;
4524 z.high = floatx80_default_nan_high;
4525 return z;
4526 }
4527 if ( aExp == 0 ) {
4528 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4529 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4530 }
4531 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4532 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4533 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4534 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4535 doubleZSig0 = zSig0<<1;
4536 mul64To128( zSig0, zSig0, &term0, &term1 );
4537 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 4538 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4539 --zSig0;
4540 doubleZSig0 -= 2;
4541 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4542 }
4543 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4544 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4545 if ( zSig1 == 0 ) zSig1 = 1;
4546 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4547 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4548 mul64To128( zSig1, zSig1, &term2, &term3 );
4549 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 4550 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
4551 --zSig1;
4552 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4553 term3 |= 1;
4554 term2 |= doubleZSig0;
4555 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4556 }
4557 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4558 }
4559 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4560 zSig0 |= doubleZSig0;
4561 return
4562 roundAndPackFloatx80(
4563 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4564
4565}
4566
4567/*----------------------------------------------------------------------------
b689362d
AJ
4568| Returns 1 if the extended double-precision floating-point value `a' is equal
4569| to the corresponding value `b', and 0 otherwise. The invalid exception is
4570| raised if either operand is a NaN. Otherwise, the comparison is performed
4571| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4572*----------------------------------------------------------------------------*/
4573
b689362d 4574int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4575{
4576
4577 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4578 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4579 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4580 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 4581 ) {
b689362d 4582 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
4583 return 0;
4584 }
4585 return
4586 ( a.low == b.low )
4587 && ( ( a.high == b.high )
4588 || ( ( a.low == 0 )
bb98fe42 4589 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
4590 );
4591
4592}
4593
4594/*----------------------------------------------------------------------------
4595| Returns 1 if the extended double-precision floating-point value `a' is
4596| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
4597| invalid exception is raised if either operand is a NaN. The comparison is
4598| performed according to the IEC/IEEE Standard for Binary Floating-Point
4599| Arithmetic.
158142c2
FB
4600*----------------------------------------------------------------------------*/
4601
750afe93 4602int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4603{
4604 flag aSign, bSign;
4605
4606 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4607 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4608 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4609 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4610 ) {
4611 float_raise( float_flag_invalid STATUS_VAR);
4612 return 0;
4613 }
4614 aSign = extractFloatx80Sign( a );
4615 bSign = extractFloatx80Sign( b );
4616 if ( aSign != bSign ) {
4617 return
4618 aSign
bb98fe42 4619 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4620 == 0 );
4621 }
4622 return
4623 aSign ? le128( b.high, b.low, a.high, a.low )
4624 : le128( a.high, a.low, b.high, b.low );
4625
4626}
4627
4628/*----------------------------------------------------------------------------
4629| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
4630| less than the corresponding value `b', and 0 otherwise. The invalid
4631| exception is raised if either operand is a NaN. The comparison is performed
4632| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4633*----------------------------------------------------------------------------*/
4634
750afe93 4635int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4636{
4637 flag aSign, bSign;
4638
4639 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4640 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4641 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4642 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4643 ) {
4644 float_raise( float_flag_invalid STATUS_VAR);
4645 return 0;
4646 }
4647 aSign = extractFloatx80Sign( a );
4648 bSign = extractFloatx80Sign( b );
4649 if ( aSign != bSign ) {
4650 return
4651 aSign
bb98fe42 4652 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4653 != 0 );
4654 }
4655 return
4656 aSign ? lt128( b.high, b.low, a.high, a.low )
4657 : lt128( a.high, a.low, b.high, b.low );
4658
4659}
4660
67b7861d
AJ
4661/*----------------------------------------------------------------------------
4662| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
4663| cannot be compared, and 0 otherwise. The invalid exception is raised if
4664| either operand is a NaN. The comparison is performed according to the
4665| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4666*----------------------------------------------------------------------------*/
4667int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4668{
4669 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4670 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4671 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4672 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4673 ) {
4674 float_raise( float_flag_invalid STATUS_VAR);
4675 return 1;
4676 }
4677 return 0;
4678}
4679
158142c2 4680/*----------------------------------------------------------------------------
b689362d 4681| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
4682| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4683| cause an exception. The comparison is performed according to the IEC/IEEE
4684| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4685*----------------------------------------------------------------------------*/
4686
b689362d 4687int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4688{
4689
4690 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4691 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4692 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4693 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 4694 ) {
b689362d
AJ
4695 if ( floatx80_is_signaling_nan( a )
4696 || floatx80_is_signaling_nan( b ) ) {
4697 float_raise( float_flag_invalid STATUS_VAR);
4698 }
158142c2
FB
4699 return 0;
4700 }
4701 return
4702 ( a.low == b.low )
4703 && ( ( a.high == b.high )
4704 || ( ( a.low == 0 )
bb98fe42 4705 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
4706 );
4707
4708}
4709
4710/*----------------------------------------------------------------------------
4711| Returns 1 if the extended double-precision floating-point value `a' is less
4712| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4713| do not cause an exception. Otherwise, the comparison is performed according
4714| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4715*----------------------------------------------------------------------------*/
4716
750afe93 4717int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4718{
4719 flag aSign, bSign;
4720
4721 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4722 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4723 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4724 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4725 ) {
4726 if ( floatx80_is_signaling_nan( a )
4727 || floatx80_is_signaling_nan( b ) ) {
4728 float_raise( float_flag_invalid STATUS_VAR);
4729 }
4730 return 0;
4731 }
4732 aSign = extractFloatx80Sign( a );
4733 bSign = extractFloatx80Sign( b );
4734 if ( aSign != bSign ) {
4735 return
4736 aSign
bb98fe42 4737 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4738 == 0 );
4739 }
4740 return
4741 aSign ? le128( b.high, b.low, a.high, a.low )
4742 : le128( a.high, a.low, b.high, b.low );
4743
4744}
4745
4746/*----------------------------------------------------------------------------
4747| Returns 1 if the extended double-precision floating-point value `a' is less
4748| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4749| an exception. Otherwise, the comparison is performed according to the
4750| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4751*----------------------------------------------------------------------------*/
4752
750afe93 4753int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4754{
4755 flag aSign, bSign;
4756
4757 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4758 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4759 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4760 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4761 ) {
4762 if ( floatx80_is_signaling_nan( a )
4763 || floatx80_is_signaling_nan( b ) ) {
4764 float_raise( float_flag_invalid STATUS_VAR);
4765 }
4766 return 0;
4767 }
4768 aSign = extractFloatx80Sign( a );
4769 bSign = extractFloatx80Sign( b );
4770 if ( aSign != bSign ) {
4771 return
4772 aSign
bb98fe42 4773 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4774 != 0 );
4775 }
4776 return
4777 aSign ? lt128( b.high, b.low, a.high, a.low )
4778 : lt128( a.high, a.low, b.high, b.low );
4779
4780}
4781
67b7861d
AJ
4782/*----------------------------------------------------------------------------
4783| Returns 1 if the extended double-precision floating-point values `a' and `b'
4784| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
4785| The comparison is performed according to the IEC/IEEE Standard for Binary
4786| Floating-Point Arithmetic.
4787*----------------------------------------------------------------------------*/
4788int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4789{
4790 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4791 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4792 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4793 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4794 ) {
4795 if ( floatx80_is_signaling_nan( a )
4796 || floatx80_is_signaling_nan( b ) ) {
4797 float_raise( float_flag_invalid STATUS_VAR);
4798 }
4799 return 1;
4800 }
4801 return 0;
4802}
4803
158142c2
FB
4804/*----------------------------------------------------------------------------
4805| Returns the result of converting the quadruple-precision floating-point
4806| value `a' to the 32-bit two's complement integer format. The conversion
4807| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4808| Arithmetic---which means in particular that the conversion is rounded
4809| according to the current rounding mode. If `a' is a NaN, the largest
4810| positive integer is returned. Otherwise, if the conversion overflows, the
4811| largest integer with the same sign as `a' is returned.
4812*----------------------------------------------------------------------------*/
4813
4814int32 float128_to_int32( float128 a STATUS_PARAM )
4815{
4816 flag aSign;
4817 int32 aExp, shiftCount;
bb98fe42 4818 uint64_t aSig0, aSig1;
158142c2
FB
4819
4820 aSig1 = extractFloat128Frac1( a );
4821 aSig0 = extractFloat128Frac0( a );
4822 aExp = extractFloat128Exp( a );
4823 aSign = extractFloat128Sign( a );
4824 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4825 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4826 aSig0 |= ( aSig1 != 0 );
4827 shiftCount = 0x4028 - aExp;
4828 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4829 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4830
4831}
4832
4833/*----------------------------------------------------------------------------
4834| Returns the result of converting the quadruple-precision floating-point
4835| value `a' to the 32-bit two's complement integer format. The conversion
4836| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4837| Arithmetic, except that the conversion is always rounded toward zero. If
4838| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4839| conversion overflows, the largest integer with the same sign as `a' is
4840| returned.
4841*----------------------------------------------------------------------------*/
4842
4843int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4844{
4845 flag aSign;
4846 int32 aExp, shiftCount;
bb98fe42 4847 uint64_t aSig0, aSig1, savedASig;
158142c2
FB
4848 int32 z;
4849
4850 aSig1 = extractFloat128Frac1( a );
4851 aSig0 = extractFloat128Frac0( a );
4852 aExp = extractFloat128Exp( a );
4853 aSign = extractFloat128Sign( a );
4854 aSig0 |= ( aSig1 != 0 );
4855 if ( 0x401E < aExp ) {
4856 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4857 goto invalid;
4858 }
4859 else if ( aExp < 0x3FFF ) {
4860 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4861 return 0;
4862 }
4863 aSig0 |= LIT64( 0x0001000000000000 );
4864 shiftCount = 0x402F - aExp;
4865 savedASig = aSig0;
4866 aSig0 >>= shiftCount;
4867 z = aSig0;
4868 if ( aSign ) z = - z;
4869 if ( ( z < 0 ) ^ aSign ) {
4870 invalid:
4871 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 4872 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4873 }
4874 if ( ( aSig0<<shiftCount ) != savedASig ) {
4875 STATUS(float_exception_flags) |= float_flag_inexact;
4876 }
4877 return z;
4878
4879}
4880
4881/*----------------------------------------------------------------------------
4882| Returns the result of converting the quadruple-precision floating-point
4883| value `a' to the 64-bit two's complement integer format. The conversion
4884| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4885| Arithmetic---which means in particular that the conversion is rounded
4886| according to the current rounding mode. If `a' is a NaN, the largest
4887| positive integer is returned. Otherwise, if the conversion overflows, the
4888| largest integer with the same sign as `a' is returned.
4889*----------------------------------------------------------------------------*/
4890
4891int64 float128_to_int64( float128 a STATUS_PARAM )
4892{
4893 flag aSign;
4894 int32 aExp, shiftCount;
bb98fe42 4895 uint64_t aSig0, aSig1;
158142c2
FB
4896
4897 aSig1 = extractFloat128Frac1( a );
4898 aSig0 = extractFloat128Frac0( a );
4899 aExp = extractFloat128Exp( a );
4900 aSign = extractFloat128Sign( a );
4901 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4902 shiftCount = 0x402F - aExp;
4903 if ( shiftCount <= 0 ) {
4904 if ( 0x403E < aExp ) {
4905 float_raise( float_flag_invalid STATUS_VAR);
4906 if ( ! aSign
4907 || ( ( aExp == 0x7FFF )
4908 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4909 )
4910 ) {
4911 return LIT64( 0x7FFFFFFFFFFFFFFF );
4912 }
bb98fe42 4913 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4914 }
4915 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4916 }
4917 else {
4918 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4919 }
4920 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4921
4922}
4923
4924/*----------------------------------------------------------------------------
4925| Returns the result of converting the quadruple-precision floating-point
4926| value `a' to the 64-bit two's complement integer format. The conversion
4927| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4928| Arithmetic, except that the conversion is always rounded toward zero.
4929| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4930| the conversion overflows, the largest integer with the same sign as `a' is
4931| returned.
4932*----------------------------------------------------------------------------*/
4933
4934int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4935{
4936 flag aSign;
4937 int32 aExp, shiftCount;
bb98fe42 4938 uint64_t aSig0, aSig1;
158142c2
FB
4939 int64 z;
4940
4941 aSig1 = extractFloat128Frac1( a );
4942 aSig0 = extractFloat128Frac0( a );
4943 aExp = extractFloat128Exp( a );
4944 aSign = extractFloat128Sign( a );
4945 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4946 shiftCount = aExp - 0x402F;
4947 if ( 0 < shiftCount ) {
4948 if ( 0x403E <= aExp ) {
4949 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4950 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4951 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4952 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4953 }
4954 else {
4955 float_raise( float_flag_invalid STATUS_VAR);
4956 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4957 return LIT64( 0x7FFFFFFFFFFFFFFF );
4958 }
4959 }
bb98fe42 4960 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4961 }
4962 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 4963 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
158142c2
FB
4964 STATUS(float_exception_flags) |= float_flag_inexact;
4965 }
4966 }
4967 else {
4968 if ( aExp < 0x3FFF ) {
4969 if ( aExp | aSig0 | aSig1 ) {
4970 STATUS(float_exception_flags) |= float_flag_inexact;
4971 }
4972 return 0;
4973 }
4974 z = aSig0>>( - shiftCount );
4975 if ( aSig1
bb98fe42 4976 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
158142c2
FB
4977 STATUS(float_exception_flags) |= float_flag_inexact;
4978 }
4979 }
4980 if ( aSign ) z = - z;
4981 return z;
4982
4983}
4984
4985/*----------------------------------------------------------------------------
4986| Returns the result of converting the quadruple-precision floating-point
4987| value `a' to the single-precision floating-point format. The conversion
4988| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4989| Arithmetic.
4990*----------------------------------------------------------------------------*/
4991
4992float32 float128_to_float32( float128 a STATUS_PARAM )
4993{
4994 flag aSign;
4995 int32 aExp;
bb98fe42
AF
4996 uint64_t aSig0, aSig1;
4997 uint32_t zSig;
158142c2
FB
4998
4999 aSig1 = extractFloat128Frac1( a );
5000 aSig0 = extractFloat128Frac0( a );
5001 aExp = extractFloat128Exp( a );
5002 aSign = extractFloat128Sign( a );
5003 if ( aExp == 0x7FFF ) {
5004 if ( aSig0 | aSig1 ) {
bcd4d9af 5005 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5006 }
5007 return packFloat32( aSign, 0xFF, 0 );
5008 }
5009 aSig0 |= ( aSig1 != 0 );
5010 shift64RightJamming( aSig0, 18, &aSig0 );
5011 zSig = aSig0;
5012 if ( aExp || zSig ) {
5013 zSig |= 0x40000000;
5014 aExp -= 0x3F81;
5015 }
5016 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5017
5018}
5019
5020/*----------------------------------------------------------------------------
5021| Returns the result of converting the quadruple-precision floating-point
5022| value `a' to the double-precision floating-point format. The conversion
5023| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5024| Arithmetic.
5025*----------------------------------------------------------------------------*/
5026
5027float64 float128_to_float64( float128 a STATUS_PARAM )
5028{
5029 flag aSign;
5030 int32 aExp;
bb98fe42 5031 uint64_t aSig0, aSig1;
158142c2
FB
5032
5033 aSig1 = extractFloat128Frac1( a );
5034 aSig0 = extractFloat128Frac0( a );
5035 aExp = extractFloat128Exp( a );
5036 aSign = extractFloat128Sign( a );
5037 if ( aExp == 0x7FFF ) {
5038 if ( aSig0 | aSig1 ) {
bcd4d9af 5039 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5040 }
5041 return packFloat64( aSign, 0x7FF, 0 );
5042 }
5043 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5044 aSig0 |= ( aSig1 != 0 );
5045 if ( aExp || aSig0 ) {
5046 aSig0 |= LIT64( 0x4000000000000000 );
5047 aExp -= 0x3C01;
5048 }
5049 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5050
5051}
5052
158142c2
FB
5053/*----------------------------------------------------------------------------
5054| Returns the result of converting the quadruple-precision floating-point
5055| value `a' to the extended double-precision floating-point format. The
5056| conversion is performed according to the IEC/IEEE Standard for Binary
5057| Floating-Point Arithmetic.
5058*----------------------------------------------------------------------------*/
5059
5060floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5061{
5062 flag aSign;
5063 int32 aExp;
bb98fe42 5064 uint64_t aSig0, aSig1;
158142c2
FB
5065
5066 aSig1 = extractFloat128Frac1( a );
5067 aSig0 = extractFloat128Frac0( a );
5068 aExp = extractFloat128Exp( a );
5069 aSign = extractFloat128Sign( a );
5070 if ( aExp == 0x7FFF ) {
5071 if ( aSig0 | aSig1 ) {
bcd4d9af 5072 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5073 }
5074 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5075 }
5076 if ( aExp == 0 ) {
5077 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5078 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5079 }
5080 else {
5081 aSig0 |= LIT64( 0x0001000000000000 );
5082 }
5083 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5084 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5085
5086}
5087
158142c2
FB
5088/*----------------------------------------------------------------------------
5089| Rounds the quadruple-precision floating-point value `a' to an integer, and
5090| returns the result as a quadruple-precision floating-point value. The
5091| operation is performed according to the IEC/IEEE Standard for Binary
5092| Floating-Point Arithmetic.
5093*----------------------------------------------------------------------------*/
5094
5095float128 float128_round_to_int( float128 a STATUS_PARAM )
5096{
5097 flag aSign;
5098 int32 aExp;
bb98fe42 5099 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5100 int8 roundingMode;
5101 float128 z;
5102
5103 aExp = extractFloat128Exp( a );
5104 if ( 0x402F <= aExp ) {
5105 if ( 0x406F <= aExp ) {
5106 if ( ( aExp == 0x7FFF )
5107 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5108 ) {
5109 return propagateFloat128NaN( a, a STATUS_VAR );
5110 }
5111 return a;
5112 }
5113 lastBitMask = 1;
5114 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5115 roundBitsMask = lastBitMask - 1;
5116 z = a;
5117 roundingMode = STATUS(float_rounding_mode);
5118 if ( roundingMode == float_round_nearest_even ) {
5119 if ( lastBitMask ) {
5120 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5121 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5122 }
5123 else {
bb98fe42 5124 if ( (int64_t) z.low < 0 ) {
158142c2 5125 ++z.high;
bb98fe42 5126 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
5127 }
5128 }
5129 }
5130 else if ( roundingMode != float_round_to_zero ) {
5131 if ( extractFloat128Sign( z )
5132 ^ ( roundingMode == float_round_up ) ) {
5133 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5134 }
5135 }
5136 z.low &= ~ roundBitsMask;
5137 }
5138 else {
5139 if ( aExp < 0x3FFF ) {
bb98fe42 5140 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
158142c2
FB
5141 STATUS(float_exception_flags) |= float_flag_inexact;
5142 aSign = extractFloat128Sign( a );
5143 switch ( STATUS(float_rounding_mode) ) {
5144 case float_round_nearest_even:
5145 if ( ( aExp == 0x3FFE )
5146 && ( extractFloat128Frac0( a )
5147 | extractFloat128Frac1( a ) )
5148 ) {
5149 return packFloat128( aSign, 0x3FFF, 0, 0 );
5150 }
5151 break;
5152 case float_round_down:
5153 return
5154 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5155 : packFloat128( 0, 0, 0, 0 );
5156 case float_round_up:
5157 return
5158 aSign ? packFloat128( 1, 0, 0, 0 )
5159 : packFloat128( 0, 0x3FFF, 0, 0 );
5160 }
5161 return packFloat128( aSign, 0, 0, 0 );
5162 }
5163 lastBitMask = 1;
5164 lastBitMask <<= 0x402F - aExp;
5165 roundBitsMask = lastBitMask - 1;
5166 z.low = 0;
5167 z.high = a.high;
5168 roundingMode = STATUS(float_rounding_mode);
5169 if ( roundingMode == float_round_nearest_even ) {
5170 z.high += lastBitMask>>1;
5171 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5172 z.high &= ~ lastBitMask;
5173 }
5174 }
5175 else if ( roundingMode != float_round_to_zero ) {
5176 if ( extractFloat128Sign( z )
5177 ^ ( roundingMode == float_round_up ) ) {
5178 z.high |= ( a.low != 0 );
5179 z.high += roundBitsMask;
5180 }
5181 }
5182 z.high &= ~ roundBitsMask;
5183 }
5184 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5185 STATUS(float_exception_flags) |= float_flag_inexact;
5186 }
5187 return z;
5188
5189}
5190
5191/*----------------------------------------------------------------------------
5192| Returns the result of adding the absolute values of the quadruple-precision
5193| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5194| before being returned. `zSign' is ignored if the result is a NaN.
5195| The addition is performed according to the IEC/IEEE Standard for Binary
5196| Floating-Point Arithmetic.
5197*----------------------------------------------------------------------------*/
5198
5199static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5200{
5201 int32 aExp, bExp, zExp;
bb98fe42 5202 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
158142c2
FB
5203 int32 expDiff;
5204
5205 aSig1 = extractFloat128Frac1( a );
5206 aSig0 = extractFloat128Frac0( a );
5207 aExp = extractFloat128Exp( a );
5208 bSig1 = extractFloat128Frac1( b );
5209 bSig0 = extractFloat128Frac0( b );
5210 bExp = extractFloat128Exp( b );
5211 expDiff = aExp - bExp;
5212 if ( 0 < expDiff ) {
5213 if ( aExp == 0x7FFF ) {
5214 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5215 return a;
5216 }
5217 if ( bExp == 0 ) {
5218 --expDiff;
5219 }
5220 else {
5221 bSig0 |= LIT64( 0x0001000000000000 );
5222 }
5223 shift128ExtraRightJamming(
5224 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5225 zExp = aExp;
5226 }
5227 else if ( expDiff < 0 ) {
5228 if ( bExp == 0x7FFF ) {
5229 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5230 return packFloat128( zSign, 0x7FFF, 0, 0 );
5231 }
5232 if ( aExp == 0 ) {
5233 ++expDiff;
5234 }
5235 else {
5236 aSig0 |= LIT64( 0x0001000000000000 );
5237 }
5238 shift128ExtraRightJamming(
5239 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5240 zExp = bExp;
5241 }
5242 else {
5243 if ( aExp == 0x7FFF ) {
5244 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5245 return propagateFloat128NaN( a, b STATUS_VAR );
5246 }
5247 return a;
5248 }
5249 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 5250 if ( aExp == 0 ) {
e6afc87f
PM
5251 if (STATUS(flush_to_zero)) {
5252 if (zSig0 | zSig1) {
5253 float_raise(float_flag_output_denormal STATUS_VAR);
5254 }
5255 return packFloat128(zSign, 0, 0, 0);
5256 }
fe76d976
PB
5257 return packFloat128( zSign, 0, zSig0, zSig1 );
5258 }
158142c2
FB
5259 zSig2 = 0;
5260 zSig0 |= LIT64( 0x0002000000000000 );
5261 zExp = aExp;
5262 goto shiftRight1;
5263 }
5264 aSig0 |= LIT64( 0x0001000000000000 );
5265 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5266 --zExp;
5267 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5268 ++zExp;
5269 shiftRight1:
5270 shift128ExtraRightJamming(
5271 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5272 roundAndPack:
5273 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5274
5275}
5276
5277/*----------------------------------------------------------------------------
5278| Returns the result of subtracting the absolute values of the quadruple-
5279| precision floating-point values `a' and `b'. If `zSign' is 1, the
5280| difference is negated before being returned. `zSign' is ignored if the
5281| result is a NaN. The subtraction is performed according to the IEC/IEEE
5282| Standard for Binary Floating-Point Arithmetic.
5283*----------------------------------------------------------------------------*/
5284
5285static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5286{
5287 int32 aExp, bExp, zExp;
bb98fe42 5288 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
158142c2
FB
5289 int32 expDiff;
5290 float128 z;
5291
5292 aSig1 = extractFloat128Frac1( a );
5293 aSig0 = extractFloat128Frac0( a );
5294 aExp = extractFloat128Exp( a );
5295 bSig1 = extractFloat128Frac1( b );
5296 bSig0 = extractFloat128Frac0( b );
5297 bExp = extractFloat128Exp( b );
5298 expDiff = aExp - bExp;
5299 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5300 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5301 if ( 0 < expDiff ) goto aExpBigger;
5302 if ( expDiff < 0 ) goto bExpBigger;
5303 if ( aExp == 0x7FFF ) {
5304 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5305 return propagateFloat128NaN( a, b STATUS_VAR );
5306 }
5307 float_raise( float_flag_invalid STATUS_VAR);
5308 z.low = float128_default_nan_low;
5309 z.high = float128_default_nan_high;
5310 return z;
5311 }
5312 if ( aExp == 0 ) {
5313 aExp = 1;
5314 bExp = 1;
5315 }
5316 if ( bSig0 < aSig0 ) goto aBigger;
5317 if ( aSig0 < bSig0 ) goto bBigger;
5318 if ( bSig1 < aSig1 ) goto aBigger;
5319 if ( aSig1 < bSig1 ) goto bBigger;
5320 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5321 bExpBigger:
5322 if ( bExp == 0x7FFF ) {
5323 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5324 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5325 }
5326 if ( aExp == 0 ) {
5327 ++expDiff;
5328 }
5329 else {
5330 aSig0 |= LIT64( 0x4000000000000000 );
5331 }
5332 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5333 bSig0 |= LIT64( 0x4000000000000000 );
5334 bBigger:
5335 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5336 zExp = bExp;
5337 zSign ^= 1;
5338 goto normalizeRoundAndPack;
5339 aExpBigger:
5340 if ( aExp == 0x7FFF ) {
5341 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5342 return a;
5343 }
5344 if ( bExp == 0 ) {
5345 --expDiff;
5346 }
5347 else {
5348 bSig0 |= LIT64( 0x4000000000000000 );
5349 }
5350 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5351 aSig0 |= LIT64( 0x4000000000000000 );
5352 aBigger:
5353 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5354 zExp = aExp;
5355 normalizeRoundAndPack:
5356 --zExp;
5357 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5358
5359}
5360
5361/*----------------------------------------------------------------------------
5362| Returns the result of adding the quadruple-precision floating-point values
5363| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5364| for Binary Floating-Point Arithmetic.
5365*----------------------------------------------------------------------------*/
5366
5367float128 float128_add( float128 a, float128 b STATUS_PARAM )
5368{
5369 flag aSign, bSign;
5370
5371 aSign = extractFloat128Sign( a );
5372 bSign = extractFloat128Sign( b );
5373 if ( aSign == bSign ) {
5374 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5375 }
5376 else {
5377 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5378 }
5379
5380}
5381
5382/*----------------------------------------------------------------------------
5383| Returns the result of subtracting the quadruple-precision floating-point
5384| values `a' and `b'. The operation is performed according to the IEC/IEEE
5385| Standard for Binary Floating-Point Arithmetic.
5386*----------------------------------------------------------------------------*/
5387
5388float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5389{
5390 flag aSign, bSign;
5391
5392 aSign = extractFloat128Sign( a );
5393 bSign = extractFloat128Sign( b );
5394 if ( aSign == bSign ) {
5395 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5396 }
5397 else {
5398 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5399 }
5400
5401}
5402
5403/*----------------------------------------------------------------------------
5404| Returns the result of multiplying the quadruple-precision floating-point
5405| values `a' and `b'. The operation is performed according to the IEC/IEEE
5406| Standard for Binary Floating-Point Arithmetic.
5407*----------------------------------------------------------------------------*/
5408
5409float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5410{
5411 flag aSign, bSign, zSign;
5412 int32 aExp, bExp, zExp;
bb98fe42 5413 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
5414 float128 z;
5415
5416 aSig1 = extractFloat128Frac1( a );
5417 aSig0 = extractFloat128Frac0( a );
5418 aExp = extractFloat128Exp( a );
5419 aSign = extractFloat128Sign( a );
5420 bSig1 = extractFloat128Frac1( b );
5421 bSig0 = extractFloat128Frac0( b );
5422 bExp = extractFloat128Exp( b );
5423 bSign = extractFloat128Sign( b );
5424 zSign = aSign ^ bSign;
5425 if ( aExp == 0x7FFF ) {
5426 if ( ( aSig0 | aSig1 )
5427 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5428 return propagateFloat128NaN( a, b STATUS_VAR );
5429 }
5430 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5431 return packFloat128( zSign, 0x7FFF, 0, 0 );
5432 }
5433 if ( bExp == 0x7FFF ) {
5434 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5435 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5436 invalid:
5437 float_raise( float_flag_invalid STATUS_VAR);
5438 z.low = float128_default_nan_low;
5439 z.high = float128_default_nan_high;
5440 return z;
5441 }
5442 return packFloat128( zSign, 0x7FFF, 0, 0 );
5443 }
5444 if ( aExp == 0 ) {
5445 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5446 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5447 }
5448 if ( bExp == 0 ) {
5449 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5450 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5451 }
5452 zExp = aExp + bExp - 0x4000;
5453 aSig0 |= LIT64( 0x0001000000000000 );
5454 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5455 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5456 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5457 zSig2 |= ( zSig3 != 0 );
5458 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5459 shift128ExtraRightJamming(
5460 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5461 ++zExp;
5462 }
5463 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5464
5465}
5466
5467/*----------------------------------------------------------------------------
5468| Returns the result of dividing the quadruple-precision floating-point value
5469| `a' by the corresponding value `b'. The operation is performed according to
5470| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5471*----------------------------------------------------------------------------*/
5472
5473float128 float128_div( float128 a, float128 b STATUS_PARAM )
5474{
5475 flag aSign, bSign, zSign;
5476 int32 aExp, bExp, zExp;
bb98fe42
AF
5477 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5478 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5479 float128 z;
5480
5481 aSig1 = extractFloat128Frac1( a );
5482 aSig0 = extractFloat128Frac0( a );
5483 aExp = extractFloat128Exp( a );
5484 aSign = extractFloat128Sign( a );
5485 bSig1 = extractFloat128Frac1( b );
5486 bSig0 = extractFloat128Frac0( b );
5487 bExp = extractFloat128Exp( b );
5488 bSign = extractFloat128Sign( b );
5489 zSign = aSign ^ bSign;
5490 if ( aExp == 0x7FFF ) {
5491 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5492 if ( bExp == 0x7FFF ) {
5493 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5494 goto invalid;
5495 }
5496 return packFloat128( zSign, 0x7FFF, 0, 0 );
5497 }
5498 if ( bExp == 0x7FFF ) {
5499 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5500 return packFloat128( zSign, 0, 0, 0 );
5501 }
5502 if ( bExp == 0 ) {
5503 if ( ( bSig0 | bSig1 ) == 0 ) {
5504 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5505 invalid:
5506 float_raise( float_flag_invalid STATUS_VAR);
5507 z.low = float128_default_nan_low;
5508 z.high = float128_default_nan_high;
5509 return z;
5510 }
5511 float_raise( float_flag_divbyzero STATUS_VAR);
5512 return packFloat128( zSign, 0x7FFF, 0, 0 );
5513 }
5514 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5515 }
5516 if ( aExp == 0 ) {
5517 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5518 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5519 }
5520 zExp = aExp - bExp + 0x3FFD;
5521 shortShift128Left(
5522 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5523 shortShift128Left(
5524 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5525 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5526 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5527 ++zExp;
5528 }
5529 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5530 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5531 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 5532 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5533 --zSig0;
5534 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5535 }
5536 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5537 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5538 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5539 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5540 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5541 --zSig1;
5542 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5543 }
5544 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5545 }
5546 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5547 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5548
5549}
5550
5551/*----------------------------------------------------------------------------
5552| Returns the remainder of the quadruple-precision floating-point value `a'
5553| with respect to the corresponding value `b'. The operation is performed
5554| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5555*----------------------------------------------------------------------------*/
5556
5557float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5558{
ed086f3d 5559 flag aSign, zSign;
158142c2 5560 int32 aExp, bExp, expDiff;
bb98fe42
AF
5561 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5562 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5563 int64_t sigMean0;
158142c2
FB
5564 float128 z;
5565
5566 aSig1 = extractFloat128Frac1( a );
5567 aSig0 = extractFloat128Frac0( a );
5568 aExp = extractFloat128Exp( a );
5569 aSign = extractFloat128Sign( a );
5570 bSig1 = extractFloat128Frac1( b );
5571 bSig0 = extractFloat128Frac0( b );
5572 bExp = extractFloat128Exp( b );
158142c2
FB
5573 if ( aExp == 0x7FFF ) {
5574 if ( ( aSig0 | aSig1 )
5575 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5576 return propagateFloat128NaN( a, b STATUS_VAR );
5577 }
5578 goto invalid;
5579 }
5580 if ( bExp == 0x7FFF ) {
5581 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5582 return a;
5583 }
5584 if ( bExp == 0 ) {
5585 if ( ( bSig0 | bSig1 ) == 0 ) {
5586 invalid:
5587 float_raise( float_flag_invalid STATUS_VAR);
5588 z.low = float128_default_nan_low;
5589 z.high = float128_default_nan_high;
5590 return z;
5591 }
5592 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5593 }
5594 if ( aExp == 0 ) {
5595 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5596 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5597 }
5598 expDiff = aExp - bExp;
5599 if ( expDiff < -1 ) return a;
5600 shortShift128Left(
5601 aSig0 | LIT64( 0x0001000000000000 ),
5602 aSig1,
5603 15 - ( expDiff < 0 ),
5604 &aSig0,
5605 &aSig1
5606 );
5607 shortShift128Left(
5608 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5609 q = le128( bSig0, bSig1, aSig0, aSig1 );
5610 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5611 expDiff -= 64;
5612 while ( 0 < expDiff ) {
5613 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5614 q = ( 4 < q ) ? q - 4 : 0;
5615 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5616 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5617 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5618 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5619 expDiff -= 61;
5620 }
5621 if ( -64 < expDiff ) {
5622 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5623 q = ( 4 < q ) ? q - 4 : 0;
5624 q >>= - expDiff;
5625 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5626 expDiff += 52;
5627 if ( expDiff < 0 ) {
5628 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5629 }
5630 else {
5631 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5632 }
5633 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5634 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5635 }
5636 else {
5637 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5638 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5639 }
5640 do {
5641 alternateASig0 = aSig0;
5642 alternateASig1 = aSig1;
5643 ++q;
5644 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 5645 } while ( 0 <= (int64_t) aSig0 );
158142c2 5646 add128(
bb98fe42 5647 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
5648 if ( ( sigMean0 < 0 )
5649 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5650 aSig0 = alternateASig0;
5651 aSig1 = alternateASig1;
5652 }
bb98fe42 5653 zSign = ( (int64_t) aSig0 < 0 );
158142c2
FB
5654 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5655 return
5656 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5657
5658}
5659
5660/*----------------------------------------------------------------------------
5661| Returns the square root of the quadruple-precision floating-point value `a'.
5662| The operation is performed according to the IEC/IEEE Standard for Binary
5663| Floating-Point Arithmetic.
5664*----------------------------------------------------------------------------*/
5665
5666float128 float128_sqrt( float128 a STATUS_PARAM )
5667{
5668 flag aSign;
5669 int32 aExp, zExp;
bb98fe42
AF
5670 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5671 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5672 float128 z;
5673
5674 aSig1 = extractFloat128Frac1( a );
5675 aSig0 = extractFloat128Frac0( a );
5676 aExp = extractFloat128Exp( a );
5677 aSign = extractFloat128Sign( a );
5678 if ( aExp == 0x7FFF ) {
5679 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5680 if ( ! aSign ) return a;
5681 goto invalid;
5682 }
5683 if ( aSign ) {
5684 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5685 invalid:
5686 float_raise( float_flag_invalid STATUS_VAR);
5687 z.low = float128_default_nan_low;
5688 z.high = float128_default_nan_high;
5689 return z;
5690 }
5691 if ( aExp == 0 ) {
5692 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5693 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5694 }
5695 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5696 aSig0 |= LIT64( 0x0001000000000000 );
5697 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5698 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5699 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5700 doubleZSig0 = zSig0<<1;
5701 mul64To128( zSig0, zSig0, &term0, &term1 );
5702 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5703 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5704 --zSig0;
5705 doubleZSig0 -= 2;
5706 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5707 }
5708 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5709 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5710 if ( zSig1 == 0 ) zSig1 = 1;
5711 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5712 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5713 mul64To128( zSig1, zSig1, &term2, &term3 );
5714 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5715 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5716 --zSig1;
5717 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5718 term3 |= 1;
5719 term2 |= doubleZSig0;
5720 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5721 }
5722 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5723 }
5724 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5725 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5726
5727}
5728
5729/*----------------------------------------------------------------------------
5730| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
5731| the corresponding value `b', and 0 otherwise. The invalid exception is
5732| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
5733| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5734*----------------------------------------------------------------------------*/
5735
b689362d 5736int float128_eq( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5737{
5738
5739 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5740 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5741 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5742 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5743 ) {
b689362d 5744 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
5745 return 0;
5746 }
5747 return
5748 ( a.low == b.low )
5749 && ( ( a.high == b.high )
5750 || ( ( a.low == 0 )
bb98fe42 5751 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5752 );
5753
5754}
5755
5756/*----------------------------------------------------------------------------
5757| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
5758| or equal to the corresponding value `b', and 0 otherwise. The invalid
5759| exception is raised if either operand is a NaN. The comparison is performed
5760| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5761*----------------------------------------------------------------------------*/
5762
750afe93 5763int float128_le( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5764{
5765 flag aSign, bSign;
5766
5767 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5768 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5769 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5770 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5771 ) {
5772 float_raise( float_flag_invalid STATUS_VAR);
5773 return 0;
5774 }
5775 aSign = extractFloat128Sign( a );
5776 bSign = extractFloat128Sign( b );
5777 if ( aSign != bSign ) {
5778 return
5779 aSign
bb98fe42 5780 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5781 == 0 );
5782 }
5783 return
5784 aSign ? le128( b.high, b.low, a.high, a.low )
5785 : le128( a.high, a.low, b.high, b.low );
5786
5787}
5788
5789/*----------------------------------------------------------------------------
5790| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
5791| the corresponding value `b', and 0 otherwise. The invalid exception is
5792| raised if either operand is a NaN. The comparison is performed according
5793| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5794*----------------------------------------------------------------------------*/
5795
750afe93 5796int float128_lt( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5797{
5798 flag aSign, bSign;
5799
5800 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5801 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5802 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5803 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5804 ) {
5805 float_raise( float_flag_invalid STATUS_VAR);
5806 return 0;
5807 }
5808 aSign = extractFloat128Sign( a );
5809 bSign = extractFloat128Sign( b );
5810 if ( aSign != bSign ) {
5811 return
5812 aSign
bb98fe42 5813 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5814 != 0 );
5815 }
5816 return
5817 aSign ? lt128( b.high, b.low, a.high, a.low )
5818 : lt128( a.high, a.low, b.high, b.low );
5819
5820}
5821
67b7861d
AJ
5822/*----------------------------------------------------------------------------
5823| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
5824| be compared, and 0 otherwise. The invalid exception is raised if either
5825| operand is a NaN. The comparison is performed according to the IEC/IEEE
5826| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
5827*----------------------------------------------------------------------------*/
5828
5829int float128_unordered( float128 a, float128 b STATUS_PARAM )
5830{
5831 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5832 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5833 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5834 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5835 ) {
5836 float_raise( float_flag_invalid STATUS_VAR);
5837 return 1;
5838 }
5839 return 0;
5840}
5841
158142c2
FB
5842/*----------------------------------------------------------------------------
5843| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
5844| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5845| exception. The comparison is performed according to the IEC/IEEE Standard
5846| for Binary Floating-Point Arithmetic.
158142c2
FB
5847*----------------------------------------------------------------------------*/
5848
b689362d 5849int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5850{
5851
5852 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5853 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5854 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5855 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5856 ) {
b689362d
AJ
5857 if ( float128_is_signaling_nan( a )
5858 || float128_is_signaling_nan( b ) ) {
5859 float_raise( float_flag_invalid STATUS_VAR);
5860 }
158142c2
FB
5861 return 0;
5862 }
5863 return
5864 ( a.low == b.low )
5865 && ( ( a.high == b.high )
5866 || ( ( a.low == 0 )
bb98fe42 5867 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5868 );
5869
5870}
5871
5872/*----------------------------------------------------------------------------
5873| Returns 1 if the quadruple-precision floating-point value `a' is less than
5874| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5875| cause an exception. Otherwise, the comparison is performed according to the
5876| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5877*----------------------------------------------------------------------------*/
5878
750afe93 5879int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5880{
5881 flag aSign, bSign;
5882
5883 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5884 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5885 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5886 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5887 ) {
5888 if ( float128_is_signaling_nan( a )
5889 || float128_is_signaling_nan( b ) ) {
5890 float_raise( float_flag_invalid STATUS_VAR);
5891 }
5892 return 0;
5893 }
5894 aSign = extractFloat128Sign( a );
5895 bSign = extractFloat128Sign( b );
5896 if ( aSign != bSign ) {
5897 return
5898 aSign
bb98fe42 5899 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5900 == 0 );
5901 }
5902 return
5903 aSign ? le128( b.high, b.low, a.high, a.low )
5904 : le128( a.high, a.low, b.high, b.low );
5905
5906}
5907
5908/*----------------------------------------------------------------------------
5909| Returns 1 if the quadruple-precision floating-point value `a' is less than
5910| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5911| exception. Otherwise, the comparison is performed according to the IEC/IEEE
5912| Standard for Binary Floating-Point Arithmetic.
5913*----------------------------------------------------------------------------*/
5914
750afe93 5915int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5916{
5917 flag aSign, bSign;
5918
5919 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5920 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5921 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5922 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5923 ) {
5924 if ( float128_is_signaling_nan( a )
5925 || float128_is_signaling_nan( b ) ) {
5926 float_raise( float_flag_invalid STATUS_VAR);
5927 }
5928 return 0;
5929 }
5930 aSign = extractFloat128Sign( a );
5931 bSign = extractFloat128Sign( b );
5932 if ( aSign != bSign ) {
5933 return
5934 aSign
bb98fe42 5935 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5936 != 0 );
5937 }
5938 return
5939 aSign ? lt128( b.high, b.low, a.high, a.low )
5940 : lt128( a.high, a.low, b.high, b.low );
5941
5942}
5943
67b7861d
AJ
5944/*----------------------------------------------------------------------------
5945| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5946| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5947| comparison is performed according to the IEC/IEEE Standard for Binary
5948| Floating-Point Arithmetic.
5949*----------------------------------------------------------------------------*/
5950
5951int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5952{
5953 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5954 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5955 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5956 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5957 ) {
5958 if ( float128_is_signaling_nan( a )
5959 || float128_is_signaling_nan( b ) ) {
5960 float_raise( float_flag_invalid STATUS_VAR);
5961 }
5962 return 1;
5963 }
5964 return 0;
5965}
5966
1d6bda35
FB
5967/* misc functions */
5968float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5969{
5970 return int64_to_float32(a STATUS_VAR);
5971}
5972
5973float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5974{
5975 return int64_to_float64(a STATUS_VAR);
5976}
5977
5978unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5979{
5980 int64_t v;
5981 unsigned int res;
5982
5983 v = float32_to_int64(a STATUS_VAR);
5984 if (v < 0) {
5985 res = 0;
5986 float_raise( float_flag_invalid STATUS_VAR);
5987 } else if (v > 0xffffffff) {
5988 res = 0xffffffff;
5989 float_raise( float_flag_invalid STATUS_VAR);
5990 } else {
5991 res = v;
5992 }
5993 return res;
5994}
5995
5996unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5997{
5998 int64_t v;
5999 unsigned int res;
6000
6001 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6002 if (v < 0) {
6003 res = 0;
6004 float_raise( float_flag_invalid STATUS_VAR);
6005 } else if (v > 0xffffffff) {
6006 res = 0xffffffff;
6007 float_raise( float_flag_invalid STATUS_VAR);
6008 } else {
6009 res = v;
6010 }
6011 return res;
6012}
6013
cbcef455
PM
6014unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6015{
6016 int64_t v;
6017 unsigned int res;
6018
6019 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6020 if (v < 0) {
6021 res = 0;
6022 float_raise( float_flag_invalid STATUS_VAR);
6023 } else if (v > 0xffff) {
6024 res = 0xffff;
6025 float_raise( float_flag_invalid STATUS_VAR);
6026 } else {
6027 res = v;
6028 }
6029 return res;
6030}
6031
1d6bda35
FB
6032unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6033{
6034 int64_t v;
6035 unsigned int res;
6036
6037 v = float64_to_int64(a STATUS_VAR);
6038 if (v < 0) {
6039 res = 0;
6040 float_raise( float_flag_invalid STATUS_VAR);
6041 } else if (v > 0xffffffff) {
6042 res = 0xffffffff;
6043 float_raise( float_flag_invalid STATUS_VAR);
6044 } else {
6045 res = v;
6046 }
6047 return res;
6048}
6049
6050unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6051{
6052 int64_t v;
6053 unsigned int res;
6054
6055 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6056 if (v < 0) {
6057 res = 0;
6058 float_raise( float_flag_invalid STATUS_VAR);
6059 } else if (v > 0xffffffff) {
6060 res = 0xffffffff;
6061 float_raise( float_flag_invalid STATUS_VAR);
6062 } else {
6063 res = v;
6064 }
6065 return res;
6066}
6067
cbcef455
PM
6068unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6069{
6070 int64_t v;
6071 unsigned int res;
6072
6073 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6074 if (v < 0) {
6075 res = 0;
6076 float_raise( float_flag_invalid STATUS_VAR);
6077 } else if (v > 0xffff) {
6078 res = 0xffff;
6079 float_raise( float_flag_invalid STATUS_VAR);
6080 } else {
6081 res = v;
6082 }
6083 return res;
6084}
6085
f090c9d4 6086/* FIXME: This looks broken. */
75d62a58
JM
6087uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6088{
6089 int64_t v;
6090
f090c9d4
PB
6091 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6092 v += float64_val(a);
6093 v = float64_to_int64(make_float64(v) STATUS_VAR);
75d62a58
JM
6094
6095 return v - INT64_MIN;
6096}
6097
6098uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6099{
6100 int64_t v;
6101
f090c9d4
PB
6102 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6103 v += float64_val(a);
6104 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
75d62a58
JM
6105
6106 return v - INT64_MIN;
6107}
6108
1d6bda35 6109#define COMPARE(s, nan_exp) \
750afe93 6110INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
1d6bda35
FB
6111 int is_quiet STATUS_PARAM ) \
6112{ \
6113 flag aSign, bSign; \
bb98fe42 6114 uint ## s ## _t av, bv; \
37d18660
PM
6115 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6116 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
1d6bda35
FB
6117 \
6118 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6119 extractFloat ## s ## Frac( a ) ) || \
6120 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6121 extractFloat ## s ## Frac( b ) )) { \
6122 if (!is_quiet || \
6123 float ## s ## _is_signaling_nan( a ) || \
6124 float ## s ## _is_signaling_nan( b ) ) { \
6125 float_raise( float_flag_invalid STATUS_VAR); \
6126 } \
6127 return float_relation_unordered; \
6128 } \
6129 aSign = extractFloat ## s ## Sign( a ); \
6130 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 6131 av = float ## s ## _val(a); \
cd8a2533 6132 bv = float ## s ## _val(b); \
1d6bda35 6133 if ( aSign != bSign ) { \
bb98fe42 6134 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
6135 /* zero case */ \
6136 return float_relation_equal; \
6137 } else { \
6138 return 1 - (2 * aSign); \
6139 } \
6140 } else { \
f090c9d4 6141 if (av == bv) { \
1d6bda35
FB
6142 return float_relation_equal; \
6143 } else { \
f090c9d4 6144 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
6145 } \
6146 } \
6147} \
6148 \
750afe93 6149int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6150{ \
6151 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6152} \
6153 \
750afe93 6154int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6155{ \
6156 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6157}
6158
6159COMPARE(32, 0xff)
6160COMPARE(64, 0x7ff)
9ee6e8bb 6161
f6714d36
AJ
6162INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6163 int is_quiet STATUS_PARAM )
6164{
6165 flag aSign, bSign;
6166
6167 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6168 ( extractFloatx80Frac( a )<<1 ) ) ||
6169 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6170 ( extractFloatx80Frac( b )<<1 ) )) {
6171 if (!is_quiet ||
6172 floatx80_is_signaling_nan( a ) ||
6173 floatx80_is_signaling_nan( b ) ) {
6174 float_raise( float_flag_invalid STATUS_VAR);
6175 }
6176 return float_relation_unordered;
6177 }
6178 aSign = extractFloatx80Sign( a );
6179 bSign = extractFloatx80Sign( b );
6180 if ( aSign != bSign ) {
6181
6182 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6183 ( ( a.low | b.low ) == 0 ) ) {
6184 /* zero case */
6185 return float_relation_equal;
6186 } else {
6187 return 1 - (2 * aSign);
6188 }
6189 } else {
6190 if (a.low == b.low && a.high == b.high) {
6191 return float_relation_equal;
6192 } else {
6193 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6194 }
6195 }
6196}
6197
6198int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6199{
6200 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6201}
6202
6203int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6204{
6205 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6206}
6207
1f587329
BS
6208INLINE int float128_compare_internal( float128 a, float128 b,
6209 int is_quiet STATUS_PARAM )
6210{
6211 flag aSign, bSign;
6212
6213 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6214 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6215 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6216 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6217 if (!is_quiet ||
6218 float128_is_signaling_nan( a ) ||
6219 float128_is_signaling_nan( b ) ) {
6220 float_raise( float_flag_invalid STATUS_VAR);
6221 }
6222 return float_relation_unordered;
6223 }
6224 aSign = extractFloat128Sign( a );
6225 bSign = extractFloat128Sign( b );
6226 if ( aSign != bSign ) {
6227 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6228 /* zero case */
6229 return float_relation_equal;
6230 } else {
6231 return 1 - (2 * aSign);
6232 }
6233 } else {
6234 if (a.low == b.low && a.high == b.high) {
6235 return float_relation_equal;
6236 } else {
6237 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6238 }
6239 }
6240}
6241
6242int float128_compare( float128 a, float128 b STATUS_PARAM )
6243{
6244 return float128_compare_internal(a, b, 0 STATUS_VAR);
6245}
6246
6247int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6248{
6249 return float128_compare_internal(a, b, 1 STATUS_VAR);
6250}
6251
274f1b04
PM
6252/* min() and max() functions. These can't be implemented as
6253 * 'compare and pick one input' because that would mishandle
6254 * NaNs and +0 vs -0.
6255 */
6256#define MINMAX(s, nan_exp) \
6257INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6258 int ismin STATUS_PARAM ) \
6259{ \
6260 flag aSign, bSign; \
6261 uint ## s ## _t av, bv; \
6262 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6263 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6264 if (float ## s ## _is_any_nan(a) || \
6265 float ## s ## _is_any_nan(b)) { \
6266 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6267 } \
6268 aSign = extractFloat ## s ## Sign(a); \
6269 bSign = extractFloat ## s ## Sign(b); \
6270 av = float ## s ## _val(a); \
6271 bv = float ## s ## _val(b); \
6272 if (aSign != bSign) { \
6273 if (ismin) { \
6274 return aSign ? a : b; \
6275 } else { \
6276 return aSign ? b : a; \
6277 } \
6278 } else { \
6279 if (ismin) { \
6280 return (aSign ^ (av < bv)) ? a : b; \
6281 } else { \
6282 return (aSign ^ (av < bv)) ? b : a; \
6283 } \
6284 } \
6285} \
6286 \
6287float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6288{ \
6289 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6290} \
6291 \
6292float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6293{ \
6294 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6295}
6296
6297MINMAX(32, 0xff)
6298MINMAX(64, 0x7ff)
6299
6300
9ee6e8bb
PB
6301/* Multiply A by 2 raised to the power N. */
6302float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6303{
6304 flag aSign;
326b9e98 6305 int16_t aExp;
bb98fe42 6306 uint32_t aSig;
9ee6e8bb 6307
37d18660 6308 a = float32_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
6309 aSig = extractFloat32Frac( a );
6310 aExp = extractFloat32Exp( a );
6311 aSign = extractFloat32Sign( a );
6312
6313 if ( aExp == 0xFF ) {
326b9e98
AJ
6314 if ( aSig ) {
6315 return propagateFloat32NaN( a, a STATUS_VAR );
6316 }
9ee6e8bb
PB
6317 return a;
6318 }
69397542
PB
6319 if ( aExp != 0 )
6320 aSig |= 0x00800000;
6321 else if ( aSig == 0 )
6322 return a;
6323
326b9e98
AJ
6324 if (n > 0x200) {
6325 n = 0x200;
6326 } else if (n < -0x200) {
6327 n = -0x200;
6328 }
6329
69397542
PB
6330 aExp += n - 1;
6331 aSig <<= 7;
6332 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
6333}
6334
6335float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6336{
6337 flag aSign;
326b9e98 6338 int16_t aExp;
bb98fe42 6339 uint64_t aSig;
9ee6e8bb 6340
37d18660 6341 a = float64_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
6342 aSig = extractFloat64Frac( a );
6343 aExp = extractFloat64Exp( a );
6344 aSign = extractFloat64Sign( a );
6345
6346 if ( aExp == 0x7FF ) {
326b9e98
AJ
6347 if ( aSig ) {
6348 return propagateFloat64NaN( a, a STATUS_VAR );
6349 }
9ee6e8bb
PB
6350 return a;
6351 }
69397542
PB
6352 if ( aExp != 0 )
6353 aSig |= LIT64( 0x0010000000000000 );
6354 else if ( aSig == 0 )
6355 return a;
6356
326b9e98
AJ
6357 if (n > 0x1000) {
6358 n = 0x1000;
6359 } else if (n < -0x1000) {
6360 n = -0x1000;
6361 }
6362
69397542
PB
6363 aExp += n - 1;
6364 aSig <<= 10;
6365 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
6366}
6367
9ee6e8bb
PB
6368floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6369{
6370 flag aSign;
326b9e98 6371 int32_t aExp;
bb98fe42 6372 uint64_t aSig;
9ee6e8bb
PB
6373
6374 aSig = extractFloatx80Frac( a );
6375 aExp = extractFloatx80Exp( a );
6376 aSign = extractFloatx80Sign( a );
6377
326b9e98
AJ
6378 if ( aExp == 0x7FFF ) {
6379 if ( aSig<<1 ) {
6380 return propagateFloatx80NaN( a, a STATUS_VAR );
6381 }
9ee6e8bb
PB
6382 return a;
6383 }
326b9e98 6384
69397542
PB
6385 if (aExp == 0 && aSig == 0)
6386 return a;
6387
326b9e98
AJ
6388 if (n > 0x10000) {
6389 n = 0x10000;
6390 } else if (n < -0x10000) {
6391 n = -0x10000;
6392 }
6393
9ee6e8bb 6394 aExp += n;
69397542
PB
6395 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6396 aSign, aExp, aSig, 0 STATUS_VAR );
9ee6e8bb 6397}
9ee6e8bb 6398
9ee6e8bb
PB
6399float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6400{
6401 flag aSign;
326b9e98 6402 int32_t aExp;
bb98fe42 6403 uint64_t aSig0, aSig1;
9ee6e8bb
PB
6404
6405 aSig1 = extractFloat128Frac1( a );
6406 aSig0 = extractFloat128Frac0( a );
6407 aExp = extractFloat128Exp( a );
6408 aSign = extractFloat128Sign( a );
6409 if ( aExp == 0x7FFF ) {
326b9e98
AJ
6410 if ( aSig0 | aSig1 ) {
6411 return propagateFloat128NaN( a, a STATUS_VAR );
6412 }
9ee6e8bb
PB
6413 return a;
6414 }
69397542
PB
6415 if ( aExp != 0 )
6416 aSig0 |= LIT64( 0x0001000000000000 );
6417 else if ( aSig0 == 0 && aSig1 == 0 )
6418 return a;
6419
326b9e98
AJ
6420 if (n > 0x10000) {
6421 n = 0x10000;
6422 } else if (n < -0x10000) {
6423 n = -0x10000;
6424 }
6425
69397542
PB
6426 aExp += n - 1;
6427 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6428 STATUS_VAR );
9ee6e8bb
PB
6429
6430}