]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
target-arm: Add ARM UDIV/SDIV support
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
158142c2
FB
6
7/*============================================================================
8
9This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10Package, Release 2b.
11
12Written by John R. Hauser. This work was made possible in part by the
13International Computer Science Institute, located at Suite 600, 1947 Center
14Street, Berkeley, California 94704. Funding was partially provided by the
15National Science Foundation under grant MIP-9311980. The original version
16of this code was written as part of a project to build a fixed-point vector
17processor in collaboration with the University of California at Berkeley,
18overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20arithmetic/SoftFloat.html'.
21
22THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31Derivative works are acceptable, even for commercial purposes, so long as
32(1) the source code for the derivative work includes prominent notice that
33the work is derivative, and (2) the source code includes prominent notice with
34these four paragraphs for those parts of this code that are retained.
35
36=============================================================================*/
37
2ac8bd03
PM
38/* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
40 */
41#include "config.h"
42
158142c2
FB
43#include "softfloat.h"
44
45/*----------------------------------------------------------------------------
46| Primitive arithmetic functions, including multi-word arithmetic, and
47| division and square root approximations. (Can be specialized to target if
48| desired.)
49*----------------------------------------------------------------------------*/
50#include "softfloat-macros.h"
51
52/*----------------------------------------------------------------------------
53| Functions and definitions to determine: (1) whether tininess for underflow
54| is detected before or after rounding by default, (2) what (if anything)
55| happens when exceptions are raised, (3) how signaling NaNs are distinguished
56| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57| are propagated from function inputs to output. These details are target-
58| specific.
59*----------------------------------------------------------------------------*/
60#include "softfloat-specialize.h"
61
62void set_float_rounding_mode(int val STATUS_PARAM)
63{
64 STATUS(float_rounding_mode) = val;
65}
66
1d6bda35
FB
67void set_float_exception_flags(int val STATUS_PARAM)
68{
69 STATUS(float_exception_flags) = val;
70}
71
158142c2
FB
72void set_floatx80_rounding_precision(int val STATUS_PARAM)
73{
74 STATUS(floatx80_rounding_precision) = val;
75}
158142c2 76
bb4d4bb3
PM
77/*----------------------------------------------------------------------------
78| Returns the fraction bits of the half-precision floating-point value `a'.
79*----------------------------------------------------------------------------*/
80
81INLINE uint32_t extractFloat16Frac(float16 a)
82{
83 return float16_val(a) & 0x3ff;
84}
85
86/*----------------------------------------------------------------------------
87| Returns the exponent bits of the half-precision floating-point value `a'.
88*----------------------------------------------------------------------------*/
89
90INLINE int16 extractFloat16Exp(float16 a)
91{
92 return (float16_val(a) >> 10) & 0x1f;
93}
94
95/*----------------------------------------------------------------------------
96| Returns the sign bit of the single-precision floating-point value `a'.
97*----------------------------------------------------------------------------*/
98
99INLINE flag extractFloat16Sign(float16 a)
100{
101 return float16_val(a)>>15;
102}
103
158142c2
FB
104/*----------------------------------------------------------------------------
105| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106| and 7, and returns the properly rounded 32-bit integer corresponding to the
107| input. If `zSign' is 1, the input is negated before being converted to an
108| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
109| is simply rounded to an integer, with the inexact exception raised if the
110| input cannot be represented exactly as an integer. However, if the fixed-
111| point input is too large, the invalid exception is raised and the largest
112| positive or negative integer is returned.
113*----------------------------------------------------------------------------*/
114
bb98fe42 115static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
158142c2
FB
116{
117 int8 roundingMode;
118 flag roundNearestEven;
119 int8 roundIncrement, roundBits;
120 int32 z;
121
122 roundingMode = STATUS(float_rounding_mode);
123 roundNearestEven = ( roundingMode == float_round_nearest_even );
124 roundIncrement = 0x40;
125 if ( ! roundNearestEven ) {
126 if ( roundingMode == float_round_to_zero ) {
127 roundIncrement = 0;
128 }
129 else {
130 roundIncrement = 0x7F;
131 if ( zSign ) {
132 if ( roundingMode == float_round_up ) roundIncrement = 0;
133 }
134 else {
135 if ( roundingMode == float_round_down ) roundIncrement = 0;
136 }
137 }
138 }
139 roundBits = absZ & 0x7F;
140 absZ = ( absZ + roundIncrement )>>7;
141 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
142 z = absZ;
143 if ( zSign ) z = - z;
144 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 146 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
147 }
148 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
149 return z;
150
151}
152
153/*----------------------------------------------------------------------------
154| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155| `absZ1', with binary point between bits 63 and 64 (between the input words),
156| and returns the properly rounded 64-bit integer corresponding to the input.
157| If `zSign' is 1, the input is negated before being converted to an integer.
158| Ordinarily, the fixed-point input is simply rounded to an integer, with
159| the inexact exception raised if the input cannot be represented exactly as
160| an integer. However, if the fixed-point input is too large, the invalid
161| exception is raised and the largest positive or negative integer is
162| returned.
163*----------------------------------------------------------------------------*/
164
bb98fe42 165static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
158142c2
FB
166{
167 int8 roundingMode;
168 flag roundNearestEven, increment;
169 int64 z;
170
171 roundingMode = STATUS(float_rounding_mode);
172 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 173 increment = ( (int64_t) absZ1 < 0 );
158142c2
FB
174 if ( ! roundNearestEven ) {
175 if ( roundingMode == float_round_to_zero ) {
176 increment = 0;
177 }
178 else {
179 if ( zSign ) {
180 increment = ( roundingMode == float_round_down ) && absZ1;
181 }
182 else {
183 increment = ( roundingMode == float_round_up ) && absZ1;
184 }
185 }
186 }
187 if ( increment ) {
188 ++absZ0;
189 if ( absZ0 == 0 ) goto overflow;
bb98fe42 190 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
191 }
192 z = absZ0;
193 if ( zSign ) z = - z;
194 if ( z && ( ( z < 0 ) ^ zSign ) ) {
195 overflow:
196 float_raise( float_flag_invalid STATUS_VAR);
197 return
bb98fe42 198 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
199 : LIT64( 0x7FFFFFFFFFFFFFFF );
200 }
201 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
202 return z;
203
204}
205
206/*----------------------------------------------------------------------------
207| Returns the fraction bits of the single-precision floating-point value `a'.
208*----------------------------------------------------------------------------*/
209
bb98fe42 210INLINE uint32_t extractFloat32Frac( float32 a )
158142c2
FB
211{
212
f090c9d4 213 return float32_val(a) & 0x007FFFFF;
158142c2
FB
214
215}
216
217/*----------------------------------------------------------------------------
218| Returns the exponent bits of the single-precision floating-point value `a'.
219*----------------------------------------------------------------------------*/
220
221INLINE int16 extractFloat32Exp( float32 a )
222{
223
f090c9d4 224 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
225
226}
227
228/*----------------------------------------------------------------------------
229| Returns the sign bit of the single-precision floating-point value `a'.
230*----------------------------------------------------------------------------*/
231
232INLINE flag extractFloat32Sign( float32 a )
233{
234
f090c9d4 235 return float32_val(a)>>31;
158142c2
FB
236
237}
238
37d18660
PM
239/*----------------------------------------------------------------------------
240| If `a' is denormal and we are in flush-to-zero mode then set the
241| input-denormal exception and return zero. Otherwise just return the value.
242*----------------------------------------------------------------------------*/
243static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
244{
245 if (STATUS(flush_inputs_to_zero)) {
246 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
247 float_raise(float_flag_input_denormal STATUS_VAR);
248 return make_float32(float32_val(a) & 0x80000000);
249 }
250 }
251 return a;
252}
253
158142c2
FB
254/*----------------------------------------------------------------------------
255| Normalizes the subnormal single-precision floating-point value represented
256| by the denormalized significand `aSig'. The normalized exponent and
257| significand are stored at the locations pointed to by `zExpPtr' and
258| `zSigPtr', respectively.
259*----------------------------------------------------------------------------*/
260
261static void
bb98fe42 262 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
158142c2
FB
263{
264 int8 shiftCount;
265
266 shiftCount = countLeadingZeros32( aSig ) - 8;
267 *zSigPtr = aSig<<shiftCount;
268 *zExpPtr = 1 - shiftCount;
269
270}
271
272/*----------------------------------------------------------------------------
273| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274| single-precision floating-point value, returning the result. After being
275| shifted into the proper positions, the three fields are simply added
276| together to form the result. This means that any integer portion of `zSig'
277| will be added into the exponent. Since a properly normalized significand
278| will have an integer portion equal to 1, the `zExp' input should be 1 less
279| than the desired result exponent whenever `zSig' is a complete, normalized
280| significand.
281*----------------------------------------------------------------------------*/
282
bb98fe42 283INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
158142c2
FB
284{
285
f090c9d4 286 return make_float32(
bb98fe42 287 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
288
289}
290
291/*----------------------------------------------------------------------------
292| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293| and significand `zSig', and returns the proper single-precision floating-
294| point value corresponding to the abstract input. Ordinarily, the abstract
295| value is simply rounded and packed into the single-precision format, with
296| the inexact exception raised if the abstract input cannot be represented
297| exactly. However, if the abstract value is too large, the overflow and
298| inexact exceptions are raised and an infinity or maximal finite value is
299| returned. If the abstract value is too small, the input value is rounded to
300| a subnormal number, and the underflow and inexact exceptions are raised if
301| the abstract input cannot be represented exactly as a subnormal single-
302| precision floating-point number.
303| The input significand `zSig' has its binary point between bits 30
304| and 29, which is 7 bits to the left of the usual location. This shifted
305| significand must be normalized or smaller. If `zSig' is not normalized,
306| `zExp' must be 0; in that case, the result returned is a subnormal number,
307| and it must not require rounding. In the usual case that `zSig' is
308| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309| The handling of underflow and overflow follows the IEC/IEEE Standard for
310| Binary Floating-Point Arithmetic.
311*----------------------------------------------------------------------------*/
312
bb98fe42 313static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
314{
315 int8 roundingMode;
316 flag roundNearestEven;
317 int8 roundIncrement, roundBits;
318 flag isTiny;
319
320 roundingMode = STATUS(float_rounding_mode);
321 roundNearestEven = ( roundingMode == float_round_nearest_even );
322 roundIncrement = 0x40;
323 if ( ! roundNearestEven ) {
324 if ( roundingMode == float_round_to_zero ) {
325 roundIncrement = 0;
326 }
327 else {
328 roundIncrement = 0x7F;
329 if ( zSign ) {
330 if ( roundingMode == float_round_up ) roundIncrement = 0;
331 }
332 else {
333 if ( roundingMode == float_round_down ) roundIncrement = 0;
334 }
335 }
336 }
337 roundBits = zSig & 0x7F;
bb98fe42 338 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
339 if ( ( 0xFD < zExp )
340 || ( ( zExp == 0xFD )
bb98fe42 341 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
342 ) {
343 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 344 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
345 }
346 if ( zExp < 0 ) {
e6afc87f
PM
347 if (STATUS(flush_to_zero)) {
348 float_raise(float_flag_output_denormal STATUS_VAR);
349 return packFloat32(zSign, 0, 0);
350 }
158142c2
FB
351 isTiny =
352 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
353 || ( zExp < -1 )
354 || ( zSig + roundIncrement < 0x80000000 );
355 shift32RightJamming( zSig, - zExp, &zSig );
356 zExp = 0;
357 roundBits = zSig & 0x7F;
358 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
359 }
360 }
361 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
362 zSig = ( zSig + roundIncrement )>>7;
363 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
364 if ( zSig == 0 ) zExp = 0;
365 return packFloat32( zSign, zExp, zSig );
366
367}
368
369/*----------------------------------------------------------------------------
370| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371| and significand `zSig', and returns the proper single-precision floating-
372| point value corresponding to the abstract input. This routine is just like
373| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
374| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
375| floating-point exponent.
376*----------------------------------------------------------------------------*/
377
378static float32
bb98fe42 379 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
380{
381 int8 shiftCount;
382
383 shiftCount = countLeadingZeros32( zSig ) - 1;
384 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
385
386}
387
388/*----------------------------------------------------------------------------
389| Returns the fraction bits of the double-precision floating-point value `a'.
390*----------------------------------------------------------------------------*/
391
bb98fe42 392INLINE uint64_t extractFloat64Frac( float64 a )
158142c2
FB
393{
394
f090c9d4 395 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
396
397}
398
399/*----------------------------------------------------------------------------
400| Returns the exponent bits of the double-precision floating-point value `a'.
401*----------------------------------------------------------------------------*/
402
403INLINE int16 extractFloat64Exp( float64 a )
404{
405
f090c9d4 406 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
407
408}
409
410/*----------------------------------------------------------------------------
411| Returns the sign bit of the double-precision floating-point value `a'.
412*----------------------------------------------------------------------------*/
413
414INLINE flag extractFloat64Sign( float64 a )
415{
416
f090c9d4 417 return float64_val(a)>>63;
158142c2
FB
418
419}
420
37d18660
PM
421/*----------------------------------------------------------------------------
422| If `a' is denormal and we are in flush-to-zero mode then set the
423| input-denormal exception and return zero. Otherwise just return the value.
424*----------------------------------------------------------------------------*/
425static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
426{
427 if (STATUS(flush_inputs_to_zero)) {
428 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
429 float_raise(float_flag_input_denormal STATUS_VAR);
430 return make_float64(float64_val(a) & (1ULL << 63));
431 }
432 }
433 return a;
434}
435
158142c2
FB
436/*----------------------------------------------------------------------------
437| Normalizes the subnormal double-precision floating-point value represented
438| by the denormalized significand `aSig'. The normalized exponent and
439| significand are stored at the locations pointed to by `zExpPtr' and
440| `zSigPtr', respectively.
441*----------------------------------------------------------------------------*/
442
443static void
bb98fe42 444 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
445{
446 int8 shiftCount;
447
448 shiftCount = countLeadingZeros64( aSig ) - 11;
449 *zSigPtr = aSig<<shiftCount;
450 *zExpPtr = 1 - shiftCount;
451
452}
453
454/*----------------------------------------------------------------------------
455| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
456| double-precision floating-point value, returning the result. After being
457| shifted into the proper positions, the three fields are simply added
458| together to form the result. This means that any integer portion of `zSig'
459| will be added into the exponent. Since a properly normalized significand
460| will have an integer portion equal to 1, the `zExp' input should be 1 less
461| than the desired result exponent whenever `zSig' is a complete, normalized
462| significand.
463*----------------------------------------------------------------------------*/
464
bb98fe42 465INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
158142c2
FB
466{
467
f090c9d4 468 return make_float64(
bb98fe42 469 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
470
471}
472
473/*----------------------------------------------------------------------------
474| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
475| and significand `zSig', and returns the proper double-precision floating-
476| point value corresponding to the abstract input. Ordinarily, the abstract
477| value is simply rounded and packed into the double-precision format, with
478| the inexact exception raised if the abstract input cannot be represented
479| exactly. However, if the abstract value is too large, the overflow and
480| inexact exceptions are raised and an infinity or maximal finite value is
481| returned. If the abstract value is too small, the input value is rounded
482| to a subnormal number, and the underflow and inexact exceptions are raised
483| if the abstract input cannot be represented exactly as a subnormal double-
484| precision floating-point number.
485| The input significand `zSig' has its binary point between bits 62
486| and 61, which is 10 bits to the left of the usual location. This shifted
487| significand must be normalized or smaller. If `zSig' is not normalized,
488| `zExp' must be 0; in that case, the result returned is a subnormal number,
489| and it must not require rounding. In the usual case that `zSig' is
490| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
491| The handling of underflow and overflow follows the IEC/IEEE Standard for
492| Binary Floating-Point Arithmetic.
493*----------------------------------------------------------------------------*/
494
bb98fe42 495static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
496{
497 int8 roundingMode;
498 flag roundNearestEven;
499 int16 roundIncrement, roundBits;
500 flag isTiny;
501
502 roundingMode = STATUS(float_rounding_mode);
503 roundNearestEven = ( roundingMode == float_round_nearest_even );
504 roundIncrement = 0x200;
505 if ( ! roundNearestEven ) {
506 if ( roundingMode == float_round_to_zero ) {
507 roundIncrement = 0;
508 }
509 else {
510 roundIncrement = 0x3FF;
511 if ( zSign ) {
512 if ( roundingMode == float_round_up ) roundIncrement = 0;
513 }
514 else {
515 if ( roundingMode == float_round_down ) roundIncrement = 0;
516 }
517 }
518 }
519 roundBits = zSig & 0x3FF;
bb98fe42 520 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
521 if ( ( 0x7FD < zExp )
522 || ( ( zExp == 0x7FD )
bb98fe42 523 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
524 ) {
525 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 526 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
527 }
528 if ( zExp < 0 ) {
e6afc87f
PM
529 if (STATUS(flush_to_zero)) {
530 float_raise(float_flag_output_denormal STATUS_VAR);
531 return packFloat64(zSign, 0, 0);
532 }
158142c2
FB
533 isTiny =
534 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
535 || ( zExp < -1 )
536 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
537 shift64RightJamming( zSig, - zExp, &zSig );
538 zExp = 0;
539 roundBits = zSig & 0x3FF;
540 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
541 }
542 }
543 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
544 zSig = ( zSig + roundIncrement )>>10;
545 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
546 if ( zSig == 0 ) zExp = 0;
547 return packFloat64( zSign, zExp, zSig );
548
549}
550
551/*----------------------------------------------------------------------------
552| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
553| and significand `zSig', and returns the proper double-precision floating-
554| point value corresponding to the abstract input. This routine is just like
555| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
556| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
557| floating-point exponent.
558*----------------------------------------------------------------------------*/
559
560static float64
bb98fe42 561 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
562{
563 int8 shiftCount;
564
565 shiftCount = countLeadingZeros64( zSig ) - 1;
566 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
567
568}
569
158142c2
FB
570/*----------------------------------------------------------------------------
571| Returns the fraction bits of the extended double-precision floating-point
572| value `a'.
573*----------------------------------------------------------------------------*/
574
bb98fe42 575INLINE uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
576{
577
578 return a.low;
579
580}
581
582/*----------------------------------------------------------------------------
583| Returns the exponent bits of the extended double-precision floating-point
584| value `a'.
585*----------------------------------------------------------------------------*/
586
587INLINE int32 extractFloatx80Exp( floatx80 a )
588{
589
590 return a.high & 0x7FFF;
591
592}
593
594/*----------------------------------------------------------------------------
595| Returns the sign bit of the extended double-precision floating-point value
596| `a'.
597*----------------------------------------------------------------------------*/
598
599INLINE flag extractFloatx80Sign( floatx80 a )
600{
601
602 return a.high>>15;
603
604}
605
606/*----------------------------------------------------------------------------
607| Normalizes the subnormal extended double-precision floating-point value
608| represented by the denormalized significand `aSig'. The normalized exponent
609| and significand are stored at the locations pointed to by `zExpPtr' and
610| `zSigPtr', respectively.
611*----------------------------------------------------------------------------*/
612
613static void
bb98fe42 614 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
615{
616 int8 shiftCount;
617
618 shiftCount = countLeadingZeros64( aSig );
619 *zSigPtr = aSig<<shiftCount;
620 *zExpPtr = 1 - shiftCount;
621
622}
623
624/*----------------------------------------------------------------------------
625| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
626| extended double-precision floating-point value, returning the result.
627*----------------------------------------------------------------------------*/
628
bb98fe42 629INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
158142c2
FB
630{
631 floatx80 z;
632
633 z.low = zSig;
bb98fe42 634 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
635 return z;
636
637}
638
639/*----------------------------------------------------------------------------
640| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
641| and extended significand formed by the concatenation of `zSig0' and `zSig1',
642| and returns the proper extended double-precision floating-point value
643| corresponding to the abstract input. Ordinarily, the abstract value is
644| rounded and packed into the extended double-precision format, with the
645| inexact exception raised if the abstract input cannot be represented
646| exactly. However, if the abstract value is too large, the overflow and
647| inexact exceptions are raised and an infinity or maximal finite value is
648| returned. If the abstract value is too small, the input value is rounded to
649| a subnormal number, and the underflow and inexact exceptions are raised if
650| the abstract input cannot be represented exactly as a subnormal extended
651| double-precision floating-point number.
652| If `roundingPrecision' is 32 or 64, the result is rounded to the same
653| number of bits as single or double precision, respectively. Otherwise, the
654| result is rounded to the full precision of the extended double-precision
655| format.
656| The input significand must be normalized or smaller. If the input
657| significand is not normalized, `zExp' must be 0; in that case, the result
658| returned is a subnormal number, and it must not require rounding. The
659| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
660| Floating-Point Arithmetic.
661*----------------------------------------------------------------------------*/
662
663static floatx80
664 roundAndPackFloatx80(
bb98fe42 665 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
666 STATUS_PARAM)
667{
668 int8 roundingMode;
669 flag roundNearestEven, increment, isTiny;
670 int64 roundIncrement, roundMask, roundBits;
671
672 roundingMode = STATUS(float_rounding_mode);
673 roundNearestEven = ( roundingMode == float_round_nearest_even );
674 if ( roundingPrecision == 80 ) goto precision80;
675 if ( roundingPrecision == 64 ) {
676 roundIncrement = LIT64( 0x0000000000000400 );
677 roundMask = LIT64( 0x00000000000007FF );
678 }
679 else if ( roundingPrecision == 32 ) {
680 roundIncrement = LIT64( 0x0000008000000000 );
681 roundMask = LIT64( 0x000000FFFFFFFFFF );
682 }
683 else {
684 goto precision80;
685 }
686 zSig0 |= ( zSig1 != 0 );
687 if ( ! roundNearestEven ) {
688 if ( roundingMode == float_round_to_zero ) {
689 roundIncrement = 0;
690 }
691 else {
692 roundIncrement = roundMask;
693 if ( zSign ) {
694 if ( roundingMode == float_round_up ) roundIncrement = 0;
695 }
696 else {
697 if ( roundingMode == float_round_down ) roundIncrement = 0;
698 }
699 }
700 }
701 roundBits = zSig0 & roundMask;
bb98fe42 702 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
703 if ( ( 0x7FFE < zExp )
704 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
705 ) {
706 goto overflow;
707 }
708 if ( zExp <= 0 ) {
e6afc87f
PM
709 if (STATUS(flush_to_zero)) {
710 float_raise(float_flag_output_denormal STATUS_VAR);
711 return packFloatx80(zSign, 0, 0);
712 }
158142c2
FB
713 isTiny =
714 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
715 || ( zExp < 0 )
716 || ( zSig0 <= zSig0 + roundIncrement );
717 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
718 zExp = 0;
719 roundBits = zSig0 & roundMask;
720 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
721 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
722 zSig0 += roundIncrement;
bb98fe42 723 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
724 roundIncrement = roundMask + 1;
725 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
726 roundMask |= roundIncrement;
727 }
728 zSig0 &= ~ roundMask;
729 return packFloatx80( zSign, zExp, zSig0 );
730 }
731 }
732 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
733 zSig0 += roundIncrement;
734 if ( zSig0 < roundIncrement ) {
735 ++zExp;
736 zSig0 = LIT64( 0x8000000000000000 );
737 }
738 roundIncrement = roundMask + 1;
739 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
740 roundMask |= roundIncrement;
741 }
742 zSig0 &= ~ roundMask;
743 if ( zSig0 == 0 ) zExp = 0;
744 return packFloatx80( zSign, zExp, zSig0 );
745 precision80:
bb98fe42 746 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
747 if ( ! roundNearestEven ) {
748 if ( roundingMode == float_round_to_zero ) {
749 increment = 0;
750 }
751 else {
752 if ( zSign ) {
753 increment = ( roundingMode == float_round_down ) && zSig1;
754 }
755 else {
756 increment = ( roundingMode == float_round_up ) && zSig1;
757 }
758 }
759 }
bb98fe42 760 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
761 if ( ( 0x7FFE < zExp )
762 || ( ( zExp == 0x7FFE )
763 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
764 && increment
765 )
766 ) {
767 roundMask = 0;
768 overflow:
769 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
770 if ( ( roundingMode == float_round_to_zero )
771 || ( zSign && ( roundingMode == float_round_up ) )
772 || ( ! zSign && ( roundingMode == float_round_down ) )
773 ) {
774 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
775 }
776 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
777 }
778 if ( zExp <= 0 ) {
779 isTiny =
780 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
781 || ( zExp < 0 )
782 || ! increment
783 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
784 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
785 zExp = 0;
786 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
787 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
788 if ( roundNearestEven ) {
bb98fe42 789 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
790 }
791 else {
792 if ( zSign ) {
793 increment = ( roundingMode == float_round_down ) && zSig1;
794 }
795 else {
796 increment = ( roundingMode == float_round_up ) && zSig1;
797 }
798 }
799 if ( increment ) {
800 ++zSig0;
801 zSig0 &=
bb98fe42
AF
802 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
803 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
804 }
805 return packFloatx80( zSign, zExp, zSig0 );
806 }
807 }
808 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
809 if ( increment ) {
810 ++zSig0;
811 if ( zSig0 == 0 ) {
812 ++zExp;
813 zSig0 = LIT64( 0x8000000000000000 );
814 }
815 else {
bb98fe42 816 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
817 }
818 }
819 else {
820 if ( zSig0 == 0 ) zExp = 0;
821 }
822 return packFloatx80( zSign, zExp, zSig0 );
823
824}
825
826/*----------------------------------------------------------------------------
827| Takes an abstract floating-point value having sign `zSign', exponent
828| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
829| and returns the proper extended double-precision floating-point value
830| corresponding to the abstract input. This routine is just like
831| `roundAndPackFloatx80' except that the input significand does not have to be
832| normalized.
833*----------------------------------------------------------------------------*/
834
835static floatx80
836 normalizeRoundAndPackFloatx80(
bb98fe42 837 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
838 STATUS_PARAM)
839{
840 int8 shiftCount;
841
842 if ( zSig0 == 0 ) {
843 zSig0 = zSig1;
844 zSig1 = 0;
845 zExp -= 64;
846 }
847 shiftCount = countLeadingZeros64( zSig0 );
848 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
849 zExp -= shiftCount;
850 return
851 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
852
853}
854
158142c2
FB
855/*----------------------------------------------------------------------------
856| Returns the least-significant 64 fraction bits of the quadruple-precision
857| floating-point value `a'.
858*----------------------------------------------------------------------------*/
859
bb98fe42 860INLINE uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
861{
862
863 return a.low;
864
865}
866
867/*----------------------------------------------------------------------------
868| Returns the most-significant 48 fraction bits of the quadruple-precision
869| floating-point value `a'.
870*----------------------------------------------------------------------------*/
871
bb98fe42 872INLINE uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
873{
874
875 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
876
877}
878
879/*----------------------------------------------------------------------------
880| Returns the exponent bits of the quadruple-precision floating-point value
881| `a'.
882*----------------------------------------------------------------------------*/
883
884INLINE int32 extractFloat128Exp( float128 a )
885{
886
887 return ( a.high>>48 ) & 0x7FFF;
888
889}
890
891/*----------------------------------------------------------------------------
892| Returns the sign bit of the quadruple-precision floating-point value `a'.
893*----------------------------------------------------------------------------*/
894
895INLINE flag extractFloat128Sign( float128 a )
896{
897
898 return a.high>>63;
899
900}
901
902/*----------------------------------------------------------------------------
903| Normalizes the subnormal quadruple-precision floating-point value
904| represented by the denormalized significand formed by the concatenation of
905| `aSig0' and `aSig1'. The normalized exponent is stored at the location
906| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
907| significand are stored at the location pointed to by `zSig0Ptr', and the
908| least significant 64 bits of the normalized significand are stored at the
909| location pointed to by `zSig1Ptr'.
910*----------------------------------------------------------------------------*/
911
912static void
913 normalizeFloat128Subnormal(
bb98fe42
AF
914 uint64_t aSig0,
915 uint64_t aSig1,
158142c2 916 int32 *zExpPtr,
bb98fe42
AF
917 uint64_t *zSig0Ptr,
918 uint64_t *zSig1Ptr
158142c2
FB
919 )
920{
921 int8 shiftCount;
922
923 if ( aSig0 == 0 ) {
924 shiftCount = countLeadingZeros64( aSig1 ) - 15;
925 if ( shiftCount < 0 ) {
926 *zSig0Ptr = aSig1>>( - shiftCount );
927 *zSig1Ptr = aSig1<<( shiftCount & 63 );
928 }
929 else {
930 *zSig0Ptr = aSig1<<shiftCount;
931 *zSig1Ptr = 0;
932 }
933 *zExpPtr = - shiftCount - 63;
934 }
935 else {
936 shiftCount = countLeadingZeros64( aSig0 ) - 15;
937 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
938 *zExpPtr = 1 - shiftCount;
939 }
940
941}
942
943/*----------------------------------------------------------------------------
944| Packs the sign `zSign', the exponent `zExp', and the significand formed
945| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
946| floating-point value, returning the result. After being shifted into the
947| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
948| added together to form the most significant 32 bits of the result. This
949| means that any integer portion of `zSig0' will be added into the exponent.
950| Since a properly normalized significand will have an integer portion equal
951| to 1, the `zExp' input should be 1 less than the desired result exponent
952| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
953| significand.
954*----------------------------------------------------------------------------*/
955
956INLINE float128
bb98fe42 957 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
958{
959 float128 z;
960
961 z.low = zSig1;
bb98fe42 962 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
963 return z;
964
965}
966
967/*----------------------------------------------------------------------------
968| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
969| and extended significand formed by the concatenation of `zSig0', `zSig1',
970| and `zSig2', and returns the proper quadruple-precision floating-point value
971| corresponding to the abstract input. Ordinarily, the abstract value is
972| simply rounded and packed into the quadruple-precision format, with the
973| inexact exception raised if the abstract input cannot be represented
974| exactly. However, if the abstract value is too large, the overflow and
975| inexact exceptions are raised and an infinity or maximal finite value is
976| returned. If the abstract value is too small, the input value is rounded to
977| a subnormal number, and the underflow and inexact exceptions are raised if
978| the abstract input cannot be represented exactly as a subnormal quadruple-
979| precision floating-point number.
980| The input significand must be normalized or smaller. If the input
981| significand is not normalized, `zExp' must be 0; in that case, the result
982| returned is a subnormal number, and it must not require rounding. In the
983| usual case that the input significand is normalized, `zExp' must be 1 less
984| than the ``true'' floating-point exponent. The handling of underflow and
985| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
986*----------------------------------------------------------------------------*/
987
988static float128
989 roundAndPackFloat128(
bb98fe42 990 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
158142c2
FB
991{
992 int8 roundingMode;
993 flag roundNearestEven, increment, isTiny;
994
995 roundingMode = STATUS(float_rounding_mode);
996 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 997 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
998 if ( ! roundNearestEven ) {
999 if ( roundingMode == float_round_to_zero ) {
1000 increment = 0;
1001 }
1002 else {
1003 if ( zSign ) {
1004 increment = ( roundingMode == float_round_down ) && zSig2;
1005 }
1006 else {
1007 increment = ( roundingMode == float_round_up ) && zSig2;
1008 }
1009 }
1010 }
bb98fe42 1011 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1012 if ( ( 0x7FFD < zExp )
1013 || ( ( zExp == 0x7FFD )
1014 && eq128(
1015 LIT64( 0x0001FFFFFFFFFFFF ),
1016 LIT64( 0xFFFFFFFFFFFFFFFF ),
1017 zSig0,
1018 zSig1
1019 )
1020 && increment
1021 )
1022 ) {
1023 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024 if ( ( roundingMode == float_round_to_zero )
1025 || ( zSign && ( roundingMode == float_round_up ) )
1026 || ( ! zSign && ( roundingMode == float_round_down ) )
1027 ) {
1028 return
1029 packFloat128(
1030 zSign,
1031 0x7FFE,
1032 LIT64( 0x0000FFFFFFFFFFFF ),
1033 LIT64( 0xFFFFFFFFFFFFFFFF )
1034 );
1035 }
1036 return packFloat128( zSign, 0x7FFF, 0, 0 );
1037 }
1038 if ( zExp < 0 ) {
e6afc87f
PM
1039 if (STATUS(flush_to_zero)) {
1040 float_raise(float_flag_output_denormal STATUS_VAR);
1041 return packFloat128(zSign, 0, 0, 0);
1042 }
158142c2
FB
1043 isTiny =
1044 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045 || ( zExp < -1 )
1046 || ! increment
1047 || lt128(
1048 zSig0,
1049 zSig1,
1050 LIT64( 0x0001FFFFFFFFFFFF ),
1051 LIT64( 0xFFFFFFFFFFFFFFFF )
1052 );
1053 shift128ExtraRightJamming(
1054 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055 zExp = 0;
1056 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057 if ( roundNearestEven ) {
bb98fe42 1058 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
1059 }
1060 else {
1061 if ( zSign ) {
1062 increment = ( roundingMode == float_round_down ) && zSig2;
1063 }
1064 else {
1065 increment = ( roundingMode == float_round_up ) && zSig2;
1066 }
1067 }
1068 }
1069 }
1070 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071 if ( increment ) {
1072 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074 }
1075 else {
1076 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077 }
1078 return packFloat128( zSign, zExp, zSig0, zSig1 );
1079
1080}
1081
1082/*----------------------------------------------------------------------------
1083| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084| and significand formed by the concatenation of `zSig0' and `zSig1', and
1085| returns the proper quadruple-precision floating-point value corresponding
1086| to the abstract input. This routine is just like `roundAndPackFloat128'
1087| except that the input significand has fewer bits and does not have to be
1088| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1089| point exponent.
1090*----------------------------------------------------------------------------*/
1091
1092static float128
1093 normalizeRoundAndPackFloat128(
bb98fe42 1094 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
158142c2
FB
1095{
1096 int8 shiftCount;
bb98fe42 1097 uint64_t zSig2;
158142c2
FB
1098
1099 if ( zSig0 == 0 ) {
1100 zSig0 = zSig1;
1101 zSig1 = 0;
1102 zExp -= 64;
1103 }
1104 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105 if ( 0 <= shiftCount ) {
1106 zSig2 = 0;
1107 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108 }
1109 else {
1110 shift128ExtraRightJamming(
1111 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112 }
1113 zExp -= shiftCount;
1114 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1115
1116}
1117
158142c2
FB
1118/*----------------------------------------------------------------------------
1119| Returns the result of converting the 32-bit two's complement integer `a'
1120| to the single-precision floating-point format. The conversion is performed
1121| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122*----------------------------------------------------------------------------*/
1123
1124float32 int32_to_float32( int32 a STATUS_PARAM )
1125{
1126 flag zSign;
1127
f090c9d4 1128 if ( a == 0 ) return float32_zero;
bb98fe42 1129 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2
FB
1130 zSign = ( a < 0 );
1131 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1132
1133}
1134
1135/*----------------------------------------------------------------------------
1136| Returns the result of converting the 32-bit two's complement integer `a'
1137| to the double-precision floating-point format. The conversion is performed
1138| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139*----------------------------------------------------------------------------*/
1140
1141float64 int32_to_float64( int32 a STATUS_PARAM )
1142{
1143 flag zSign;
1144 uint32 absA;
1145 int8 shiftCount;
bb98fe42 1146 uint64_t zSig;
158142c2 1147
f090c9d4 1148 if ( a == 0 ) return float64_zero;
158142c2
FB
1149 zSign = ( a < 0 );
1150 absA = zSign ? - a : a;
1151 shiftCount = countLeadingZeros32( absA ) + 21;
1152 zSig = absA;
1153 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1154
1155}
1156
158142c2
FB
1157/*----------------------------------------------------------------------------
1158| Returns the result of converting the 32-bit two's complement integer `a'
1159| to the extended double-precision floating-point format. The conversion
1160| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161| Arithmetic.
1162*----------------------------------------------------------------------------*/
1163
1164floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1165{
1166 flag zSign;
1167 uint32 absA;
1168 int8 shiftCount;
bb98fe42 1169 uint64_t zSig;
158142c2
FB
1170
1171 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172 zSign = ( a < 0 );
1173 absA = zSign ? - a : a;
1174 shiftCount = countLeadingZeros32( absA ) + 32;
1175 zSig = absA;
1176 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177
1178}
1179
158142c2
FB
1180/*----------------------------------------------------------------------------
1181| Returns the result of converting the 32-bit two's complement integer `a' to
1182| the quadruple-precision floating-point format. The conversion is performed
1183| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184*----------------------------------------------------------------------------*/
1185
1186float128 int32_to_float128( int32 a STATUS_PARAM )
1187{
1188 flag zSign;
1189 uint32 absA;
1190 int8 shiftCount;
bb98fe42 1191 uint64_t zSig0;
158142c2
FB
1192
1193 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194 zSign = ( a < 0 );
1195 absA = zSign ? - a : a;
1196 shiftCount = countLeadingZeros32( absA ) + 17;
1197 zSig0 = absA;
1198 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199
1200}
1201
158142c2
FB
1202/*----------------------------------------------------------------------------
1203| Returns the result of converting the 64-bit two's complement integer `a'
1204| to the single-precision floating-point format. The conversion is performed
1205| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206*----------------------------------------------------------------------------*/
1207
1208float32 int64_to_float32( int64 a STATUS_PARAM )
1209{
1210 flag zSign;
1211 uint64 absA;
1212 int8 shiftCount;
1213
f090c9d4 1214 if ( a == 0 ) return float32_zero;
158142c2
FB
1215 zSign = ( a < 0 );
1216 absA = zSign ? - a : a;
1217 shiftCount = countLeadingZeros64( absA ) - 40;
1218 if ( 0 <= shiftCount ) {
1219 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1220 }
1221 else {
1222 shiftCount += 7;
1223 if ( shiftCount < 0 ) {
1224 shift64RightJamming( absA, - shiftCount, &absA );
1225 }
1226 else {
1227 absA <<= shiftCount;
1228 }
1229 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230 }
1231
1232}
1233
3430b0be 1234float32 uint64_to_float32( uint64 a STATUS_PARAM )
75d62a58
JM
1235{
1236 int8 shiftCount;
1237
f090c9d4 1238 if ( a == 0 ) return float32_zero;
75d62a58
JM
1239 shiftCount = countLeadingZeros64( a ) - 40;
1240 if ( 0 <= shiftCount ) {
1241 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1242 }
1243 else {
1244 shiftCount += 7;
1245 if ( shiftCount < 0 ) {
1246 shift64RightJamming( a, - shiftCount, &a );
1247 }
1248 else {
1249 a <<= shiftCount;
1250 }
1251 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1252 }
1253}
1254
158142c2
FB
1255/*----------------------------------------------------------------------------
1256| Returns the result of converting the 64-bit two's complement integer `a'
1257| to the double-precision floating-point format. The conversion is performed
1258| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259*----------------------------------------------------------------------------*/
1260
1261float64 int64_to_float64( int64 a STATUS_PARAM )
1262{
1263 flag zSign;
1264
f090c9d4 1265 if ( a == 0 ) return float64_zero;
bb98fe42 1266 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1267 return packFloat64( 1, 0x43E, 0 );
1268 }
1269 zSign = ( a < 0 );
1270 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1271
1272}
1273
75d62a58
JM
1274float64 uint64_to_float64( uint64 a STATUS_PARAM )
1275{
f090c9d4 1276 if ( a == 0 ) return float64_zero;
75d62a58
JM
1277 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1278
1279}
1280
158142c2
FB
1281/*----------------------------------------------------------------------------
1282| Returns the result of converting the 64-bit two's complement integer `a'
1283| to the extended double-precision floating-point format. The conversion
1284| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1285| Arithmetic.
1286*----------------------------------------------------------------------------*/
1287
1288floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1289{
1290 flag zSign;
1291 uint64 absA;
1292 int8 shiftCount;
1293
1294 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1295 zSign = ( a < 0 );
1296 absA = zSign ? - a : a;
1297 shiftCount = countLeadingZeros64( absA );
1298 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1299
1300}
1301
158142c2
FB
1302/*----------------------------------------------------------------------------
1303| Returns the result of converting the 64-bit two's complement integer `a' to
1304| the quadruple-precision floating-point format. The conversion is performed
1305| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1306*----------------------------------------------------------------------------*/
1307
1308float128 int64_to_float128( int64 a STATUS_PARAM )
1309{
1310 flag zSign;
1311 uint64 absA;
1312 int8 shiftCount;
1313 int32 zExp;
bb98fe42 1314 uint64_t zSig0, zSig1;
158142c2
FB
1315
1316 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1317 zSign = ( a < 0 );
1318 absA = zSign ? - a : a;
1319 shiftCount = countLeadingZeros64( absA ) + 49;
1320 zExp = 0x406E - shiftCount;
1321 if ( 64 <= shiftCount ) {
1322 zSig1 = 0;
1323 zSig0 = absA;
1324 shiftCount -= 64;
1325 }
1326 else {
1327 zSig1 = absA;
1328 zSig0 = 0;
1329 }
1330 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1331 return packFloat128( zSign, zExp, zSig0, zSig1 );
1332
1333}
1334
158142c2
FB
1335/*----------------------------------------------------------------------------
1336| Returns the result of converting the single-precision floating-point value
1337| `a' to the 32-bit two's complement integer format. The conversion is
1338| performed according to the IEC/IEEE Standard for Binary Floating-Point
1339| Arithmetic---which means in particular that the conversion is rounded
1340| according to the current rounding mode. If `a' is a NaN, the largest
1341| positive integer is returned. Otherwise, if the conversion overflows, the
1342| largest integer with the same sign as `a' is returned.
1343*----------------------------------------------------------------------------*/
1344
1345int32 float32_to_int32( float32 a STATUS_PARAM )
1346{
1347 flag aSign;
1348 int16 aExp, shiftCount;
bb98fe42
AF
1349 uint32_t aSig;
1350 uint64_t aSig64;
158142c2 1351
37d18660 1352 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1353 aSig = extractFloat32Frac( a );
1354 aExp = extractFloat32Exp( a );
1355 aSign = extractFloat32Sign( a );
1356 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1357 if ( aExp ) aSig |= 0x00800000;
1358 shiftCount = 0xAF - aExp;
1359 aSig64 = aSig;
1360 aSig64 <<= 32;
1361 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1362 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1363
1364}
1365
1366/*----------------------------------------------------------------------------
1367| Returns the result of converting the single-precision floating-point value
1368| `a' to the 32-bit two's complement integer format. The conversion is
1369| performed according to the IEC/IEEE Standard for Binary Floating-Point
1370| Arithmetic, except that the conversion is always rounded toward zero.
1371| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1372| the conversion overflows, the largest integer with the same sign as `a' is
1373| returned.
1374*----------------------------------------------------------------------------*/
1375
1376int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1377{
1378 flag aSign;
1379 int16 aExp, shiftCount;
bb98fe42 1380 uint32_t aSig;
158142c2 1381 int32 z;
37d18660 1382 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1383
1384 aSig = extractFloat32Frac( a );
1385 aExp = extractFloat32Exp( a );
1386 aSign = extractFloat32Sign( a );
1387 shiftCount = aExp - 0x9E;
1388 if ( 0 <= shiftCount ) {
f090c9d4 1389 if ( float32_val(a) != 0xCF000000 ) {
158142c2
FB
1390 float_raise( float_flag_invalid STATUS_VAR);
1391 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1392 }
bb98fe42 1393 return (int32_t) 0x80000000;
158142c2
FB
1394 }
1395 else if ( aExp <= 0x7E ) {
1396 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1397 return 0;
1398 }
1399 aSig = ( aSig | 0x00800000 )<<8;
1400 z = aSig>>( - shiftCount );
bb98fe42 1401 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
158142c2
FB
1402 STATUS(float_exception_flags) |= float_flag_inexact;
1403 }
1404 if ( aSign ) z = - z;
1405 return z;
1406
1407}
1408
cbcef455
PM
1409/*----------------------------------------------------------------------------
1410| Returns the result of converting the single-precision floating-point value
1411| `a' to the 16-bit two's complement integer format. The conversion is
1412| performed according to the IEC/IEEE Standard for Binary Floating-Point
1413| Arithmetic, except that the conversion is always rounded toward zero.
1414| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1415| the conversion overflows, the largest integer with the same sign as `a' is
1416| returned.
1417*----------------------------------------------------------------------------*/
1418
1419int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1420{
1421 flag aSign;
1422 int16 aExp, shiftCount;
bb98fe42 1423 uint32_t aSig;
cbcef455
PM
1424 int32 z;
1425
1426 aSig = extractFloat32Frac( a );
1427 aExp = extractFloat32Exp( a );
1428 aSign = extractFloat32Sign( a );
1429 shiftCount = aExp - 0x8E;
1430 if ( 0 <= shiftCount ) {
1431 if ( float32_val(a) != 0xC7000000 ) {
1432 float_raise( float_flag_invalid STATUS_VAR);
1433 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1434 return 0x7FFF;
1435 }
1436 }
bb98fe42 1437 return (int32_t) 0xffff8000;
cbcef455
PM
1438 }
1439 else if ( aExp <= 0x7E ) {
1440 if ( aExp | aSig ) {
1441 STATUS(float_exception_flags) |= float_flag_inexact;
1442 }
1443 return 0;
1444 }
1445 shiftCount -= 0x10;
1446 aSig = ( aSig | 0x00800000 )<<8;
1447 z = aSig>>( - shiftCount );
bb98fe42 1448 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
cbcef455
PM
1449 STATUS(float_exception_flags) |= float_flag_inexact;
1450 }
1451 if ( aSign ) {
1452 z = - z;
1453 }
1454 return z;
1455
1456}
1457
158142c2
FB
1458/*----------------------------------------------------------------------------
1459| Returns the result of converting the single-precision floating-point value
1460| `a' to the 64-bit two's complement integer format. The conversion is
1461| performed according to the IEC/IEEE Standard for Binary Floating-Point
1462| Arithmetic---which means in particular that the conversion is rounded
1463| according to the current rounding mode. If `a' is a NaN, the largest
1464| positive integer is returned. Otherwise, if the conversion overflows, the
1465| largest integer with the same sign as `a' is returned.
1466*----------------------------------------------------------------------------*/
1467
1468int64 float32_to_int64( float32 a STATUS_PARAM )
1469{
1470 flag aSign;
1471 int16 aExp, shiftCount;
bb98fe42
AF
1472 uint32_t aSig;
1473 uint64_t aSig64, aSigExtra;
37d18660 1474 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1475
1476 aSig = extractFloat32Frac( a );
1477 aExp = extractFloat32Exp( a );
1478 aSign = extractFloat32Sign( a );
1479 shiftCount = 0xBE - aExp;
1480 if ( shiftCount < 0 ) {
1481 float_raise( float_flag_invalid STATUS_VAR);
1482 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1483 return LIT64( 0x7FFFFFFFFFFFFFFF );
1484 }
bb98fe42 1485 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1486 }
1487 if ( aExp ) aSig |= 0x00800000;
1488 aSig64 = aSig;
1489 aSig64 <<= 40;
1490 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1491 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1492
1493}
1494
1495/*----------------------------------------------------------------------------
1496| Returns the result of converting the single-precision floating-point value
1497| `a' to the 64-bit two's complement integer format. The conversion is
1498| performed according to the IEC/IEEE Standard for Binary Floating-Point
1499| Arithmetic, except that the conversion is always rounded toward zero. If
1500| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1501| conversion overflows, the largest integer with the same sign as `a' is
1502| returned.
1503*----------------------------------------------------------------------------*/
1504
1505int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1506{
1507 flag aSign;
1508 int16 aExp, shiftCount;
bb98fe42
AF
1509 uint32_t aSig;
1510 uint64_t aSig64;
158142c2 1511 int64 z;
37d18660 1512 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1513
1514 aSig = extractFloat32Frac( a );
1515 aExp = extractFloat32Exp( a );
1516 aSign = extractFloat32Sign( a );
1517 shiftCount = aExp - 0xBE;
1518 if ( 0 <= shiftCount ) {
f090c9d4 1519 if ( float32_val(a) != 0xDF000000 ) {
158142c2
FB
1520 float_raise( float_flag_invalid STATUS_VAR);
1521 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522 return LIT64( 0x7FFFFFFFFFFFFFFF );
1523 }
1524 }
bb98fe42 1525 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1526 }
1527 else if ( aExp <= 0x7E ) {
1528 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1529 return 0;
1530 }
1531 aSig64 = aSig | 0x00800000;
1532 aSig64 <<= 40;
1533 z = aSig64>>( - shiftCount );
bb98fe42 1534 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
158142c2
FB
1535 STATUS(float_exception_flags) |= float_flag_inexact;
1536 }
1537 if ( aSign ) z = - z;
1538 return z;
1539
1540}
1541
1542/*----------------------------------------------------------------------------
1543| Returns the result of converting the single-precision floating-point value
1544| `a' to the double-precision floating-point format. The conversion is
1545| performed according to the IEC/IEEE Standard for Binary Floating-Point
1546| Arithmetic.
1547*----------------------------------------------------------------------------*/
1548
1549float64 float32_to_float64( float32 a STATUS_PARAM )
1550{
1551 flag aSign;
1552 int16 aExp;
bb98fe42 1553 uint32_t aSig;
37d18660 1554 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1555
1556 aSig = extractFloat32Frac( a );
1557 aExp = extractFloat32Exp( a );
1558 aSign = extractFloat32Sign( a );
1559 if ( aExp == 0xFF ) {
bcd4d9af 1560 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1561 return packFloat64( aSign, 0x7FF, 0 );
1562 }
1563 if ( aExp == 0 ) {
1564 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1565 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1566 --aExp;
1567 }
bb98fe42 1568 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1569
1570}
1571
158142c2
FB
1572/*----------------------------------------------------------------------------
1573| Returns the result of converting the single-precision floating-point value
1574| `a' to the extended double-precision floating-point format. The conversion
1575| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1576| Arithmetic.
1577*----------------------------------------------------------------------------*/
1578
1579floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1580{
1581 flag aSign;
1582 int16 aExp;
bb98fe42 1583 uint32_t aSig;
158142c2 1584
37d18660 1585 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1586 aSig = extractFloat32Frac( a );
1587 aExp = extractFloat32Exp( a );
1588 aSign = extractFloat32Sign( a );
1589 if ( aExp == 0xFF ) {
bcd4d9af 1590 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1591 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1592 }
1593 if ( aExp == 0 ) {
1594 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1595 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1596 }
1597 aSig |= 0x00800000;
bb98fe42 1598 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1599
1600}
1601
158142c2
FB
1602/*----------------------------------------------------------------------------
1603| Returns the result of converting the single-precision floating-point value
1604| `a' to the double-precision floating-point format. The conversion is
1605| performed according to the IEC/IEEE Standard for Binary Floating-Point
1606| Arithmetic.
1607*----------------------------------------------------------------------------*/
1608
1609float128 float32_to_float128( float32 a STATUS_PARAM )
1610{
1611 flag aSign;
1612 int16 aExp;
bb98fe42 1613 uint32_t aSig;
158142c2 1614
37d18660 1615 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1616 aSig = extractFloat32Frac( a );
1617 aExp = extractFloat32Exp( a );
1618 aSign = extractFloat32Sign( a );
1619 if ( aExp == 0xFF ) {
bcd4d9af 1620 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1621 return packFloat128( aSign, 0x7FFF, 0, 0 );
1622 }
1623 if ( aExp == 0 ) {
1624 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1625 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1626 --aExp;
1627 }
bb98fe42 1628 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1629
1630}
1631
158142c2
FB
1632/*----------------------------------------------------------------------------
1633| Rounds the single-precision floating-point value `a' to an integer, and
1634| returns the result as a single-precision floating-point value. The
1635| operation is performed according to the IEC/IEEE Standard for Binary
1636| Floating-Point Arithmetic.
1637*----------------------------------------------------------------------------*/
1638
1639float32 float32_round_to_int( float32 a STATUS_PARAM)
1640{
1641 flag aSign;
1642 int16 aExp;
bb98fe42 1643 uint32_t lastBitMask, roundBitsMask;
158142c2 1644 int8 roundingMode;
bb98fe42 1645 uint32_t z;
37d18660 1646 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1647
1648 aExp = extractFloat32Exp( a );
1649 if ( 0x96 <= aExp ) {
1650 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651 return propagateFloat32NaN( a, a STATUS_VAR );
1652 }
1653 return a;
1654 }
1655 if ( aExp <= 0x7E ) {
bb98fe42 1656 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
158142c2
FB
1657 STATUS(float_exception_flags) |= float_flag_inexact;
1658 aSign = extractFloat32Sign( a );
1659 switch ( STATUS(float_rounding_mode) ) {
1660 case float_round_nearest_even:
1661 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662 return packFloat32( aSign, 0x7F, 0 );
1663 }
1664 break;
1665 case float_round_down:
f090c9d4 1666 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1667 case float_round_up:
f090c9d4 1668 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1669 }
1670 return packFloat32( aSign, 0, 0 );
1671 }
1672 lastBitMask = 1;
1673 lastBitMask <<= 0x96 - aExp;
1674 roundBitsMask = lastBitMask - 1;
f090c9d4 1675 z = float32_val(a);
158142c2
FB
1676 roundingMode = STATUS(float_rounding_mode);
1677 if ( roundingMode == float_round_nearest_even ) {
1678 z += lastBitMask>>1;
1679 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1680 }
1681 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 1682 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
1683 z += roundBitsMask;
1684 }
1685 }
1686 z &= ~ roundBitsMask;
f090c9d4
PB
1687 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1688 return make_float32(z);
158142c2
FB
1689
1690}
1691
1692/*----------------------------------------------------------------------------
1693| Returns the result of adding the absolute values of the single-precision
1694| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1695| before being returned. `zSign' is ignored if the result is a NaN.
1696| The addition is performed according to the IEC/IEEE Standard for Binary
1697| Floating-Point Arithmetic.
1698*----------------------------------------------------------------------------*/
1699
1700static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1701{
1702 int16 aExp, bExp, zExp;
bb98fe42 1703 uint32_t aSig, bSig, zSig;
158142c2
FB
1704 int16 expDiff;
1705
1706 aSig = extractFloat32Frac( a );
1707 aExp = extractFloat32Exp( a );
1708 bSig = extractFloat32Frac( b );
1709 bExp = extractFloat32Exp( b );
1710 expDiff = aExp - bExp;
1711 aSig <<= 6;
1712 bSig <<= 6;
1713 if ( 0 < expDiff ) {
1714 if ( aExp == 0xFF ) {
1715 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1716 return a;
1717 }
1718 if ( bExp == 0 ) {
1719 --expDiff;
1720 }
1721 else {
1722 bSig |= 0x20000000;
1723 }
1724 shift32RightJamming( bSig, expDiff, &bSig );
1725 zExp = aExp;
1726 }
1727 else if ( expDiff < 0 ) {
1728 if ( bExp == 0xFF ) {
1729 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1730 return packFloat32( zSign, 0xFF, 0 );
1731 }
1732 if ( aExp == 0 ) {
1733 ++expDiff;
1734 }
1735 else {
1736 aSig |= 0x20000000;
1737 }
1738 shift32RightJamming( aSig, - expDiff, &aSig );
1739 zExp = bExp;
1740 }
1741 else {
1742 if ( aExp == 0xFF ) {
1743 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1744 return a;
1745 }
fe76d976 1746 if ( aExp == 0 ) {
e6afc87f
PM
1747 if (STATUS(flush_to_zero)) {
1748 if (aSig | bSig) {
1749 float_raise(float_flag_output_denormal STATUS_VAR);
1750 }
1751 return packFloat32(zSign, 0, 0);
1752 }
fe76d976
PB
1753 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1754 }
158142c2
FB
1755 zSig = 0x40000000 + aSig + bSig;
1756 zExp = aExp;
1757 goto roundAndPack;
1758 }
1759 aSig |= 0x20000000;
1760 zSig = ( aSig + bSig )<<1;
1761 --zExp;
bb98fe42 1762 if ( (int32_t) zSig < 0 ) {
158142c2
FB
1763 zSig = aSig + bSig;
1764 ++zExp;
1765 }
1766 roundAndPack:
1767 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1768
1769}
1770
1771/*----------------------------------------------------------------------------
1772| Returns the result of subtracting the absolute values of the single-
1773| precision floating-point values `a' and `b'. If `zSign' is 1, the
1774| difference is negated before being returned. `zSign' is ignored if the
1775| result is a NaN. The subtraction is performed according to the IEC/IEEE
1776| Standard for Binary Floating-Point Arithmetic.
1777*----------------------------------------------------------------------------*/
1778
1779static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1780{
1781 int16 aExp, bExp, zExp;
bb98fe42 1782 uint32_t aSig, bSig, zSig;
158142c2
FB
1783 int16 expDiff;
1784
1785 aSig = extractFloat32Frac( a );
1786 aExp = extractFloat32Exp( a );
1787 bSig = extractFloat32Frac( b );
1788 bExp = extractFloat32Exp( b );
1789 expDiff = aExp - bExp;
1790 aSig <<= 7;
1791 bSig <<= 7;
1792 if ( 0 < expDiff ) goto aExpBigger;
1793 if ( expDiff < 0 ) goto bExpBigger;
1794 if ( aExp == 0xFF ) {
1795 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1796 float_raise( float_flag_invalid STATUS_VAR);
1797 return float32_default_nan;
1798 }
1799 if ( aExp == 0 ) {
1800 aExp = 1;
1801 bExp = 1;
1802 }
1803 if ( bSig < aSig ) goto aBigger;
1804 if ( aSig < bSig ) goto bBigger;
1805 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1806 bExpBigger:
1807 if ( bExp == 0xFF ) {
1808 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1809 return packFloat32( zSign ^ 1, 0xFF, 0 );
1810 }
1811 if ( aExp == 0 ) {
1812 ++expDiff;
1813 }
1814 else {
1815 aSig |= 0x40000000;
1816 }
1817 shift32RightJamming( aSig, - expDiff, &aSig );
1818 bSig |= 0x40000000;
1819 bBigger:
1820 zSig = bSig - aSig;
1821 zExp = bExp;
1822 zSign ^= 1;
1823 goto normalizeRoundAndPack;
1824 aExpBigger:
1825 if ( aExp == 0xFF ) {
1826 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827 return a;
1828 }
1829 if ( bExp == 0 ) {
1830 --expDiff;
1831 }
1832 else {
1833 bSig |= 0x40000000;
1834 }
1835 shift32RightJamming( bSig, expDiff, &bSig );
1836 aSig |= 0x40000000;
1837 aBigger:
1838 zSig = aSig - bSig;
1839 zExp = aExp;
1840 normalizeRoundAndPack:
1841 --zExp;
1842 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1843
1844}
1845
1846/*----------------------------------------------------------------------------
1847| Returns the result of adding the single-precision floating-point values `a'
1848| and `b'. The operation is performed according to the IEC/IEEE Standard for
1849| Binary Floating-Point Arithmetic.
1850*----------------------------------------------------------------------------*/
1851
1852float32 float32_add( float32 a, float32 b STATUS_PARAM )
1853{
1854 flag aSign, bSign;
37d18660
PM
1855 a = float32_squash_input_denormal(a STATUS_VAR);
1856 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1857
1858 aSign = extractFloat32Sign( a );
1859 bSign = extractFloat32Sign( b );
1860 if ( aSign == bSign ) {
1861 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1862 }
1863 else {
1864 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1865 }
1866
1867}
1868
1869/*----------------------------------------------------------------------------
1870| Returns the result of subtracting the single-precision floating-point values
1871| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1872| for Binary Floating-Point Arithmetic.
1873*----------------------------------------------------------------------------*/
1874
1875float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1876{
1877 flag aSign, bSign;
37d18660
PM
1878 a = float32_squash_input_denormal(a STATUS_VAR);
1879 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1880
1881 aSign = extractFloat32Sign( a );
1882 bSign = extractFloat32Sign( b );
1883 if ( aSign == bSign ) {
1884 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1885 }
1886 else {
1887 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1888 }
1889
1890}
1891
1892/*----------------------------------------------------------------------------
1893| Returns the result of multiplying the single-precision floating-point values
1894| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1895| for Binary Floating-Point Arithmetic.
1896*----------------------------------------------------------------------------*/
1897
1898float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1899{
1900 flag aSign, bSign, zSign;
1901 int16 aExp, bExp, zExp;
bb98fe42
AF
1902 uint32_t aSig, bSig;
1903 uint64_t zSig64;
1904 uint32_t zSig;
158142c2 1905
37d18660
PM
1906 a = float32_squash_input_denormal(a STATUS_VAR);
1907 b = float32_squash_input_denormal(b STATUS_VAR);
1908
158142c2
FB
1909 aSig = extractFloat32Frac( a );
1910 aExp = extractFloat32Exp( a );
1911 aSign = extractFloat32Sign( a );
1912 bSig = extractFloat32Frac( b );
1913 bExp = extractFloat32Exp( b );
1914 bSign = extractFloat32Sign( b );
1915 zSign = aSign ^ bSign;
1916 if ( aExp == 0xFF ) {
1917 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1918 return propagateFloat32NaN( a, b STATUS_VAR );
1919 }
1920 if ( ( bExp | bSig ) == 0 ) {
1921 float_raise( float_flag_invalid STATUS_VAR);
1922 return float32_default_nan;
1923 }
1924 return packFloat32( zSign, 0xFF, 0 );
1925 }
1926 if ( bExp == 0xFF ) {
1927 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1928 if ( ( aExp | aSig ) == 0 ) {
1929 float_raise( float_flag_invalid STATUS_VAR);
1930 return float32_default_nan;
1931 }
1932 return packFloat32( zSign, 0xFF, 0 );
1933 }
1934 if ( aExp == 0 ) {
1935 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1936 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1937 }
1938 if ( bExp == 0 ) {
1939 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1940 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1941 }
1942 zExp = aExp + bExp - 0x7F;
1943 aSig = ( aSig | 0x00800000 )<<7;
1944 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 1945 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 1946 zSig = zSig64;
bb98fe42 1947 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
1948 zSig <<= 1;
1949 --zExp;
1950 }
1951 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1952
1953}
1954
1955/*----------------------------------------------------------------------------
1956| Returns the result of dividing the single-precision floating-point value `a'
1957| by the corresponding value `b'. The operation is performed according to the
1958| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959*----------------------------------------------------------------------------*/
1960
1961float32 float32_div( float32 a, float32 b STATUS_PARAM )
1962{
1963 flag aSign, bSign, zSign;
1964 int16 aExp, bExp, zExp;
bb98fe42 1965 uint32_t aSig, bSig, zSig;
37d18660
PM
1966 a = float32_squash_input_denormal(a STATUS_VAR);
1967 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1968
1969 aSig = extractFloat32Frac( a );
1970 aExp = extractFloat32Exp( a );
1971 aSign = extractFloat32Sign( a );
1972 bSig = extractFloat32Frac( b );
1973 bExp = extractFloat32Exp( b );
1974 bSign = extractFloat32Sign( b );
1975 zSign = aSign ^ bSign;
1976 if ( aExp == 0xFF ) {
1977 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1978 if ( bExp == 0xFF ) {
1979 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1980 float_raise( float_flag_invalid STATUS_VAR);
1981 return float32_default_nan;
1982 }
1983 return packFloat32( zSign, 0xFF, 0 );
1984 }
1985 if ( bExp == 0xFF ) {
1986 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987 return packFloat32( zSign, 0, 0 );
1988 }
1989 if ( bExp == 0 ) {
1990 if ( bSig == 0 ) {
1991 if ( ( aExp | aSig ) == 0 ) {
1992 float_raise( float_flag_invalid STATUS_VAR);
1993 return float32_default_nan;
1994 }
1995 float_raise( float_flag_divbyzero STATUS_VAR);
1996 return packFloat32( zSign, 0xFF, 0 );
1997 }
1998 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1999 }
2000 if ( aExp == 0 ) {
2001 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2002 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2003 }
2004 zExp = aExp - bExp + 0x7D;
2005 aSig = ( aSig | 0x00800000 )<<7;
2006 bSig = ( bSig | 0x00800000 )<<8;
2007 if ( bSig <= ( aSig + aSig ) ) {
2008 aSig >>= 1;
2009 ++zExp;
2010 }
bb98fe42 2011 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2012 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2013 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2
FB
2014 }
2015 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2016
2017}
2018
2019/*----------------------------------------------------------------------------
2020| Returns the remainder of the single-precision floating-point value `a'
2021| with respect to the corresponding value `b'. The operation is performed
2022| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023*----------------------------------------------------------------------------*/
2024
2025float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2026{
ed086f3d 2027 flag aSign, zSign;
158142c2 2028 int16 aExp, bExp, expDiff;
bb98fe42
AF
2029 uint32_t aSig, bSig;
2030 uint32_t q;
2031 uint64_t aSig64, bSig64, q64;
2032 uint32_t alternateASig;
2033 int32_t sigMean;
37d18660
PM
2034 a = float32_squash_input_denormal(a STATUS_VAR);
2035 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2036
2037 aSig = extractFloat32Frac( a );
2038 aExp = extractFloat32Exp( a );
2039 aSign = extractFloat32Sign( a );
2040 bSig = extractFloat32Frac( b );
2041 bExp = extractFloat32Exp( b );
158142c2
FB
2042 if ( aExp == 0xFF ) {
2043 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2044 return propagateFloat32NaN( a, b STATUS_VAR );
2045 }
2046 float_raise( float_flag_invalid STATUS_VAR);
2047 return float32_default_nan;
2048 }
2049 if ( bExp == 0xFF ) {
2050 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2051 return a;
2052 }
2053 if ( bExp == 0 ) {
2054 if ( bSig == 0 ) {
2055 float_raise( float_flag_invalid STATUS_VAR);
2056 return float32_default_nan;
2057 }
2058 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2059 }
2060 if ( aExp == 0 ) {
2061 if ( aSig == 0 ) return a;
2062 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2063 }
2064 expDiff = aExp - bExp;
2065 aSig |= 0x00800000;
2066 bSig |= 0x00800000;
2067 if ( expDiff < 32 ) {
2068 aSig <<= 8;
2069 bSig <<= 8;
2070 if ( expDiff < 0 ) {
2071 if ( expDiff < -1 ) return a;
2072 aSig >>= 1;
2073 }
2074 q = ( bSig <= aSig );
2075 if ( q ) aSig -= bSig;
2076 if ( 0 < expDiff ) {
bb98fe42 2077 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2078 q >>= 32 - expDiff;
2079 bSig >>= 2;
2080 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2081 }
2082 else {
2083 aSig >>= 2;
2084 bSig >>= 2;
2085 }
2086 }
2087 else {
2088 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2089 aSig64 = ( (uint64_t) aSig )<<40;
2090 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2091 expDiff -= 64;
2092 while ( 0 < expDiff ) {
2093 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2094 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2095 aSig64 = - ( ( bSig * q64 )<<38 );
2096 expDiff -= 62;
2097 }
2098 expDiff += 64;
2099 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2100 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2101 q = q64>>( 64 - expDiff );
2102 bSig <<= 6;
2103 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2104 }
2105 do {
2106 alternateASig = aSig;
2107 ++q;
2108 aSig -= bSig;
bb98fe42 2109 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2110 sigMean = aSig + alternateASig;
2111 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2112 aSig = alternateASig;
2113 }
bb98fe42 2114 zSign = ( (int32_t) aSig < 0 );
158142c2
FB
2115 if ( zSign ) aSig = - aSig;
2116 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2117
2118}
2119
2120/*----------------------------------------------------------------------------
2121| Returns the square root of the single-precision floating-point value `a'.
2122| The operation is performed according to the IEC/IEEE Standard for Binary
2123| Floating-Point Arithmetic.
2124*----------------------------------------------------------------------------*/
2125
2126float32 float32_sqrt( float32 a STATUS_PARAM )
2127{
2128 flag aSign;
2129 int16 aExp, zExp;
bb98fe42
AF
2130 uint32_t aSig, zSig;
2131 uint64_t rem, term;
37d18660 2132 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2133
2134 aSig = extractFloat32Frac( a );
2135 aExp = extractFloat32Exp( a );
2136 aSign = extractFloat32Sign( a );
2137 if ( aExp == 0xFF ) {
f090c9d4 2138 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
158142c2
FB
2139 if ( ! aSign ) return a;
2140 float_raise( float_flag_invalid STATUS_VAR);
2141 return float32_default_nan;
2142 }
2143 if ( aSign ) {
2144 if ( ( aExp | aSig ) == 0 ) return a;
2145 float_raise( float_flag_invalid STATUS_VAR);
2146 return float32_default_nan;
2147 }
2148 if ( aExp == 0 ) {
f090c9d4 2149 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2150 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2151 }
2152 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2153 aSig = ( aSig | 0x00800000 )<<8;
2154 zSig = estimateSqrt32( aExp, aSig ) + 2;
2155 if ( ( zSig & 0x7F ) <= 5 ) {
2156 if ( zSig < 2 ) {
2157 zSig = 0x7FFFFFFF;
2158 goto roundAndPack;
2159 }
2160 aSig >>= aExp & 1;
bb98fe42
AF
2161 term = ( (uint64_t) zSig ) * zSig;
2162 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2163 while ( (int64_t) rem < 0 ) {
158142c2 2164 --zSig;
bb98fe42 2165 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2166 }
2167 zSig |= ( rem != 0 );
2168 }
2169 shift32RightJamming( zSig, 1, &zSig );
2170 roundAndPack:
2171 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2172
2173}
2174
8229c991
AJ
2175/*----------------------------------------------------------------------------
2176| Returns the binary exponential of the single-precision floating-point value
2177| `a'. The operation is performed according to the IEC/IEEE Standard for
2178| Binary Floating-Point Arithmetic.
2179|
2180| Uses the following identities:
2181|
2182| 1. -------------------------------------------------------------------------
2183| x x*ln(2)
2184| 2 = e
2185|
2186| 2. -------------------------------------------------------------------------
2187| 2 3 4 5 n
2188| x x x x x x x
2189| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2190| 1! 2! 3! 4! 5! n!
2191*----------------------------------------------------------------------------*/
2192
2193static const float64 float32_exp2_coefficients[15] =
2194{
d5138cf4
PM
2195 const_float64( 0x3ff0000000000000ll ), /* 1 */
2196 const_float64( 0x3fe0000000000000ll ), /* 2 */
2197 const_float64( 0x3fc5555555555555ll ), /* 3 */
2198 const_float64( 0x3fa5555555555555ll ), /* 4 */
2199 const_float64( 0x3f81111111111111ll ), /* 5 */
2200 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2201 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2202 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2203 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2204 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2205 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2206 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2207 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2208 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2209 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2210};
2211
2212float32 float32_exp2( float32 a STATUS_PARAM )
2213{
2214 flag aSign;
2215 int16 aExp;
bb98fe42 2216 uint32_t aSig;
8229c991
AJ
2217 float64 r, x, xn;
2218 int i;
37d18660 2219 a = float32_squash_input_denormal(a STATUS_VAR);
8229c991
AJ
2220
2221 aSig = extractFloat32Frac( a );
2222 aExp = extractFloat32Exp( a );
2223 aSign = extractFloat32Sign( a );
2224
2225 if ( aExp == 0xFF) {
2226 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2227 return (aSign) ? float32_zero : a;
2228 }
2229 if (aExp == 0) {
2230 if (aSig == 0) return float32_one;
2231 }
2232
2233 float_raise( float_flag_inexact STATUS_VAR);
2234
2235 /* ******************************* */
2236 /* using float64 for approximation */
2237 /* ******************************* */
2238 x = float32_to_float64(a STATUS_VAR);
2239 x = float64_mul(x, float64_ln2 STATUS_VAR);
2240
2241 xn = x;
2242 r = float64_one;
2243 for (i = 0 ; i < 15 ; i++) {
2244 float64 f;
2245
2246 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2247 r = float64_add(r, f STATUS_VAR);
2248
2249 xn = float64_mul(xn, x STATUS_VAR);
2250 }
2251
2252 return float64_to_float32(r, status);
2253}
2254
374dfc33
AJ
2255/*----------------------------------------------------------------------------
2256| Returns the binary log of the single-precision floating-point value `a'.
2257| The operation is performed according to the IEC/IEEE Standard for Binary
2258| Floating-Point Arithmetic.
2259*----------------------------------------------------------------------------*/
2260float32 float32_log2( float32 a STATUS_PARAM )
2261{
2262 flag aSign, zSign;
2263 int16 aExp;
bb98fe42 2264 uint32_t aSig, zSig, i;
374dfc33 2265
37d18660 2266 a = float32_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
2267 aSig = extractFloat32Frac( a );
2268 aExp = extractFloat32Exp( a );
2269 aSign = extractFloat32Sign( a );
2270
2271 if ( aExp == 0 ) {
2272 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2273 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2274 }
2275 if ( aSign ) {
2276 float_raise( float_flag_invalid STATUS_VAR);
2277 return float32_default_nan;
2278 }
2279 if ( aExp == 0xFF ) {
2280 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2281 return a;
2282 }
2283
2284 aExp -= 0x7F;
2285 aSig |= 0x00800000;
2286 zSign = aExp < 0;
2287 zSig = aExp << 23;
2288
2289 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2290 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2291 if ( aSig & 0x01000000 ) {
2292 aSig >>= 1;
2293 zSig |= i;
2294 }
2295 }
2296
2297 if ( zSign )
2298 zSig = -zSig;
2299
2300 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2301}
2302
158142c2
FB
2303/*----------------------------------------------------------------------------
2304| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2305| the corresponding value `b', and 0 otherwise. The invalid exception is
2306| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2307| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2308*----------------------------------------------------------------------------*/
2309
b689362d 2310int float32_eq( float32 a, float32 b STATUS_PARAM )
158142c2 2311{
b689362d 2312 uint32_t av, bv;
37d18660
PM
2313 a = float32_squash_input_denormal(a STATUS_VAR);
2314 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2315
2316 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2317 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2318 ) {
b689362d 2319 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
2320 return 0;
2321 }
b689362d
AJ
2322 av = float32_val(a);
2323 bv = float32_val(b);
2324 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2325}
2326
2327/*----------------------------------------------------------------------------
2328| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2329| or equal to the corresponding value `b', and 0 otherwise. The invalid
2330| exception is raised if either operand is a NaN. The comparison is performed
2331| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2332*----------------------------------------------------------------------------*/
2333
750afe93 2334int float32_le( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2335{
2336 flag aSign, bSign;
bb98fe42 2337 uint32_t av, bv;
37d18660
PM
2338 a = float32_squash_input_denormal(a STATUS_VAR);
2339 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2340
2341 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2342 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2343 ) {
2344 float_raise( float_flag_invalid STATUS_VAR);
2345 return 0;
2346 }
2347 aSign = extractFloat32Sign( a );
2348 bSign = extractFloat32Sign( b );
f090c9d4
PB
2349 av = float32_val(a);
2350 bv = float32_val(b);
bb98fe42 2351 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2352 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2353
2354}
2355
2356/*----------------------------------------------------------------------------
2357| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2358| the corresponding value `b', and 0 otherwise. The invalid exception is
2359| raised if either operand is a NaN. The comparison is performed according
2360| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2361*----------------------------------------------------------------------------*/
2362
750afe93 2363int float32_lt( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2364{
2365 flag aSign, bSign;
bb98fe42 2366 uint32_t av, bv;
37d18660
PM
2367 a = float32_squash_input_denormal(a STATUS_VAR);
2368 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2369
2370 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2371 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2372 ) {
2373 float_raise( float_flag_invalid STATUS_VAR);
2374 return 0;
2375 }
2376 aSign = extractFloat32Sign( a );
2377 bSign = extractFloat32Sign( b );
f090c9d4
PB
2378 av = float32_val(a);
2379 bv = float32_val(b);
bb98fe42 2380 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2381 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2382
2383}
2384
67b7861d
AJ
2385/*----------------------------------------------------------------------------
2386| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2387| be compared, and 0 otherwise. The invalid exception is raised if either
2388| operand is a NaN. The comparison is performed according to the IEC/IEEE
2389| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2390*----------------------------------------------------------------------------*/
2391
2392int float32_unordered( float32 a, float32 b STATUS_PARAM )
2393{
2394 a = float32_squash_input_denormal(a STATUS_VAR);
2395 b = float32_squash_input_denormal(b STATUS_VAR);
2396
2397 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2398 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2399 ) {
2400 float_raise( float_flag_invalid STATUS_VAR);
2401 return 1;
2402 }
2403 return 0;
2404}
b689362d 2405
158142c2
FB
2406/*----------------------------------------------------------------------------
2407| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2408| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2409| exception. The comparison is performed according to the IEC/IEEE Standard
2410| for Binary Floating-Point Arithmetic.
158142c2
FB
2411*----------------------------------------------------------------------------*/
2412
b689362d 2413int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
158142c2 2414{
37d18660
PM
2415 a = float32_squash_input_denormal(a STATUS_VAR);
2416 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2417
2418 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2419 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2420 ) {
b689362d
AJ
2421 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2422 float_raise( float_flag_invalid STATUS_VAR);
2423 }
158142c2
FB
2424 return 0;
2425 }
b689362d
AJ
2426 return ( float32_val(a) == float32_val(b) ) ||
2427 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2428}
2429
2430/*----------------------------------------------------------------------------
2431| Returns 1 if the single-precision floating-point value `a' is less than or
2432| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2433| cause an exception. Otherwise, the comparison is performed according to the
2434| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2435*----------------------------------------------------------------------------*/
2436
750afe93 2437int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2438{
2439 flag aSign, bSign;
bb98fe42 2440 uint32_t av, bv;
37d18660
PM
2441 a = float32_squash_input_denormal(a STATUS_VAR);
2442 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2443
2444 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2445 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2446 ) {
2447 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2448 float_raise( float_flag_invalid STATUS_VAR);
2449 }
2450 return 0;
2451 }
2452 aSign = extractFloat32Sign( a );
2453 bSign = extractFloat32Sign( b );
f090c9d4
PB
2454 av = float32_val(a);
2455 bv = float32_val(b);
bb98fe42 2456 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2457 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2458
2459}
2460
2461/*----------------------------------------------------------------------------
2462| Returns 1 if the single-precision floating-point value `a' is less than
2463| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2464| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2465| Standard for Binary Floating-Point Arithmetic.
2466*----------------------------------------------------------------------------*/
2467
750afe93 2468int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2469{
2470 flag aSign, bSign;
bb98fe42 2471 uint32_t av, bv;
37d18660
PM
2472 a = float32_squash_input_denormal(a STATUS_VAR);
2473 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2474
2475 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2476 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2477 ) {
2478 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2479 float_raise( float_flag_invalid STATUS_VAR);
2480 }
2481 return 0;
2482 }
2483 aSign = extractFloat32Sign( a );
2484 bSign = extractFloat32Sign( b );
f090c9d4
PB
2485 av = float32_val(a);
2486 bv = float32_val(b);
bb98fe42 2487 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2488 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2489
2490}
2491
67b7861d
AJ
2492/*----------------------------------------------------------------------------
2493| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2494| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2495| comparison is performed according to the IEC/IEEE Standard for Binary
2496| Floating-Point Arithmetic.
2497*----------------------------------------------------------------------------*/
2498
2499int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2500{
2501 a = float32_squash_input_denormal(a STATUS_VAR);
2502 b = float32_squash_input_denormal(b STATUS_VAR);
2503
2504 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2505 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2506 ) {
2507 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2508 float_raise( float_flag_invalid STATUS_VAR);
2509 }
2510 return 1;
2511 }
2512 return 0;
2513}
2514
158142c2
FB
2515/*----------------------------------------------------------------------------
2516| Returns the result of converting the double-precision floating-point value
2517| `a' to the 32-bit two's complement integer format. The conversion is
2518| performed according to the IEC/IEEE Standard for Binary Floating-Point
2519| Arithmetic---which means in particular that the conversion is rounded
2520| according to the current rounding mode. If `a' is a NaN, the largest
2521| positive integer is returned. Otherwise, if the conversion overflows, the
2522| largest integer with the same sign as `a' is returned.
2523*----------------------------------------------------------------------------*/
2524
2525int32 float64_to_int32( float64 a STATUS_PARAM )
2526{
2527 flag aSign;
2528 int16 aExp, shiftCount;
bb98fe42 2529 uint64_t aSig;
37d18660 2530 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2531
2532 aSig = extractFloat64Frac( a );
2533 aExp = extractFloat64Exp( a );
2534 aSign = extractFloat64Sign( a );
2535 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2536 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2537 shiftCount = 0x42C - aExp;
2538 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2539 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2540
2541}
2542
2543/*----------------------------------------------------------------------------
2544| Returns the result of converting the double-precision floating-point value
2545| `a' to the 32-bit two's complement integer format. The conversion is
2546| performed according to the IEC/IEEE Standard for Binary Floating-Point
2547| Arithmetic, except that the conversion is always rounded toward zero.
2548| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2549| the conversion overflows, the largest integer with the same sign as `a' is
2550| returned.
2551*----------------------------------------------------------------------------*/
2552
2553int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2554{
2555 flag aSign;
2556 int16 aExp, shiftCount;
bb98fe42 2557 uint64_t aSig, savedASig;
158142c2 2558 int32 z;
37d18660 2559 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2560
2561 aSig = extractFloat64Frac( a );
2562 aExp = extractFloat64Exp( a );
2563 aSign = extractFloat64Sign( a );
2564 if ( 0x41E < aExp ) {
2565 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2566 goto invalid;
2567 }
2568 else if ( aExp < 0x3FF ) {
2569 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2570 return 0;
2571 }
2572 aSig |= LIT64( 0x0010000000000000 );
2573 shiftCount = 0x433 - aExp;
2574 savedASig = aSig;
2575 aSig >>= shiftCount;
2576 z = aSig;
2577 if ( aSign ) z = - z;
2578 if ( ( z < 0 ) ^ aSign ) {
2579 invalid:
2580 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2581 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
2582 }
2583 if ( ( aSig<<shiftCount ) != savedASig ) {
2584 STATUS(float_exception_flags) |= float_flag_inexact;
2585 }
2586 return z;
2587
2588}
2589
cbcef455
PM
2590/*----------------------------------------------------------------------------
2591| Returns the result of converting the double-precision floating-point value
2592| `a' to the 16-bit two's complement integer format. The conversion is
2593| performed according to the IEC/IEEE Standard for Binary Floating-Point
2594| Arithmetic, except that the conversion is always rounded toward zero.
2595| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2596| the conversion overflows, the largest integer with the same sign as `a' is
2597| returned.
2598*----------------------------------------------------------------------------*/
2599
2600int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2601{
2602 flag aSign;
2603 int16 aExp, shiftCount;
bb98fe42 2604 uint64_t aSig, savedASig;
cbcef455
PM
2605 int32 z;
2606
2607 aSig = extractFloat64Frac( a );
2608 aExp = extractFloat64Exp( a );
2609 aSign = extractFloat64Sign( a );
2610 if ( 0x40E < aExp ) {
2611 if ( ( aExp == 0x7FF ) && aSig ) {
2612 aSign = 0;
2613 }
2614 goto invalid;
2615 }
2616 else if ( aExp < 0x3FF ) {
2617 if ( aExp || aSig ) {
2618 STATUS(float_exception_flags) |= float_flag_inexact;
2619 }
2620 return 0;
2621 }
2622 aSig |= LIT64( 0x0010000000000000 );
2623 shiftCount = 0x433 - aExp;
2624 savedASig = aSig;
2625 aSig >>= shiftCount;
2626 z = aSig;
2627 if ( aSign ) {
2628 z = - z;
2629 }
2630 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2631 invalid:
2632 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2633 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
2634 }
2635 if ( ( aSig<<shiftCount ) != savedASig ) {
2636 STATUS(float_exception_flags) |= float_flag_inexact;
2637 }
2638 return z;
2639}
2640
158142c2
FB
2641/*----------------------------------------------------------------------------
2642| Returns the result of converting the double-precision floating-point value
2643| `a' to the 64-bit two's complement integer format. The conversion is
2644| performed according to the IEC/IEEE Standard for Binary Floating-Point
2645| Arithmetic---which means in particular that the conversion is rounded
2646| according to the current rounding mode. If `a' is a NaN, the largest
2647| positive integer is returned. Otherwise, if the conversion overflows, the
2648| largest integer with the same sign as `a' is returned.
2649*----------------------------------------------------------------------------*/
2650
2651int64 float64_to_int64( float64 a STATUS_PARAM )
2652{
2653 flag aSign;
2654 int16 aExp, shiftCount;
bb98fe42 2655 uint64_t aSig, aSigExtra;
37d18660 2656 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2657
2658 aSig = extractFloat64Frac( a );
2659 aExp = extractFloat64Exp( a );
2660 aSign = extractFloat64Sign( a );
2661 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2662 shiftCount = 0x433 - aExp;
2663 if ( shiftCount <= 0 ) {
2664 if ( 0x43E < aExp ) {
2665 float_raise( float_flag_invalid STATUS_VAR);
2666 if ( ! aSign
2667 || ( ( aExp == 0x7FF )
2668 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2669 ) {
2670 return LIT64( 0x7FFFFFFFFFFFFFFF );
2671 }
bb98fe42 2672 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2673 }
2674 aSigExtra = 0;
2675 aSig <<= - shiftCount;
2676 }
2677 else {
2678 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2679 }
2680 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2681
2682}
2683
2684/*----------------------------------------------------------------------------
2685| Returns the result of converting the double-precision floating-point value
2686| `a' to the 64-bit two's complement integer format. The conversion is
2687| performed according to the IEC/IEEE Standard for Binary Floating-Point
2688| Arithmetic, except that the conversion is always rounded toward zero.
2689| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2690| the conversion overflows, the largest integer with the same sign as `a' is
2691| returned.
2692*----------------------------------------------------------------------------*/
2693
2694int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2695{
2696 flag aSign;
2697 int16 aExp, shiftCount;
bb98fe42 2698 uint64_t aSig;
158142c2 2699 int64 z;
37d18660 2700 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2701
2702 aSig = extractFloat64Frac( a );
2703 aExp = extractFloat64Exp( a );
2704 aSign = extractFloat64Sign( a );
2705 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2706 shiftCount = aExp - 0x433;
2707 if ( 0 <= shiftCount ) {
2708 if ( 0x43E <= aExp ) {
f090c9d4 2709 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
158142c2
FB
2710 float_raise( float_flag_invalid STATUS_VAR);
2711 if ( ! aSign
2712 || ( ( aExp == 0x7FF )
2713 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2714 ) {
2715 return LIT64( 0x7FFFFFFFFFFFFFFF );
2716 }
2717 }
bb98fe42 2718 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2719 }
2720 z = aSig<<shiftCount;
2721 }
2722 else {
2723 if ( aExp < 0x3FE ) {
2724 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2725 return 0;
2726 }
2727 z = aSig>>( - shiftCount );
bb98fe42 2728 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
2729 STATUS(float_exception_flags) |= float_flag_inexact;
2730 }
2731 }
2732 if ( aSign ) z = - z;
2733 return z;
2734
2735}
2736
2737/*----------------------------------------------------------------------------
2738| Returns the result of converting the double-precision floating-point value
2739| `a' to the single-precision floating-point format. The conversion is
2740| performed according to the IEC/IEEE Standard for Binary Floating-Point
2741| Arithmetic.
2742*----------------------------------------------------------------------------*/
2743
2744float32 float64_to_float32( float64 a STATUS_PARAM )
2745{
2746 flag aSign;
2747 int16 aExp;
bb98fe42
AF
2748 uint64_t aSig;
2749 uint32_t zSig;
37d18660 2750 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2751
2752 aSig = extractFloat64Frac( a );
2753 aExp = extractFloat64Exp( a );
2754 aSign = extractFloat64Sign( a );
2755 if ( aExp == 0x7FF ) {
bcd4d9af 2756 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
2757 return packFloat32( aSign, 0xFF, 0 );
2758 }
2759 shift64RightJamming( aSig, 22, &aSig );
2760 zSig = aSig;
2761 if ( aExp || zSig ) {
2762 zSig |= 0x40000000;
2763 aExp -= 0x381;
2764 }
2765 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2766
2767}
2768
60011498
PB
2769
2770/*----------------------------------------------------------------------------
2771| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2772| half-precision floating-point value, returning the result. After being
2773| shifted into the proper positions, the three fields are simply added
2774| together to form the result. This means that any integer portion of `zSig'
2775| will be added into the exponent. Since a properly normalized significand
2776| will have an integer portion equal to 1, the `zExp' input should be 1 less
2777| than the desired result exponent whenever `zSig' is a complete, normalized
2778| significand.
2779*----------------------------------------------------------------------------*/
bb98fe42 2780static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
60011498 2781{
bb4d4bb3 2782 return make_float16(
bb98fe42 2783 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
2784}
2785
2786/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2787 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3
PM
2788
2789float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
60011498
PB
2790{
2791 flag aSign;
2792 int16 aExp;
bb98fe42 2793 uint32_t aSig;
60011498 2794
bb4d4bb3
PM
2795 aSign = extractFloat16Sign(a);
2796 aExp = extractFloat16Exp(a);
2797 aSig = extractFloat16Frac(a);
60011498
PB
2798
2799 if (aExp == 0x1f && ieee) {
2800 if (aSig) {
f591e1be 2801 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498
PB
2802 }
2803 return packFloat32(aSign, 0xff, aSig << 13);
2804 }
2805 if (aExp == 0) {
2806 int8 shiftCount;
2807
2808 if (aSig == 0) {
2809 return packFloat32(aSign, 0, 0);
2810 }
2811
2812 shiftCount = countLeadingZeros32( aSig ) - 21;
2813 aSig = aSig << shiftCount;
2814 aExp = -shiftCount;
2815 }
2816 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2817}
2818
bb4d4bb3 2819float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
60011498
PB
2820{
2821 flag aSign;
2822 int16 aExp;
bb98fe42
AF
2823 uint32_t aSig;
2824 uint32_t mask;
2825 uint32_t increment;
60011498 2826 int8 roundingMode;
37d18660 2827 a = float32_squash_input_denormal(a STATUS_VAR);
60011498
PB
2828
2829 aSig = extractFloat32Frac( a );
2830 aExp = extractFloat32Exp( a );
2831 aSign = extractFloat32Sign( a );
2832 if ( aExp == 0xFF ) {
2833 if (aSig) {
600e30d2
PM
2834 /* Input is a NaN */
2835 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2836 if (!ieee) {
2837 return packFloat16(aSign, 0, 0);
2838 }
2839 return r;
60011498 2840 }
600e30d2
PM
2841 /* Infinity */
2842 if (!ieee) {
2843 float_raise(float_flag_invalid STATUS_VAR);
2844 return packFloat16(aSign, 0x1f, 0x3ff);
2845 }
2846 return packFloat16(aSign, 0x1f, 0);
60011498 2847 }
600e30d2 2848 if (aExp == 0 && aSig == 0) {
60011498
PB
2849 return packFloat16(aSign, 0, 0);
2850 }
2851 /* Decimal point between bits 22 and 23. */
2852 aSig |= 0x00800000;
2853 aExp -= 0x7f;
2854 if (aExp < -14) {
600e30d2
PM
2855 mask = 0x00ffffff;
2856 if (aExp >= -24) {
2857 mask >>= 25 + aExp;
60011498
PB
2858 }
2859 } else {
2860 mask = 0x00001fff;
2861 }
2862 if (aSig & mask) {
2863 float_raise( float_flag_underflow STATUS_VAR );
2864 roundingMode = STATUS(float_rounding_mode);
2865 switch (roundingMode) {
2866 case float_round_nearest_even:
2867 increment = (mask + 1) >> 1;
2868 if ((aSig & mask) == increment) {
2869 increment = aSig & (increment << 1);
2870 }
2871 break;
2872 case float_round_up:
2873 increment = aSign ? 0 : mask;
2874 break;
2875 case float_round_down:
2876 increment = aSign ? mask : 0;
2877 break;
2878 default: /* round_to_zero */
2879 increment = 0;
2880 break;
2881 }
2882 aSig += increment;
2883 if (aSig >= 0x01000000) {
2884 aSig >>= 1;
2885 aExp++;
2886 }
2887 } else if (aExp < -14
2888 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2889 float_raise( float_flag_underflow STATUS_VAR);
2890 }
2891
2892 if (ieee) {
2893 if (aExp > 15) {
2894 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2895 return packFloat16(aSign, 0x1f, 0);
2896 }
2897 } else {
2898 if (aExp > 16) {
600e30d2 2899 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
60011498
PB
2900 return packFloat16(aSign, 0x1f, 0x3ff);
2901 }
2902 }
2903 if (aExp < -24) {
2904 return packFloat16(aSign, 0, 0);
2905 }
2906 if (aExp < -14) {
2907 aSig >>= -14 - aExp;
2908 aExp = -14;
2909 }
2910 return packFloat16(aSign, aExp + 14, aSig >> 13);
2911}
2912
158142c2
FB
2913/*----------------------------------------------------------------------------
2914| Returns the result of converting the double-precision floating-point value
2915| `a' to the extended double-precision floating-point format. The conversion
2916| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2917| Arithmetic.
2918*----------------------------------------------------------------------------*/
2919
2920floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2921{
2922 flag aSign;
2923 int16 aExp;
bb98fe42 2924 uint64_t aSig;
158142c2 2925
37d18660 2926 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2927 aSig = extractFloat64Frac( a );
2928 aExp = extractFloat64Exp( a );
2929 aSign = extractFloat64Sign( a );
2930 if ( aExp == 0x7FF ) {
bcd4d9af 2931 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
2932 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2933 }
2934 if ( aExp == 0 ) {
2935 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2936 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2937 }
2938 return
2939 packFloatx80(
2940 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2941
2942}
2943
158142c2
FB
2944/*----------------------------------------------------------------------------
2945| Returns the result of converting the double-precision floating-point value
2946| `a' to the quadruple-precision floating-point format. The conversion is
2947| performed according to the IEC/IEEE Standard for Binary Floating-Point
2948| Arithmetic.
2949*----------------------------------------------------------------------------*/
2950
2951float128 float64_to_float128( float64 a STATUS_PARAM )
2952{
2953 flag aSign;
2954 int16 aExp;
bb98fe42 2955 uint64_t aSig, zSig0, zSig1;
158142c2 2956
37d18660 2957 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2958 aSig = extractFloat64Frac( a );
2959 aExp = extractFloat64Exp( a );
2960 aSign = extractFloat64Sign( a );
2961 if ( aExp == 0x7FF ) {
bcd4d9af 2962 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
2963 return packFloat128( aSign, 0x7FFF, 0, 0 );
2964 }
2965 if ( aExp == 0 ) {
2966 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2967 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2968 --aExp;
2969 }
2970 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2971 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2972
2973}
2974
158142c2
FB
2975/*----------------------------------------------------------------------------
2976| Rounds the double-precision floating-point value `a' to an integer, and
2977| returns the result as a double-precision floating-point value. The
2978| operation is performed according to the IEC/IEEE Standard for Binary
2979| Floating-Point Arithmetic.
2980*----------------------------------------------------------------------------*/
2981
2982float64 float64_round_to_int( float64 a STATUS_PARAM )
2983{
2984 flag aSign;
2985 int16 aExp;
bb98fe42 2986 uint64_t lastBitMask, roundBitsMask;
158142c2 2987 int8 roundingMode;
bb98fe42 2988 uint64_t z;
37d18660 2989 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2990
2991 aExp = extractFloat64Exp( a );
2992 if ( 0x433 <= aExp ) {
2993 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2994 return propagateFloat64NaN( a, a STATUS_VAR );
2995 }
2996 return a;
2997 }
2998 if ( aExp < 0x3FF ) {
bb98fe42 2999 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
158142c2
FB
3000 STATUS(float_exception_flags) |= float_flag_inexact;
3001 aSign = extractFloat64Sign( a );
3002 switch ( STATUS(float_rounding_mode) ) {
3003 case float_round_nearest_even:
3004 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3005 return packFloat64( aSign, 0x3FF, 0 );
3006 }
3007 break;
3008 case float_round_down:
f090c9d4 3009 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3010 case float_round_up:
f090c9d4
PB
3011 return make_float64(
3012 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3013 }
3014 return packFloat64( aSign, 0, 0 );
3015 }
3016 lastBitMask = 1;
3017 lastBitMask <<= 0x433 - aExp;
3018 roundBitsMask = lastBitMask - 1;
f090c9d4 3019 z = float64_val(a);
158142c2
FB
3020 roundingMode = STATUS(float_rounding_mode);
3021 if ( roundingMode == float_round_nearest_even ) {
3022 z += lastBitMask>>1;
3023 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3024 }
3025 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 3026 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
3027 z += roundBitsMask;
3028 }
3029 }
3030 z &= ~ roundBitsMask;
f090c9d4
PB
3031 if ( z != float64_val(a) )
3032 STATUS(float_exception_flags) |= float_flag_inexact;
3033 return make_float64(z);
158142c2
FB
3034
3035}
3036
e6e5906b
PB
3037float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3038{
3039 int oldmode;
3040 float64 res;
3041 oldmode = STATUS(float_rounding_mode);
3042 STATUS(float_rounding_mode) = float_round_to_zero;
3043 res = float64_round_to_int(a STATUS_VAR);
3044 STATUS(float_rounding_mode) = oldmode;
3045 return res;
3046}
3047
158142c2
FB
3048/*----------------------------------------------------------------------------
3049| Returns the result of adding the absolute values of the double-precision
3050| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3051| before being returned. `zSign' is ignored if the result is a NaN.
3052| The addition is performed according to the IEC/IEEE Standard for Binary
3053| Floating-Point Arithmetic.
3054*----------------------------------------------------------------------------*/
3055
3056static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3057{
3058 int16 aExp, bExp, zExp;
bb98fe42 3059 uint64_t aSig, bSig, zSig;
158142c2
FB
3060 int16 expDiff;
3061
3062 aSig = extractFloat64Frac( a );
3063 aExp = extractFloat64Exp( a );
3064 bSig = extractFloat64Frac( b );
3065 bExp = extractFloat64Exp( b );
3066 expDiff = aExp - bExp;
3067 aSig <<= 9;
3068 bSig <<= 9;
3069 if ( 0 < expDiff ) {
3070 if ( aExp == 0x7FF ) {
3071 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3072 return a;
3073 }
3074 if ( bExp == 0 ) {
3075 --expDiff;
3076 }
3077 else {
3078 bSig |= LIT64( 0x2000000000000000 );
3079 }
3080 shift64RightJamming( bSig, expDiff, &bSig );
3081 zExp = aExp;
3082 }
3083 else if ( expDiff < 0 ) {
3084 if ( bExp == 0x7FF ) {
3085 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3086 return packFloat64( zSign, 0x7FF, 0 );
3087 }
3088 if ( aExp == 0 ) {
3089 ++expDiff;
3090 }
3091 else {
3092 aSig |= LIT64( 0x2000000000000000 );
3093 }
3094 shift64RightJamming( aSig, - expDiff, &aSig );
3095 zExp = bExp;
3096 }
3097 else {
3098 if ( aExp == 0x7FF ) {
3099 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3100 return a;
3101 }
fe76d976 3102 if ( aExp == 0 ) {
e6afc87f
PM
3103 if (STATUS(flush_to_zero)) {
3104 if (aSig | bSig) {
3105 float_raise(float_flag_output_denormal STATUS_VAR);
3106 }
3107 return packFloat64(zSign, 0, 0);
3108 }
fe76d976
PB
3109 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3110 }
158142c2
FB
3111 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3112 zExp = aExp;
3113 goto roundAndPack;
3114 }
3115 aSig |= LIT64( 0x2000000000000000 );
3116 zSig = ( aSig + bSig )<<1;
3117 --zExp;
bb98fe42 3118 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3119 zSig = aSig + bSig;
3120 ++zExp;
3121 }
3122 roundAndPack:
3123 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3124
3125}
3126
3127/*----------------------------------------------------------------------------
3128| Returns the result of subtracting the absolute values of the double-
3129| precision floating-point values `a' and `b'. If `zSign' is 1, the
3130| difference is negated before being returned. `zSign' is ignored if the
3131| result is a NaN. The subtraction is performed according to the IEC/IEEE
3132| Standard for Binary Floating-Point Arithmetic.
3133*----------------------------------------------------------------------------*/
3134
3135static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3136{
3137 int16 aExp, bExp, zExp;
bb98fe42 3138 uint64_t aSig, bSig, zSig;
158142c2
FB
3139 int16 expDiff;
3140
3141 aSig = extractFloat64Frac( a );
3142 aExp = extractFloat64Exp( a );
3143 bSig = extractFloat64Frac( b );
3144 bExp = extractFloat64Exp( b );
3145 expDiff = aExp - bExp;
3146 aSig <<= 10;
3147 bSig <<= 10;
3148 if ( 0 < expDiff ) goto aExpBigger;
3149 if ( expDiff < 0 ) goto bExpBigger;
3150 if ( aExp == 0x7FF ) {
3151 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3152 float_raise( float_flag_invalid STATUS_VAR);
3153 return float64_default_nan;
3154 }
3155 if ( aExp == 0 ) {
3156 aExp = 1;
3157 bExp = 1;
3158 }
3159 if ( bSig < aSig ) goto aBigger;
3160 if ( aSig < bSig ) goto bBigger;
3161 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3162 bExpBigger:
3163 if ( bExp == 0x7FF ) {
3164 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3165 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3166 }
3167 if ( aExp == 0 ) {
3168 ++expDiff;
3169 }
3170 else {
3171 aSig |= LIT64( 0x4000000000000000 );
3172 }
3173 shift64RightJamming( aSig, - expDiff, &aSig );
3174 bSig |= LIT64( 0x4000000000000000 );
3175 bBigger:
3176 zSig = bSig - aSig;
3177 zExp = bExp;
3178 zSign ^= 1;
3179 goto normalizeRoundAndPack;
3180 aExpBigger:
3181 if ( aExp == 0x7FF ) {
3182 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3183 return a;
3184 }
3185 if ( bExp == 0 ) {
3186 --expDiff;
3187 }
3188 else {
3189 bSig |= LIT64( 0x4000000000000000 );
3190 }
3191 shift64RightJamming( bSig, expDiff, &bSig );
3192 aSig |= LIT64( 0x4000000000000000 );
3193 aBigger:
3194 zSig = aSig - bSig;
3195 zExp = aExp;
3196 normalizeRoundAndPack:
3197 --zExp;
3198 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3199
3200}
3201
3202/*----------------------------------------------------------------------------
3203| Returns the result of adding the double-precision floating-point values `a'
3204| and `b'. The operation is performed according to the IEC/IEEE Standard for
3205| Binary Floating-Point Arithmetic.
3206*----------------------------------------------------------------------------*/
3207
3208float64 float64_add( float64 a, float64 b STATUS_PARAM )
3209{
3210 flag aSign, bSign;
37d18660
PM
3211 a = float64_squash_input_denormal(a STATUS_VAR);
3212 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3213
3214 aSign = extractFloat64Sign( a );
3215 bSign = extractFloat64Sign( b );
3216 if ( aSign == bSign ) {
3217 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3218 }
3219 else {
3220 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3221 }
3222
3223}
3224
3225/*----------------------------------------------------------------------------
3226| Returns the result of subtracting the double-precision floating-point values
3227| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3228| for Binary Floating-Point Arithmetic.
3229*----------------------------------------------------------------------------*/
3230
3231float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3232{
3233 flag aSign, bSign;
37d18660
PM
3234 a = float64_squash_input_denormal(a STATUS_VAR);
3235 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3236
3237 aSign = extractFloat64Sign( a );
3238 bSign = extractFloat64Sign( b );
3239 if ( aSign == bSign ) {
3240 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3241 }
3242 else {
3243 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3244 }
3245
3246}
3247
3248/*----------------------------------------------------------------------------
3249| Returns the result of multiplying the double-precision floating-point values
3250| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3251| for Binary Floating-Point Arithmetic.
3252*----------------------------------------------------------------------------*/
3253
3254float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3255{
3256 flag aSign, bSign, zSign;
3257 int16 aExp, bExp, zExp;
bb98fe42 3258 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3259
37d18660
PM
3260 a = float64_squash_input_denormal(a STATUS_VAR);
3261 b = float64_squash_input_denormal(b STATUS_VAR);
3262
158142c2
FB
3263 aSig = extractFloat64Frac( a );
3264 aExp = extractFloat64Exp( a );
3265 aSign = extractFloat64Sign( a );
3266 bSig = extractFloat64Frac( b );
3267 bExp = extractFloat64Exp( b );
3268 bSign = extractFloat64Sign( b );
3269 zSign = aSign ^ bSign;
3270 if ( aExp == 0x7FF ) {
3271 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3272 return propagateFloat64NaN( a, b STATUS_VAR );
3273 }
3274 if ( ( bExp | bSig ) == 0 ) {
3275 float_raise( float_flag_invalid STATUS_VAR);
3276 return float64_default_nan;
3277 }
3278 return packFloat64( zSign, 0x7FF, 0 );
3279 }
3280 if ( bExp == 0x7FF ) {
3281 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3282 if ( ( aExp | aSig ) == 0 ) {
3283 float_raise( float_flag_invalid STATUS_VAR);
3284 return float64_default_nan;
3285 }
3286 return packFloat64( zSign, 0x7FF, 0 );
3287 }
3288 if ( aExp == 0 ) {
3289 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3290 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3291 }
3292 if ( bExp == 0 ) {
3293 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3294 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3295 }
3296 zExp = aExp + bExp - 0x3FF;
3297 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3298 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3299 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3300 zSig0 |= ( zSig1 != 0 );
bb98fe42 3301 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
3302 zSig0 <<= 1;
3303 --zExp;
3304 }
3305 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3306
3307}
3308
3309/*----------------------------------------------------------------------------
3310| Returns the result of dividing the double-precision floating-point value `a'
3311| by the corresponding value `b'. The operation is performed according to
3312| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3313*----------------------------------------------------------------------------*/
3314
3315float64 float64_div( float64 a, float64 b STATUS_PARAM )
3316{
3317 flag aSign, bSign, zSign;
3318 int16 aExp, bExp, zExp;
bb98fe42
AF
3319 uint64_t aSig, bSig, zSig;
3320 uint64_t rem0, rem1;
3321 uint64_t term0, term1;
37d18660
PM
3322 a = float64_squash_input_denormal(a STATUS_VAR);
3323 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3324
3325 aSig = extractFloat64Frac( a );
3326 aExp = extractFloat64Exp( a );
3327 aSign = extractFloat64Sign( a );
3328 bSig = extractFloat64Frac( b );
3329 bExp = extractFloat64Exp( b );
3330 bSign = extractFloat64Sign( b );
3331 zSign = aSign ^ bSign;
3332 if ( aExp == 0x7FF ) {
3333 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3334 if ( bExp == 0x7FF ) {
3335 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3336 float_raise( float_flag_invalid STATUS_VAR);
3337 return float64_default_nan;
3338 }
3339 return packFloat64( zSign, 0x7FF, 0 );
3340 }
3341 if ( bExp == 0x7FF ) {
3342 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3343 return packFloat64( zSign, 0, 0 );
3344 }
3345 if ( bExp == 0 ) {
3346 if ( bSig == 0 ) {
3347 if ( ( aExp | aSig ) == 0 ) {
3348 float_raise( float_flag_invalid STATUS_VAR);
3349 return float64_default_nan;
3350 }
3351 float_raise( float_flag_divbyzero STATUS_VAR);
3352 return packFloat64( zSign, 0x7FF, 0 );
3353 }
3354 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3355 }
3356 if ( aExp == 0 ) {
3357 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3358 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3359 }
3360 zExp = aExp - bExp + 0x3FD;
3361 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3362 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3363 if ( bSig <= ( aSig + aSig ) ) {
3364 aSig >>= 1;
3365 ++zExp;
3366 }
3367 zSig = estimateDiv128To64( aSig, 0, bSig );
3368 if ( ( zSig & 0x1FF ) <= 2 ) {
3369 mul64To128( bSig, zSig, &term0, &term1 );
3370 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3371 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3372 --zSig;
3373 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3374 }
3375 zSig |= ( rem1 != 0 );
3376 }
3377 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3378
3379}
3380
3381/*----------------------------------------------------------------------------
3382| Returns the remainder of the double-precision floating-point value `a'
3383| with respect to the corresponding value `b'. The operation is performed
3384| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3385*----------------------------------------------------------------------------*/
3386
3387float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3388{
ed086f3d 3389 flag aSign, zSign;
158142c2 3390 int16 aExp, bExp, expDiff;
bb98fe42
AF
3391 uint64_t aSig, bSig;
3392 uint64_t q, alternateASig;
3393 int64_t sigMean;
158142c2 3394
37d18660
PM
3395 a = float64_squash_input_denormal(a STATUS_VAR);
3396 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3397 aSig = extractFloat64Frac( a );
3398 aExp = extractFloat64Exp( a );
3399 aSign = extractFloat64Sign( a );
3400 bSig = extractFloat64Frac( b );
3401 bExp = extractFloat64Exp( b );
158142c2
FB
3402 if ( aExp == 0x7FF ) {
3403 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3404 return propagateFloat64NaN( a, b STATUS_VAR );
3405 }
3406 float_raise( float_flag_invalid STATUS_VAR);
3407 return float64_default_nan;
3408 }
3409 if ( bExp == 0x7FF ) {
3410 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3411 return a;
3412 }
3413 if ( bExp == 0 ) {
3414 if ( bSig == 0 ) {
3415 float_raise( float_flag_invalid STATUS_VAR);
3416 return float64_default_nan;
3417 }
3418 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3419 }
3420 if ( aExp == 0 ) {
3421 if ( aSig == 0 ) return a;
3422 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3423 }
3424 expDiff = aExp - bExp;
3425 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3426 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3427 if ( expDiff < 0 ) {
3428 if ( expDiff < -1 ) return a;
3429 aSig >>= 1;
3430 }
3431 q = ( bSig <= aSig );
3432 if ( q ) aSig -= bSig;
3433 expDiff -= 64;
3434 while ( 0 < expDiff ) {
3435 q = estimateDiv128To64( aSig, 0, bSig );
3436 q = ( 2 < q ) ? q - 2 : 0;
3437 aSig = - ( ( bSig>>2 ) * q );
3438 expDiff -= 62;
3439 }
3440 expDiff += 64;
3441 if ( 0 < expDiff ) {
3442 q = estimateDiv128To64( aSig, 0, bSig );
3443 q = ( 2 < q ) ? q - 2 : 0;
3444 q >>= 64 - expDiff;
3445 bSig >>= 2;
3446 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3447 }
3448 else {
3449 aSig >>= 2;
3450 bSig >>= 2;
3451 }
3452 do {
3453 alternateASig = aSig;
3454 ++q;
3455 aSig -= bSig;
bb98fe42 3456 } while ( 0 <= (int64_t) aSig );
158142c2
FB
3457 sigMean = aSig + alternateASig;
3458 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3459 aSig = alternateASig;
3460 }
bb98fe42 3461 zSign = ( (int64_t) aSig < 0 );
158142c2
FB
3462 if ( zSign ) aSig = - aSig;
3463 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3464
3465}
3466
3467/*----------------------------------------------------------------------------
3468| Returns the square root of the double-precision floating-point value `a'.
3469| The operation is performed according to the IEC/IEEE Standard for Binary
3470| Floating-Point Arithmetic.
3471*----------------------------------------------------------------------------*/
3472
3473float64 float64_sqrt( float64 a STATUS_PARAM )
3474{
3475 flag aSign;
3476 int16 aExp, zExp;
bb98fe42
AF
3477 uint64_t aSig, zSig, doubleZSig;
3478 uint64_t rem0, rem1, term0, term1;
37d18660 3479 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3480
3481 aSig = extractFloat64Frac( a );
3482 aExp = extractFloat64Exp( a );
3483 aSign = extractFloat64Sign( a );
3484 if ( aExp == 0x7FF ) {
3485 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3486 if ( ! aSign ) return a;
3487 float_raise( float_flag_invalid STATUS_VAR);
3488 return float64_default_nan;
3489 }
3490 if ( aSign ) {
3491 if ( ( aExp | aSig ) == 0 ) return a;
3492 float_raise( float_flag_invalid STATUS_VAR);
3493 return float64_default_nan;
3494 }
3495 if ( aExp == 0 ) {
f090c9d4 3496 if ( aSig == 0 ) return float64_zero;
158142c2
FB
3497 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3498 }
3499 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3500 aSig |= LIT64( 0x0010000000000000 );
3501 zSig = estimateSqrt32( aExp, aSig>>21 );
3502 aSig <<= 9 - ( aExp & 1 );
3503 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3504 if ( ( zSig & 0x1FF ) <= 5 ) {
3505 doubleZSig = zSig<<1;
3506 mul64To128( zSig, zSig, &term0, &term1 );
3507 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3508 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3509 --zSig;
3510 doubleZSig -= 2;
3511 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3512 }
3513 zSig |= ( ( rem0 | rem1 ) != 0 );
3514 }
3515 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3516
3517}
3518
374dfc33
AJ
3519/*----------------------------------------------------------------------------
3520| Returns the binary log of the double-precision floating-point value `a'.
3521| The operation is performed according to the IEC/IEEE Standard for Binary
3522| Floating-Point Arithmetic.
3523*----------------------------------------------------------------------------*/
3524float64 float64_log2( float64 a STATUS_PARAM )
3525{
3526 flag aSign, zSign;
3527 int16 aExp;
bb98fe42 3528 uint64_t aSig, aSig0, aSig1, zSig, i;
37d18660 3529 a = float64_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
3530
3531 aSig = extractFloat64Frac( a );
3532 aExp = extractFloat64Exp( a );
3533 aSign = extractFloat64Sign( a );
3534
3535 if ( aExp == 0 ) {
3536 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3537 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3538 }
3539 if ( aSign ) {
3540 float_raise( float_flag_invalid STATUS_VAR);
3541 return float64_default_nan;
3542 }
3543 if ( aExp == 0x7FF ) {
3544 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3545 return a;
3546 }
3547
3548 aExp -= 0x3FF;
3549 aSig |= LIT64( 0x0010000000000000 );
3550 zSign = aExp < 0;
bb98fe42 3551 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
3552 for (i = 1LL << 51; i > 0; i >>= 1) {
3553 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3554 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3555 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3556 aSig >>= 1;
3557 zSig |= i;
3558 }
3559 }
3560
3561 if ( zSign )
3562 zSig = -zSig;
3563 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3564}
3565
158142c2
FB
3566/*----------------------------------------------------------------------------
3567| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
3568| corresponding value `b', and 0 otherwise. The invalid exception is raised
3569| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
3570| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3571*----------------------------------------------------------------------------*/
3572
b689362d 3573int float64_eq( float64 a, float64 b STATUS_PARAM )
158142c2 3574{
bb98fe42 3575 uint64_t av, bv;
37d18660
PM
3576 a = float64_squash_input_denormal(a STATUS_VAR);
3577 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3578
3579 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3580 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3581 ) {
b689362d 3582 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
3583 return 0;
3584 }
f090c9d4 3585 av = float64_val(a);
a1b91bb4 3586 bv = float64_val(b);
bb98fe42 3587 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
3588
3589}
3590
3591/*----------------------------------------------------------------------------
3592| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
3593| equal to the corresponding value `b', and 0 otherwise. The invalid
3594| exception is raised if either operand is a NaN. The comparison is performed
3595| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3596*----------------------------------------------------------------------------*/
3597
750afe93 3598int float64_le( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3599{
3600 flag aSign, bSign;
bb98fe42 3601 uint64_t av, bv;
37d18660
PM
3602 a = float64_squash_input_denormal(a STATUS_VAR);
3603 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3604
3605 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3606 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3607 ) {
3608 float_raise( float_flag_invalid STATUS_VAR);
3609 return 0;
3610 }
3611 aSign = extractFloat64Sign( a );
3612 bSign = extractFloat64Sign( b );
f090c9d4 3613 av = float64_val(a);
a1b91bb4 3614 bv = float64_val(b);
bb98fe42 3615 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3616 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3617
3618}
3619
3620/*----------------------------------------------------------------------------
3621| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
3622| the corresponding value `b', and 0 otherwise. The invalid exception is
3623| raised if either operand is a NaN. The comparison is performed according
3624| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3625*----------------------------------------------------------------------------*/
3626
750afe93 3627int float64_lt( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3628{
3629 flag aSign, bSign;
bb98fe42 3630 uint64_t av, bv;
158142c2 3631
37d18660
PM
3632 a = float64_squash_input_denormal(a STATUS_VAR);
3633 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3634 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3635 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3636 ) {
3637 float_raise( float_flag_invalid STATUS_VAR);
3638 return 0;
3639 }
3640 aSign = extractFloat64Sign( a );
3641 bSign = extractFloat64Sign( b );
f090c9d4 3642 av = float64_val(a);
a1b91bb4 3643 bv = float64_val(b);
bb98fe42 3644 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3645 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3646
3647}
3648
67b7861d
AJ
3649/*----------------------------------------------------------------------------
3650| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
3651| be compared, and 0 otherwise. The invalid exception is raised if either
3652| operand is a NaN. The comparison is performed according to the IEC/IEEE
3653| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
3654*----------------------------------------------------------------------------*/
3655
3656int float64_unordered( float64 a, float64 b STATUS_PARAM )
3657{
3658 a = float64_squash_input_denormal(a STATUS_VAR);
3659 b = float64_squash_input_denormal(b STATUS_VAR);
3660
3661 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3662 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3663 ) {
3664 float_raise( float_flag_invalid STATUS_VAR);
3665 return 1;
3666 }
3667 return 0;
3668}
3669
158142c2
FB
3670/*----------------------------------------------------------------------------
3671| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
3672| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3673| exception.The comparison is performed according to the IEC/IEEE Standard
3674| for Binary Floating-Point Arithmetic.
158142c2
FB
3675*----------------------------------------------------------------------------*/
3676
b689362d 3677int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
158142c2 3678{
bb98fe42 3679 uint64_t av, bv;
37d18660
PM
3680 a = float64_squash_input_denormal(a STATUS_VAR);
3681 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3682
3683 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3684 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3685 ) {
b689362d
AJ
3686 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3687 float_raise( float_flag_invalid STATUS_VAR);
3688 }
158142c2
FB
3689 return 0;
3690 }
f090c9d4 3691 av = float64_val(a);
a1b91bb4 3692 bv = float64_val(b);
bb98fe42 3693 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
3694
3695}
3696
3697/*----------------------------------------------------------------------------
3698| Returns 1 if the double-precision floating-point value `a' is less than or
3699| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3700| cause an exception. Otherwise, the comparison is performed according to the
3701| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3702*----------------------------------------------------------------------------*/
3703
750afe93 3704int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3705{
3706 flag aSign, bSign;
bb98fe42 3707 uint64_t av, bv;
37d18660
PM
3708 a = float64_squash_input_denormal(a STATUS_VAR);
3709 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3710
3711 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3712 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3713 ) {
3714 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3715 float_raise( float_flag_invalid STATUS_VAR);
3716 }
3717 return 0;
3718 }
3719 aSign = extractFloat64Sign( a );
3720 bSign = extractFloat64Sign( b );
f090c9d4 3721 av = float64_val(a);
a1b91bb4 3722 bv = float64_val(b);
bb98fe42 3723 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3724 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3725
3726}
3727
3728/*----------------------------------------------------------------------------
3729| Returns 1 if the double-precision floating-point value `a' is less than
3730| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3731| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3732| Standard for Binary Floating-Point Arithmetic.
3733*----------------------------------------------------------------------------*/
3734
750afe93 3735int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
3736{
3737 flag aSign, bSign;
bb98fe42 3738 uint64_t av, bv;
37d18660
PM
3739 a = float64_squash_input_denormal(a STATUS_VAR);
3740 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3741
3742 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3743 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3744 ) {
3745 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3746 float_raise( float_flag_invalid STATUS_VAR);
3747 }
3748 return 0;
3749 }
3750 aSign = extractFloat64Sign( a );
3751 bSign = extractFloat64Sign( b );
f090c9d4 3752 av = float64_val(a);
a1b91bb4 3753 bv = float64_val(b);
bb98fe42 3754 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3755 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3756
3757}
3758
67b7861d
AJ
3759/*----------------------------------------------------------------------------
3760| Returns 1 if the double-precision floating-point values `a' and `b' cannot
3761| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3762| comparison is performed according to the IEC/IEEE Standard for Binary
3763| Floating-Point Arithmetic.
3764*----------------------------------------------------------------------------*/
3765
3766int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3767{
3768 a = float64_squash_input_denormal(a STATUS_VAR);
3769 b = float64_squash_input_denormal(b STATUS_VAR);
3770
3771 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3772 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3773 ) {
3774 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3775 float_raise( float_flag_invalid STATUS_VAR);
3776 }
3777 return 1;
3778 }
3779 return 0;
3780}
3781
158142c2
FB
3782/*----------------------------------------------------------------------------
3783| Returns the result of converting the extended double-precision floating-
3784| point value `a' to the 32-bit two's complement integer format. The
3785| conversion is performed according to the IEC/IEEE Standard for Binary
3786| Floating-Point Arithmetic---which means in particular that the conversion
3787| is rounded according to the current rounding mode. If `a' is a NaN, the
3788| largest positive integer is returned. Otherwise, if the conversion
3789| overflows, the largest integer with the same sign as `a' is returned.
3790*----------------------------------------------------------------------------*/
3791
3792int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3793{
3794 flag aSign;
3795 int32 aExp, shiftCount;
bb98fe42 3796 uint64_t aSig;
158142c2
FB
3797
3798 aSig = extractFloatx80Frac( a );
3799 aExp = extractFloatx80Exp( a );
3800 aSign = extractFloatx80Sign( a );
bb98fe42 3801 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
3802 shiftCount = 0x4037 - aExp;
3803 if ( shiftCount <= 0 ) shiftCount = 1;
3804 shift64RightJamming( aSig, shiftCount, &aSig );
3805 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3806
3807}
3808
3809/*----------------------------------------------------------------------------
3810| Returns the result of converting the extended double-precision floating-
3811| point value `a' to the 32-bit two's complement integer format. The
3812| conversion is performed according to the IEC/IEEE Standard for Binary
3813| Floating-Point Arithmetic, except that the conversion is always rounded
3814| toward zero. If `a' is a NaN, the largest positive integer is returned.
3815| Otherwise, if the conversion overflows, the largest integer with the same
3816| sign as `a' is returned.
3817*----------------------------------------------------------------------------*/
3818
3819int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3820{
3821 flag aSign;
3822 int32 aExp, shiftCount;
bb98fe42 3823 uint64_t aSig, savedASig;
158142c2
FB
3824 int32 z;
3825
3826 aSig = extractFloatx80Frac( a );
3827 aExp = extractFloatx80Exp( a );
3828 aSign = extractFloatx80Sign( a );
3829 if ( 0x401E < aExp ) {
bb98fe42 3830 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
3831 goto invalid;
3832 }
3833 else if ( aExp < 0x3FFF ) {
3834 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3835 return 0;
3836 }
3837 shiftCount = 0x403E - aExp;
3838 savedASig = aSig;
3839 aSig >>= shiftCount;
3840 z = aSig;
3841 if ( aSign ) z = - z;
3842 if ( ( z < 0 ) ^ aSign ) {
3843 invalid:
3844 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 3845 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
3846 }
3847 if ( ( aSig<<shiftCount ) != savedASig ) {
3848 STATUS(float_exception_flags) |= float_flag_inexact;
3849 }
3850 return z;
3851
3852}
3853
3854/*----------------------------------------------------------------------------
3855| Returns the result of converting the extended double-precision floating-
3856| point value `a' to the 64-bit two's complement integer format. The
3857| conversion is performed according to the IEC/IEEE Standard for Binary
3858| Floating-Point Arithmetic---which means in particular that the conversion
3859| is rounded according to the current rounding mode. If `a' is a NaN,
3860| the largest positive integer is returned. Otherwise, if the conversion
3861| overflows, the largest integer with the same sign as `a' is returned.
3862*----------------------------------------------------------------------------*/
3863
3864int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3865{
3866 flag aSign;
3867 int32 aExp, shiftCount;
bb98fe42 3868 uint64_t aSig, aSigExtra;
158142c2
FB
3869
3870 aSig = extractFloatx80Frac( a );
3871 aExp = extractFloatx80Exp( a );
3872 aSign = extractFloatx80Sign( a );
3873 shiftCount = 0x403E - aExp;
3874 if ( shiftCount <= 0 ) {
3875 if ( shiftCount ) {
3876 float_raise( float_flag_invalid STATUS_VAR);
3877 if ( ! aSign
3878 || ( ( aExp == 0x7FFF )
3879 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3880 ) {
3881 return LIT64( 0x7FFFFFFFFFFFFFFF );
3882 }
bb98fe42 3883 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3884 }
3885 aSigExtra = 0;
3886 }
3887 else {
3888 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3889 }
3890 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3891
3892}
3893
3894/*----------------------------------------------------------------------------
3895| Returns the result of converting the extended double-precision floating-
3896| point value `a' to the 64-bit two's complement integer format. The
3897| conversion is performed according to the IEC/IEEE Standard for Binary
3898| Floating-Point Arithmetic, except that the conversion is always rounded
3899| toward zero. If `a' is a NaN, the largest positive integer is returned.
3900| Otherwise, if the conversion overflows, the largest integer with the same
3901| sign as `a' is returned.
3902*----------------------------------------------------------------------------*/
3903
3904int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3905{
3906 flag aSign;
3907 int32 aExp, shiftCount;
bb98fe42 3908 uint64_t aSig;
158142c2
FB
3909 int64 z;
3910
3911 aSig = extractFloatx80Frac( a );
3912 aExp = extractFloatx80Exp( a );
3913 aSign = extractFloatx80Sign( a );
3914 shiftCount = aExp - 0x403E;
3915 if ( 0 <= shiftCount ) {
3916 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3917 if ( ( a.high != 0xC03E ) || aSig ) {
3918 float_raise( float_flag_invalid STATUS_VAR);
3919 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3920 return LIT64( 0x7FFFFFFFFFFFFFFF );
3921 }
3922 }
bb98fe42 3923 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3924 }
3925 else if ( aExp < 0x3FFF ) {
3926 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3927 return 0;
3928 }
3929 z = aSig>>( - shiftCount );
bb98fe42 3930 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
3931 STATUS(float_exception_flags) |= float_flag_inexact;
3932 }
3933 if ( aSign ) z = - z;
3934 return z;
3935
3936}
3937
3938/*----------------------------------------------------------------------------
3939| Returns the result of converting the extended double-precision floating-
3940| point value `a' to the single-precision floating-point format. The
3941| conversion is performed according to the IEC/IEEE Standard for Binary
3942| Floating-Point Arithmetic.
3943*----------------------------------------------------------------------------*/
3944
3945float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3946{
3947 flag aSign;
3948 int32 aExp;
bb98fe42 3949 uint64_t aSig;
158142c2
FB
3950
3951 aSig = extractFloatx80Frac( a );
3952 aExp = extractFloatx80Exp( a );
3953 aSign = extractFloatx80Sign( a );
3954 if ( aExp == 0x7FFF ) {
bb98fe42 3955 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 3956 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3957 }
3958 return packFloat32( aSign, 0xFF, 0 );
3959 }
3960 shift64RightJamming( aSig, 33, &aSig );
3961 if ( aExp || aSig ) aExp -= 0x3F81;
3962 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3963
3964}
3965
3966/*----------------------------------------------------------------------------
3967| Returns the result of converting the extended double-precision floating-
3968| point value `a' to the double-precision floating-point format. The
3969| conversion is performed according to the IEC/IEEE Standard for Binary
3970| Floating-Point Arithmetic.
3971*----------------------------------------------------------------------------*/
3972
3973float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3974{
3975 flag aSign;
3976 int32 aExp;
bb98fe42 3977 uint64_t aSig, zSig;
158142c2
FB
3978
3979 aSig = extractFloatx80Frac( a );
3980 aExp = extractFloatx80Exp( a );
3981 aSign = extractFloatx80Sign( a );
3982 if ( aExp == 0x7FFF ) {
bb98fe42 3983 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 3984 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3985 }
3986 return packFloat64( aSign, 0x7FF, 0 );
3987 }
3988 shift64RightJamming( aSig, 1, &zSig );
3989 if ( aExp || aSig ) aExp -= 0x3C01;
3990 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3991
3992}
3993
158142c2
FB
3994/*----------------------------------------------------------------------------
3995| Returns the result of converting the extended double-precision floating-
3996| point value `a' to the quadruple-precision floating-point format. The
3997| conversion is performed according to the IEC/IEEE Standard for Binary
3998| Floating-Point Arithmetic.
3999*----------------------------------------------------------------------------*/
4000
4001float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4002{
4003 flag aSign;
4004 int16 aExp;
bb98fe42 4005 uint64_t aSig, zSig0, zSig1;
158142c2
FB
4006
4007 aSig = extractFloatx80Frac( a );
4008 aExp = extractFloatx80Exp( a );
4009 aSign = extractFloatx80Sign( a );
bb98fe42 4010 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4011 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4012 }
4013 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4014 return packFloat128( aSign, aExp, zSig0, zSig1 );
4015
4016}
4017
158142c2
FB
4018/*----------------------------------------------------------------------------
4019| Rounds the extended double-precision floating-point value `a' to an integer,
4020| and returns the result as an extended quadruple-precision floating-point
4021| value. The operation is performed according to the IEC/IEEE Standard for
4022| Binary Floating-Point Arithmetic.
4023*----------------------------------------------------------------------------*/
4024
4025floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4026{
4027 flag aSign;
4028 int32 aExp;
bb98fe42 4029 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4030 int8 roundingMode;
4031 floatx80 z;
4032
4033 aExp = extractFloatx80Exp( a );
4034 if ( 0x403E <= aExp ) {
bb98fe42 4035 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
158142c2
FB
4036 return propagateFloatx80NaN( a, a STATUS_VAR );
4037 }
4038 return a;
4039 }
4040 if ( aExp < 0x3FFF ) {
4041 if ( ( aExp == 0 )
bb98fe42 4042 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4043 return a;
4044 }
4045 STATUS(float_exception_flags) |= float_flag_inexact;
4046 aSign = extractFloatx80Sign( a );
4047 switch ( STATUS(float_rounding_mode) ) {
4048 case float_round_nearest_even:
bb98fe42 4049 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4050 ) {
4051 return
4052 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4053 }
4054 break;
4055 case float_round_down:
4056 return
4057 aSign ?
4058 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4059 : packFloatx80( 0, 0, 0 );
4060 case float_round_up:
4061 return
4062 aSign ? packFloatx80( 1, 0, 0 )
4063 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4064 }
4065 return packFloatx80( aSign, 0, 0 );
4066 }
4067 lastBitMask = 1;
4068 lastBitMask <<= 0x403E - aExp;
4069 roundBitsMask = lastBitMask - 1;
4070 z = a;
4071 roundingMode = STATUS(float_rounding_mode);
4072 if ( roundingMode == float_round_nearest_even ) {
4073 z.low += lastBitMask>>1;
4074 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4075 }
4076 else if ( roundingMode != float_round_to_zero ) {
4077 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4078 z.low += roundBitsMask;
4079 }
4080 }
4081 z.low &= ~ roundBitsMask;
4082 if ( z.low == 0 ) {
4083 ++z.high;
4084 z.low = LIT64( 0x8000000000000000 );
4085 }
4086 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4087 return z;
4088
4089}
4090
4091/*----------------------------------------------------------------------------
4092| Returns the result of adding the absolute values of the extended double-
4093| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4094| negated before being returned. `zSign' is ignored if the result is a NaN.
4095| The addition is performed according to the IEC/IEEE Standard for Binary
4096| Floating-Point Arithmetic.
4097*----------------------------------------------------------------------------*/
4098
4099static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4100{
4101 int32 aExp, bExp, zExp;
bb98fe42 4102 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4103 int32 expDiff;
4104
4105 aSig = extractFloatx80Frac( a );
4106 aExp = extractFloatx80Exp( a );
4107 bSig = extractFloatx80Frac( b );
4108 bExp = extractFloatx80Exp( b );
4109 expDiff = aExp - bExp;
4110 if ( 0 < expDiff ) {
4111 if ( aExp == 0x7FFF ) {
bb98fe42 4112 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4113 return a;
4114 }
4115 if ( bExp == 0 ) --expDiff;
4116 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4117 zExp = aExp;
4118 }
4119 else if ( expDiff < 0 ) {
4120 if ( bExp == 0x7FFF ) {
bb98fe42 4121 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4122 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4123 }
4124 if ( aExp == 0 ) ++expDiff;
4125 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4126 zExp = bExp;
4127 }
4128 else {
4129 if ( aExp == 0x7FFF ) {
bb98fe42 4130 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4131 return propagateFloatx80NaN( a, b STATUS_VAR );
4132 }
4133 return a;
4134 }
4135 zSig1 = 0;
4136 zSig0 = aSig + bSig;
4137 if ( aExp == 0 ) {
4138 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4139 goto roundAndPack;
4140 }
4141 zExp = aExp;
4142 goto shiftRight1;
4143 }
4144 zSig0 = aSig + bSig;
bb98fe42 4145 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
4146 shiftRight1:
4147 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4148 zSig0 |= LIT64( 0x8000000000000000 );
4149 ++zExp;
4150 roundAndPack:
4151 return
4152 roundAndPackFloatx80(
4153 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4154
4155}
4156
4157/*----------------------------------------------------------------------------
4158| Returns the result of subtracting the absolute values of the extended
4159| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4160| difference is negated before being returned. `zSign' is ignored if the
4161| result is a NaN. The subtraction is performed according to the IEC/IEEE
4162| Standard for Binary Floating-Point Arithmetic.
4163*----------------------------------------------------------------------------*/
4164
4165static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4166{
4167 int32 aExp, bExp, zExp;
bb98fe42 4168 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4169 int32 expDiff;
4170 floatx80 z;
4171
4172 aSig = extractFloatx80Frac( a );
4173 aExp = extractFloatx80Exp( a );
4174 bSig = extractFloatx80Frac( b );
4175 bExp = extractFloatx80Exp( b );
4176 expDiff = aExp - bExp;
4177 if ( 0 < expDiff ) goto aExpBigger;
4178 if ( expDiff < 0 ) goto bExpBigger;
4179 if ( aExp == 0x7FFF ) {
bb98fe42 4180 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4181 return propagateFloatx80NaN( a, b STATUS_VAR );
4182 }
4183 float_raise( float_flag_invalid STATUS_VAR);
4184 z.low = floatx80_default_nan_low;
4185 z.high = floatx80_default_nan_high;
4186 return z;
4187 }
4188 if ( aExp == 0 ) {
4189 aExp = 1;
4190 bExp = 1;
4191 }
4192 zSig1 = 0;
4193 if ( bSig < aSig ) goto aBigger;
4194 if ( aSig < bSig ) goto bBigger;
4195 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4196 bExpBigger:
4197 if ( bExp == 0x7FFF ) {
bb98fe42 4198 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4199 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4200 }
4201 if ( aExp == 0 ) ++expDiff;
4202 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4203 bBigger:
4204 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4205 zExp = bExp;
4206 zSign ^= 1;
4207 goto normalizeRoundAndPack;
4208 aExpBigger:
4209 if ( aExp == 0x7FFF ) {
bb98fe42 4210 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4211 return a;
4212 }
4213 if ( bExp == 0 ) --expDiff;
4214 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4215 aBigger:
4216 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4217 zExp = aExp;
4218 normalizeRoundAndPack:
4219 return
4220 normalizeRoundAndPackFloatx80(
4221 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4222
4223}
4224
4225/*----------------------------------------------------------------------------
4226| Returns the result of adding the extended double-precision floating-point
4227| values `a' and `b'. The operation is performed according to the IEC/IEEE
4228| Standard for Binary Floating-Point Arithmetic.
4229*----------------------------------------------------------------------------*/
4230
4231floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4232{
4233 flag aSign, bSign;
4234
4235 aSign = extractFloatx80Sign( a );
4236 bSign = extractFloatx80Sign( b );
4237 if ( aSign == bSign ) {
4238 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4239 }
4240 else {
4241 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4242 }
4243
4244}
4245
4246/*----------------------------------------------------------------------------
4247| Returns the result of subtracting the extended double-precision floating-
4248| point values `a' and `b'. The operation is performed according to the
4249| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4250*----------------------------------------------------------------------------*/
4251
4252floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4253{
4254 flag aSign, bSign;
4255
4256 aSign = extractFloatx80Sign( a );
4257 bSign = extractFloatx80Sign( b );
4258 if ( aSign == bSign ) {
4259 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4260 }
4261 else {
4262 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4263 }
4264
4265}
4266
4267/*----------------------------------------------------------------------------
4268| Returns the result of multiplying the extended double-precision floating-
4269| point values `a' and `b'. The operation is performed according to the
4270| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271*----------------------------------------------------------------------------*/
4272
4273floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4274{
4275 flag aSign, bSign, zSign;
4276 int32 aExp, bExp, zExp;
bb98fe42 4277 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4278 floatx80 z;
4279
4280 aSig = extractFloatx80Frac( a );
4281 aExp = extractFloatx80Exp( a );
4282 aSign = extractFloatx80Sign( a );
4283 bSig = extractFloatx80Frac( b );
4284 bExp = extractFloatx80Exp( b );
4285 bSign = extractFloatx80Sign( b );
4286 zSign = aSign ^ bSign;
4287 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4288 if ( (uint64_t) ( aSig<<1 )
4289 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4290 return propagateFloatx80NaN( a, b STATUS_VAR );
4291 }
4292 if ( ( bExp | bSig ) == 0 ) goto invalid;
4293 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4294 }
4295 if ( bExp == 0x7FFF ) {
bb98fe42 4296 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4297 if ( ( aExp | aSig ) == 0 ) {
4298 invalid:
4299 float_raise( float_flag_invalid STATUS_VAR);
4300 z.low = floatx80_default_nan_low;
4301 z.high = floatx80_default_nan_high;
4302 return z;
4303 }
4304 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4305 }
4306 if ( aExp == 0 ) {
4307 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4308 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4309 }
4310 if ( bExp == 0 ) {
4311 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4312 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4313 }
4314 zExp = aExp + bExp - 0x3FFE;
4315 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 4316 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
4317 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4318 --zExp;
4319 }
4320 return
4321 roundAndPackFloatx80(
4322 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4323
4324}
4325
4326/*----------------------------------------------------------------------------
4327| Returns the result of dividing the extended double-precision floating-point
4328| value `a' by the corresponding value `b'. The operation is performed
4329| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4330*----------------------------------------------------------------------------*/
4331
4332floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4333{
4334 flag aSign, bSign, zSign;
4335 int32 aExp, bExp, zExp;
bb98fe42
AF
4336 uint64_t aSig, bSig, zSig0, zSig1;
4337 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
4338 floatx80 z;
4339
4340 aSig = extractFloatx80Frac( a );
4341 aExp = extractFloatx80Exp( a );
4342 aSign = extractFloatx80Sign( a );
4343 bSig = extractFloatx80Frac( b );
4344 bExp = extractFloatx80Exp( b );
4345 bSign = extractFloatx80Sign( b );
4346 zSign = aSign ^ bSign;
4347 if ( aExp == 0x7FFF ) {
bb98fe42 4348 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2 4349 if ( bExp == 0x7FFF ) {
bb98fe42 4350 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4351 goto invalid;
4352 }
4353 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4354 }
4355 if ( bExp == 0x7FFF ) {
bb98fe42 4356 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4357 return packFloatx80( zSign, 0, 0 );
4358 }
4359 if ( bExp == 0 ) {
4360 if ( bSig == 0 ) {
4361 if ( ( aExp | aSig ) == 0 ) {
4362 invalid:
4363 float_raise( float_flag_invalid STATUS_VAR);
4364 z.low = floatx80_default_nan_low;
4365 z.high = floatx80_default_nan_high;
4366 return z;
4367 }
4368 float_raise( float_flag_divbyzero STATUS_VAR);
4369 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4370 }
4371 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4372 }
4373 if ( aExp == 0 ) {
4374 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4375 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4376 }
4377 zExp = aExp - bExp + 0x3FFE;
4378 rem1 = 0;
4379 if ( bSig <= aSig ) {
4380 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4381 ++zExp;
4382 }
4383 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4384 mul64To128( bSig, zSig0, &term0, &term1 );
4385 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 4386 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4387 --zSig0;
4388 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4389 }
4390 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 4391 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
4392 mul64To128( bSig, zSig1, &term1, &term2 );
4393 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 4394 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
4395 --zSig1;
4396 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4397 }
4398 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4399 }
4400 return
4401 roundAndPackFloatx80(
4402 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4403
4404}
4405
4406/*----------------------------------------------------------------------------
4407| Returns the remainder of the extended double-precision floating-point value
4408| `a' with respect to the corresponding value `b'. The operation is performed
4409| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4410*----------------------------------------------------------------------------*/
4411
4412floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4413{
ed086f3d 4414 flag aSign, zSign;
158142c2 4415 int32 aExp, bExp, expDiff;
bb98fe42
AF
4416 uint64_t aSig0, aSig1, bSig;
4417 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
4418 floatx80 z;
4419
4420 aSig0 = extractFloatx80Frac( a );
4421 aExp = extractFloatx80Exp( a );
4422 aSign = extractFloatx80Sign( a );
4423 bSig = extractFloatx80Frac( b );
4424 bExp = extractFloatx80Exp( b );
158142c2 4425 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4426 if ( (uint64_t) ( aSig0<<1 )
4427 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4428 return propagateFloatx80NaN( a, b STATUS_VAR );
4429 }
4430 goto invalid;
4431 }
4432 if ( bExp == 0x7FFF ) {
bb98fe42 4433 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4434 return a;
4435 }
4436 if ( bExp == 0 ) {
4437 if ( bSig == 0 ) {
4438 invalid:
4439 float_raise( float_flag_invalid STATUS_VAR);
4440 z.low = floatx80_default_nan_low;
4441 z.high = floatx80_default_nan_high;
4442 return z;
4443 }
4444 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4445 }
4446 if ( aExp == 0 ) {
bb98fe42 4447 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
4448 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4449 }
4450 bSig |= LIT64( 0x8000000000000000 );
4451 zSign = aSign;
4452 expDiff = aExp - bExp;
4453 aSig1 = 0;
4454 if ( expDiff < 0 ) {
4455 if ( expDiff < -1 ) return a;
4456 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4457 expDiff = 0;
4458 }
4459 q = ( bSig <= aSig0 );
4460 if ( q ) aSig0 -= bSig;
4461 expDiff -= 64;
4462 while ( 0 < expDiff ) {
4463 q = estimateDiv128To64( aSig0, aSig1, bSig );
4464 q = ( 2 < q ) ? q - 2 : 0;
4465 mul64To128( bSig, q, &term0, &term1 );
4466 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4467 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4468 expDiff -= 62;
4469 }
4470 expDiff += 64;
4471 if ( 0 < expDiff ) {
4472 q = estimateDiv128To64( aSig0, aSig1, bSig );
4473 q = ( 2 < q ) ? q - 2 : 0;
4474 q >>= 64 - expDiff;
4475 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4476 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4477 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4478 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4479 ++q;
4480 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4481 }
4482 }
4483 else {
4484 term1 = 0;
4485 term0 = bSig;
4486 }
4487 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4488 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4489 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4490 && ( q & 1 ) )
4491 ) {
4492 aSig0 = alternateASig0;
4493 aSig1 = alternateASig1;
4494 zSign = ! zSign;
4495 }
4496 return
4497 normalizeRoundAndPackFloatx80(
4498 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4499
4500}
4501
4502/*----------------------------------------------------------------------------
4503| Returns the square root of the extended double-precision floating-point
4504| value `a'. The operation is performed according to the IEC/IEEE Standard
4505| for Binary Floating-Point Arithmetic.
4506*----------------------------------------------------------------------------*/
4507
4508floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4509{
4510 flag aSign;
4511 int32 aExp, zExp;
bb98fe42
AF
4512 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4513 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
4514 floatx80 z;
4515
4516 aSig0 = extractFloatx80Frac( a );
4517 aExp = extractFloatx80Exp( a );
4518 aSign = extractFloatx80Sign( a );
4519 if ( aExp == 0x7FFF ) {
bb98fe42 4520 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
158142c2
FB
4521 if ( ! aSign ) return a;
4522 goto invalid;
4523 }
4524 if ( aSign ) {
4525 if ( ( aExp | aSig0 ) == 0 ) return a;
4526 invalid:
4527 float_raise( float_flag_invalid STATUS_VAR);
4528 z.low = floatx80_default_nan_low;
4529 z.high = floatx80_default_nan_high;
4530 return z;
4531 }
4532 if ( aExp == 0 ) {
4533 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4534 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4535 }
4536 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4537 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4538 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4539 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4540 doubleZSig0 = zSig0<<1;
4541 mul64To128( zSig0, zSig0, &term0, &term1 );
4542 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 4543 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4544 --zSig0;
4545 doubleZSig0 -= 2;
4546 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4547 }
4548 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4549 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4550 if ( zSig1 == 0 ) zSig1 = 1;
4551 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4552 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4553 mul64To128( zSig1, zSig1, &term2, &term3 );
4554 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 4555 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
4556 --zSig1;
4557 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4558 term3 |= 1;
4559 term2 |= doubleZSig0;
4560 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4561 }
4562 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4563 }
4564 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4565 zSig0 |= doubleZSig0;
4566 return
4567 roundAndPackFloatx80(
4568 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4569
4570}
4571
4572/*----------------------------------------------------------------------------
b689362d
AJ
4573| Returns 1 if the extended double-precision floating-point value `a' is equal
4574| to the corresponding value `b', and 0 otherwise. The invalid exception is
4575| raised if either operand is a NaN. Otherwise, the comparison is performed
4576| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4577*----------------------------------------------------------------------------*/
4578
b689362d 4579int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4580{
4581
4582 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4583 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4584 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4585 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 4586 ) {
b689362d 4587 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
4588 return 0;
4589 }
4590 return
4591 ( a.low == b.low )
4592 && ( ( a.high == b.high )
4593 || ( ( a.low == 0 )
bb98fe42 4594 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
4595 );
4596
4597}
4598
4599/*----------------------------------------------------------------------------
4600| Returns 1 if the extended double-precision floating-point value `a' is
4601| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
4602| invalid exception is raised if either operand is a NaN. The comparison is
4603| performed according to the IEC/IEEE Standard for Binary Floating-Point
4604| Arithmetic.
158142c2
FB
4605*----------------------------------------------------------------------------*/
4606
750afe93 4607int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4608{
4609 flag aSign, bSign;
4610
4611 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4612 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4613 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4614 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4615 ) {
4616 float_raise( float_flag_invalid STATUS_VAR);
4617 return 0;
4618 }
4619 aSign = extractFloatx80Sign( a );
4620 bSign = extractFloatx80Sign( b );
4621 if ( aSign != bSign ) {
4622 return
4623 aSign
bb98fe42 4624 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4625 == 0 );
4626 }
4627 return
4628 aSign ? le128( b.high, b.low, a.high, a.low )
4629 : le128( a.high, a.low, b.high, b.low );
4630
4631}
4632
4633/*----------------------------------------------------------------------------
4634| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
4635| less than the corresponding value `b', and 0 otherwise. The invalid
4636| exception is raised if either operand is a NaN. The comparison is performed
4637| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4638*----------------------------------------------------------------------------*/
4639
750afe93 4640int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4641{
4642 flag aSign, bSign;
4643
4644 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4645 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4646 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4647 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4648 ) {
4649 float_raise( float_flag_invalid STATUS_VAR);
4650 return 0;
4651 }
4652 aSign = extractFloatx80Sign( a );
4653 bSign = extractFloatx80Sign( b );
4654 if ( aSign != bSign ) {
4655 return
4656 aSign
bb98fe42 4657 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4658 != 0 );
4659 }
4660 return
4661 aSign ? lt128( b.high, b.low, a.high, a.low )
4662 : lt128( a.high, a.low, b.high, b.low );
4663
4664}
4665
67b7861d
AJ
4666/*----------------------------------------------------------------------------
4667| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
4668| cannot be compared, and 0 otherwise. The invalid exception is raised if
4669| either operand is a NaN. The comparison is performed according to the
4670| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4671*----------------------------------------------------------------------------*/
4672int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4673{
4674 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4675 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4676 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4677 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4678 ) {
4679 float_raise( float_flag_invalid STATUS_VAR);
4680 return 1;
4681 }
4682 return 0;
4683}
4684
158142c2 4685/*----------------------------------------------------------------------------
b689362d 4686| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
4687| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4688| cause an exception. The comparison is performed according to the IEC/IEEE
4689| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4690*----------------------------------------------------------------------------*/
4691
b689362d 4692int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4693{
4694
4695 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4696 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4697 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4698 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 4699 ) {
b689362d
AJ
4700 if ( floatx80_is_signaling_nan( a )
4701 || floatx80_is_signaling_nan( b ) ) {
4702 float_raise( float_flag_invalid STATUS_VAR);
4703 }
158142c2
FB
4704 return 0;
4705 }
4706 return
4707 ( a.low == b.low )
4708 && ( ( a.high == b.high )
4709 || ( ( a.low == 0 )
bb98fe42 4710 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
4711 );
4712
4713}
4714
4715/*----------------------------------------------------------------------------
4716| Returns 1 if the extended double-precision floating-point value `a' is less
4717| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4718| do not cause an exception. Otherwise, the comparison is performed according
4719| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4720*----------------------------------------------------------------------------*/
4721
750afe93 4722int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4723{
4724 flag aSign, bSign;
4725
4726 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4727 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4728 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4729 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4730 ) {
4731 if ( floatx80_is_signaling_nan( a )
4732 || floatx80_is_signaling_nan( b ) ) {
4733 float_raise( float_flag_invalid STATUS_VAR);
4734 }
4735 return 0;
4736 }
4737 aSign = extractFloatx80Sign( a );
4738 bSign = extractFloatx80Sign( b );
4739 if ( aSign != bSign ) {
4740 return
4741 aSign
bb98fe42 4742 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4743 == 0 );
4744 }
4745 return
4746 aSign ? le128( b.high, b.low, a.high, a.low )
4747 : le128( a.high, a.low, b.high, b.low );
4748
4749}
4750
4751/*----------------------------------------------------------------------------
4752| Returns 1 if the extended double-precision floating-point value `a' is less
4753| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4754| an exception. Otherwise, the comparison is performed according to the
4755| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4756*----------------------------------------------------------------------------*/
4757
750afe93 4758int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
4759{
4760 flag aSign, bSign;
4761
4762 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 4763 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 4764 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 4765 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
4766 ) {
4767 if ( floatx80_is_signaling_nan( a )
4768 || floatx80_is_signaling_nan( b ) ) {
4769 float_raise( float_flag_invalid STATUS_VAR);
4770 }
4771 return 0;
4772 }
4773 aSign = extractFloatx80Sign( a );
4774 bSign = extractFloatx80Sign( b );
4775 if ( aSign != bSign ) {
4776 return
4777 aSign
bb98fe42 4778 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
4779 != 0 );
4780 }
4781 return
4782 aSign ? lt128( b.high, b.low, a.high, a.low )
4783 : lt128( a.high, a.low, b.high, b.low );
4784
4785}
4786
67b7861d
AJ
4787/*----------------------------------------------------------------------------
4788| Returns 1 if the extended double-precision floating-point values `a' and `b'
4789| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
4790| The comparison is performed according to the IEC/IEEE Standard for Binary
4791| Floating-Point Arithmetic.
4792*----------------------------------------------------------------------------*/
4793int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4794{
4795 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4796 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4797 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4798 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4799 ) {
4800 if ( floatx80_is_signaling_nan( a )
4801 || floatx80_is_signaling_nan( b ) ) {
4802 float_raise( float_flag_invalid STATUS_VAR);
4803 }
4804 return 1;
4805 }
4806 return 0;
4807}
4808
158142c2
FB
4809/*----------------------------------------------------------------------------
4810| Returns the result of converting the quadruple-precision floating-point
4811| value `a' to the 32-bit two's complement integer format. The conversion
4812| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4813| Arithmetic---which means in particular that the conversion is rounded
4814| according to the current rounding mode. If `a' is a NaN, the largest
4815| positive integer is returned. Otherwise, if the conversion overflows, the
4816| largest integer with the same sign as `a' is returned.
4817*----------------------------------------------------------------------------*/
4818
4819int32 float128_to_int32( float128 a STATUS_PARAM )
4820{
4821 flag aSign;
4822 int32 aExp, shiftCount;
bb98fe42 4823 uint64_t aSig0, aSig1;
158142c2
FB
4824
4825 aSig1 = extractFloat128Frac1( a );
4826 aSig0 = extractFloat128Frac0( a );
4827 aExp = extractFloat128Exp( a );
4828 aSign = extractFloat128Sign( a );
4829 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4830 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4831 aSig0 |= ( aSig1 != 0 );
4832 shiftCount = 0x4028 - aExp;
4833 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4834 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4835
4836}
4837
4838/*----------------------------------------------------------------------------
4839| Returns the result of converting the quadruple-precision floating-point
4840| value `a' to the 32-bit two's complement integer format. The conversion
4841| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4842| Arithmetic, except that the conversion is always rounded toward zero. If
4843| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4844| conversion overflows, the largest integer with the same sign as `a' is
4845| returned.
4846*----------------------------------------------------------------------------*/
4847
4848int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4849{
4850 flag aSign;
4851 int32 aExp, shiftCount;
bb98fe42 4852 uint64_t aSig0, aSig1, savedASig;
158142c2
FB
4853 int32 z;
4854
4855 aSig1 = extractFloat128Frac1( a );
4856 aSig0 = extractFloat128Frac0( a );
4857 aExp = extractFloat128Exp( a );
4858 aSign = extractFloat128Sign( a );
4859 aSig0 |= ( aSig1 != 0 );
4860 if ( 0x401E < aExp ) {
4861 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4862 goto invalid;
4863 }
4864 else if ( aExp < 0x3FFF ) {
4865 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4866 return 0;
4867 }
4868 aSig0 |= LIT64( 0x0001000000000000 );
4869 shiftCount = 0x402F - aExp;
4870 savedASig = aSig0;
4871 aSig0 >>= shiftCount;
4872 z = aSig0;
4873 if ( aSign ) z = - z;
4874 if ( ( z < 0 ) ^ aSign ) {
4875 invalid:
4876 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 4877 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4878 }
4879 if ( ( aSig0<<shiftCount ) != savedASig ) {
4880 STATUS(float_exception_flags) |= float_flag_inexact;
4881 }
4882 return z;
4883
4884}
4885
4886/*----------------------------------------------------------------------------
4887| Returns the result of converting the quadruple-precision floating-point
4888| value `a' to the 64-bit two's complement integer format. The conversion
4889| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4890| Arithmetic---which means in particular that the conversion is rounded
4891| according to the current rounding mode. If `a' is a NaN, the largest
4892| positive integer is returned. Otherwise, if the conversion overflows, the
4893| largest integer with the same sign as `a' is returned.
4894*----------------------------------------------------------------------------*/
4895
4896int64 float128_to_int64( float128 a STATUS_PARAM )
4897{
4898 flag aSign;
4899 int32 aExp, shiftCount;
bb98fe42 4900 uint64_t aSig0, aSig1;
158142c2
FB
4901
4902 aSig1 = extractFloat128Frac1( a );
4903 aSig0 = extractFloat128Frac0( a );
4904 aExp = extractFloat128Exp( a );
4905 aSign = extractFloat128Sign( a );
4906 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4907 shiftCount = 0x402F - aExp;
4908 if ( shiftCount <= 0 ) {
4909 if ( 0x403E < aExp ) {
4910 float_raise( float_flag_invalid STATUS_VAR);
4911 if ( ! aSign
4912 || ( ( aExp == 0x7FFF )
4913 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4914 )
4915 ) {
4916 return LIT64( 0x7FFFFFFFFFFFFFFF );
4917 }
bb98fe42 4918 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4919 }
4920 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4921 }
4922 else {
4923 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4924 }
4925 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4926
4927}
4928
4929/*----------------------------------------------------------------------------
4930| Returns the result of converting the quadruple-precision floating-point
4931| value `a' to the 64-bit two's complement integer format. The conversion
4932| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4933| Arithmetic, except that the conversion is always rounded toward zero.
4934| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4935| the conversion overflows, the largest integer with the same sign as `a' is
4936| returned.
4937*----------------------------------------------------------------------------*/
4938
4939int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4940{
4941 flag aSign;
4942 int32 aExp, shiftCount;
bb98fe42 4943 uint64_t aSig0, aSig1;
158142c2
FB
4944 int64 z;
4945
4946 aSig1 = extractFloat128Frac1( a );
4947 aSig0 = extractFloat128Frac0( a );
4948 aExp = extractFloat128Exp( a );
4949 aSign = extractFloat128Sign( a );
4950 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4951 shiftCount = aExp - 0x402F;
4952 if ( 0 < shiftCount ) {
4953 if ( 0x403E <= aExp ) {
4954 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4955 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4956 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4957 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4958 }
4959 else {
4960 float_raise( float_flag_invalid STATUS_VAR);
4961 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4962 return LIT64( 0x7FFFFFFFFFFFFFFF );
4963 }
4964 }
bb98fe42 4965 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4966 }
4967 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 4968 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
158142c2
FB
4969 STATUS(float_exception_flags) |= float_flag_inexact;
4970 }
4971 }
4972 else {
4973 if ( aExp < 0x3FFF ) {
4974 if ( aExp | aSig0 | aSig1 ) {
4975 STATUS(float_exception_flags) |= float_flag_inexact;
4976 }
4977 return 0;
4978 }
4979 z = aSig0>>( - shiftCount );
4980 if ( aSig1
bb98fe42 4981 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
158142c2
FB
4982 STATUS(float_exception_flags) |= float_flag_inexact;
4983 }
4984 }
4985 if ( aSign ) z = - z;
4986 return z;
4987
4988}
4989
4990/*----------------------------------------------------------------------------
4991| Returns the result of converting the quadruple-precision floating-point
4992| value `a' to the single-precision floating-point format. The conversion
4993| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4994| Arithmetic.
4995*----------------------------------------------------------------------------*/
4996
4997float32 float128_to_float32( float128 a STATUS_PARAM )
4998{
4999 flag aSign;
5000 int32 aExp;
bb98fe42
AF
5001 uint64_t aSig0, aSig1;
5002 uint32_t zSig;
158142c2
FB
5003
5004 aSig1 = extractFloat128Frac1( a );
5005 aSig0 = extractFloat128Frac0( a );
5006 aExp = extractFloat128Exp( a );
5007 aSign = extractFloat128Sign( a );
5008 if ( aExp == 0x7FFF ) {
5009 if ( aSig0 | aSig1 ) {
bcd4d9af 5010 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5011 }
5012 return packFloat32( aSign, 0xFF, 0 );
5013 }
5014 aSig0 |= ( aSig1 != 0 );
5015 shift64RightJamming( aSig0, 18, &aSig0 );
5016 zSig = aSig0;
5017 if ( aExp || zSig ) {
5018 zSig |= 0x40000000;
5019 aExp -= 0x3F81;
5020 }
5021 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5022
5023}
5024
5025/*----------------------------------------------------------------------------
5026| Returns the result of converting the quadruple-precision floating-point
5027| value `a' to the double-precision floating-point format. The conversion
5028| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5029| Arithmetic.
5030*----------------------------------------------------------------------------*/
5031
5032float64 float128_to_float64( float128 a STATUS_PARAM )
5033{
5034 flag aSign;
5035 int32 aExp;
bb98fe42 5036 uint64_t aSig0, aSig1;
158142c2
FB
5037
5038 aSig1 = extractFloat128Frac1( a );
5039 aSig0 = extractFloat128Frac0( a );
5040 aExp = extractFloat128Exp( a );
5041 aSign = extractFloat128Sign( a );
5042 if ( aExp == 0x7FFF ) {
5043 if ( aSig0 | aSig1 ) {
bcd4d9af 5044 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5045 }
5046 return packFloat64( aSign, 0x7FF, 0 );
5047 }
5048 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5049 aSig0 |= ( aSig1 != 0 );
5050 if ( aExp || aSig0 ) {
5051 aSig0 |= LIT64( 0x4000000000000000 );
5052 aExp -= 0x3C01;
5053 }
5054 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5055
5056}
5057
158142c2
FB
5058/*----------------------------------------------------------------------------
5059| Returns the result of converting the quadruple-precision floating-point
5060| value `a' to the extended double-precision floating-point format. The
5061| conversion is performed according to the IEC/IEEE Standard for Binary
5062| Floating-Point Arithmetic.
5063*----------------------------------------------------------------------------*/
5064
5065floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5066{
5067 flag aSign;
5068 int32 aExp;
bb98fe42 5069 uint64_t aSig0, aSig1;
158142c2
FB
5070
5071 aSig1 = extractFloat128Frac1( a );
5072 aSig0 = extractFloat128Frac0( a );
5073 aExp = extractFloat128Exp( a );
5074 aSign = extractFloat128Sign( a );
5075 if ( aExp == 0x7FFF ) {
5076 if ( aSig0 | aSig1 ) {
bcd4d9af 5077 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5078 }
5079 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5080 }
5081 if ( aExp == 0 ) {
5082 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5083 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5084 }
5085 else {
5086 aSig0 |= LIT64( 0x0001000000000000 );
5087 }
5088 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5089 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5090
5091}
5092
158142c2
FB
5093/*----------------------------------------------------------------------------
5094| Rounds the quadruple-precision floating-point value `a' to an integer, and
5095| returns the result as a quadruple-precision floating-point value. The
5096| operation is performed according to the IEC/IEEE Standard for Binary
5097| Floating-Point Arithmetic.
5098*----------------------------------------------------------------------------*/
5099
5100float128 float128_round_to_int( float128 a STATUS_PARAM )
5101{
5102 flag aSign;
5103 int32 aExp;
bb98fe42 5104 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5105 int8 roundingMode;
5106 float128 z;
5107
5108 aExp = extractFloat128Exp( a );
5109 if ( 0x402F <= aExp ) {
5110 if ( 0x406F <= aExp ) {
5111 if ( ( aExp == 0x7FFF )
5112 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5113 ) {
5114 return propagateFloat128NaN( a, a STATUS_VAR );
5115 }
5116 return a;
5117 }
5118 lastBitMask = 1;
5119 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5120 roundBitsMask = lastBitMask - 1;
5121 z = a;
5122 roundingMode = STATUS(float_rounding_mode);
5123 if ( roundingMode == float_round_nearest_even ) {
5124 if ( lastBitMask ) {
5125 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5126 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5127 }
5128 else {
bb98fe42 5129 if ( (int64_t) z.low < 0 ) {
158142c2 5130 ++z.high;
bb98fe42 5131 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
5132 }
5133 }
5134 }
5135 else if ( roundingMode != float_round_to_zero ) {
5136 if ( extractFloat128Sign( z )
5137 ^ ( roundingMode == float_round_up ) ) {
5138 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5139 }
5140 }
5141 z.low &= ~ roundBitsMask;
5142 }
5143 else {
5144 if ( aExp < 0x3FFF ) {
bb98fe42 5145 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
158142c2
FB
5146 STATUS(float_exception_flags) |= float_flag_inexact;
5147 aSign = extractFloat128Sign( a );
5148 switch ( STATUS(float_rounding_mode) ) {
5149 case float_round_nearest_even:
5150 if ( ( aExp == 0x3FFE )
5151 && ( extractFloat128Frac0( a )
5152 | extractFloat128Frac1( a ) )
5153 ) {
5154 return packFloat128( aSign, 0x3FFF, 0, 0 );
5155 }
5156 break;
5157 case float_round_down:
5158 return
5159 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5160 : packFloat128( 0, 0, 0, 0 );
5161 case float_round_up:
5162 return
5163 aSign ? packFloat128( 1, 0, 0, 0 )
5164 : packFloat128( 0, 0x3FFF, 0, 0 );
5165 }
5166 return packFloat128( aSign, 0, 0, 0 );
5167 }
5168 lastBitMask = 1;
5169 lastBitMask <<= 0x402F - aExp;
5170 roundBitsMask = lastBitMask - 1;
5171 z.low = 0;
5172 z.high = a.high;
5173 roundingMode = STATUS(float_rounding_mode);
5174 if ( roundingMode == float_round_nearest_even ) {
5175 z.high += lastBitMask>>1;
5176 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5177 z.high &= ~ lastBitMask;
5178 }
5179 }
5180 else if ( roundingMode != float_round_to_zero ) {
5181 if ( extractFloat128Sign( z )
5182 ^ ( roundingMode == float_round_up ) ) {
5183 z.high |= ( a.low != 0 );
5184 z.high += roundBitsMask;
5185 }
5186 }
5187 z.high &= ~ roundBitsMask;
5188 }
5189 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5190 STATUS(float_exception_flags) |= float_flag_inexact;
5191 }
5192 return z;
5193
5194}
5195
5196/*----------------------------------------------------------------------------
5197| Returns the result of adding the absolute values of the quadruple-precision
5198| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5199| before being returned. `zSign' is ignored if the result is a NaN.
5200| The addition is performed according to the IEC/IEEE Standard for Binary
5201| Floating-Point Arithmetic.
5202*----------------------------------------------------------------------------*/
5203
5204static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5205{
5206 int32 aExp, bExp, zExp;
bb98fe42 5207 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
158142c2
FB
5208 int32 expDiff;
5209
5210 aSig1 = extractFloat128Frac1( a );
5211 aSig0 = extractFloat128Frac0( a );
5212 aExp = extractFloat128Exp( a );
5213 bSig1 = extractFloat128Frac1( b );
5214 bSig0 = extractFloat128Frac0( b );
5215 bExp = extractFloat128Exp( b );
5216 expDiff = aExp - bExp;
5217 if ( 0 < expDiff ) {
5218 if ( aExp == 0x7FFF ) {
5219 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5220 return a;
5221 }
5222 if ( bExp == 0 ) {
5223 --expDiff;
5224 }
5225 else {
5226 bSig0 |= LIT64( 0x0001000000000000 );
5227 }
5228 shift128ExtraRightJamming(
5229 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5230 zExp = aExp;
5231 }
5232 else if ( expDiff < 0 ) {
5233 if ( bExp == 0x7FFF ) {
5234 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5235 return packFloat128( zSign, 0x7FFF, 0, 0 );
5236 }
5237 if ( aExp == 0 ) {
5238 ++expDiff;
5239 }
5240 else {
5241 aSig0 |= LIT64( 0x0001000000000000 );
5242 }
5243 shift128ExtraRightJamming(
5244 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5245 zExp = bExp;
5246 }
5247 else {
5248 if ( aExp == 0x7FFF ) {
5249 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5250 return propagateFloat128NaN( a, b STATUS_VAR );
5251 }
5252 return a;
5253 }
5254 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 5255 if ( aExp == 0 ) {
e6afc87f
PM
5256 if (STATUS(flush_to_zero)) {
5257 if (zSig0 | zSig1) {
5258 float_raise(float_flag_output_denormal STATUS_VAR);
5259 }
5260 return packFloat128(zSign, 0, 0, 0);
5261 }
fe76d976
PB
5262 return packFloat128( zSign, 0, zSig0, zSig1 );
5263 }
158142c2
FB
5264 zSig2 = 0;
5265 zSig0 |= LIT64( 0x0002000000000000 );
5266 zExp = aExp;
5267 goto shiftRight1;
5268 }
5269 aSig0 |= LIT64( 0x0001000000000000 );
5270 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5271 --zExp;
5272 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5273 ++zExp;
5274 shiftRight1:
5275 shift128ExtraRightJamming(
5276 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5277 roundAndPack:
5278 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5279
5280}
5281
5282/*----------------------------------------------------------------------------
5283| Returns the result of subtracting the absolute values of the quadruple-
5284| precision floating-point values `a' and `b'. If `zSign' is 1, the
5285| difference is negated before being returned. `zSign' is ignored if the
5286| result is a NaN. The subtraction is performed according to the IEC/IEEE
5287| Standard for Binary Floating-Point Arithmetic.
5288*----------------------------------------------------------------------------*/
5289
5290static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5291{
5292 int32 aExp, bExp, zExp;
bb98fe42 5293 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
158142c2
FB
5294 int32 expDiff;
5295 float128 z;
5296
5297 aSig1 = extractFloat128Frac1( a );
5298 aSig0 = extractFloat128Frac0( a );
5299 aExp = extractFloat128Exp( a );
5300 bSig1 = extractFloat128Frac1( b );
5301 bSig0 = extractFloat128Frac0( b );
5302 bExp = extractFloat128Exp( b );
5303 expDiff = aExp - bExp;
5304 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5305 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5306 if ( 0 < expDiff ) goto aExpBigger;
5307 if ( expDiff < 0 ) goto bExpBigger;
5308 if ( aExp == 0x7FFF ) {
5309 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5310 return propagateFloat128NaN( a, b STATUS_VAR );
5311 }
5312 float_raise( float_flag_invalid STATUS_VAR);
5313 z.low = float128_default_nan_low;
5314 z.high = float128_default_nan_high;
5315 return z;
5316 }
5317 if ( aExp == 0 ) {
5318 aExp = 1;
5319 bExp = 1;
5320 }
5321 if ( bSig0 < aSig0 ) goto aBigger;
5322 if ( aSig0 < bSig0 ) goto bBigger;
5323 if ( bSig1 < aSig1 ) goto aBigger;
5324 if ( aSig1 < bSig1 ) goto bBigger;
5325 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5326 bExpBigger:
5327 if ( bExp == 0x7FFF ) {
5328 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5329 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5330 }
5331 if ( aExp == 0 ) {
5332 ++expDiff;
5333 }
5334 else {
5335 aSig0 |= LIT64( 0x4000000000000000 );
5336 }
5337 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5338 bSig0 |= LIT64( 0x4000000000000000 );
5339 bBigger:
5340 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5341 zExp = bExp;
5342 zSign ^= 1;
5343 goto normalizeRoundAndPack;
5344 aExpBigger:
5345 if ( aExp == 0x7FFF ) {
5346 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5347 return a;
5348 }
5349 if ( bExp == 0 ) {
5350 --expDiff;
5351 }
5352 else {
5353 bSig0 |= LIT64( 0x4000000000000000 );
5354 }
5355 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5356 aSig0 |= LIT64( 0x4000000000000000 );
5357 aBigger:
5358 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5359 zExp = aExp;
5360 normalizeRoundAndPack:
5361 --zExp;
5362 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5363
5364}
5365
5366/*----------------------------------------------------------------------------
5367| Returns the result of adding the quadruple-precision floating-point values
5368| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5369| for Binary Floating-Point Arithmetic.
5370*----------------------------------------------------------------------------*/
5371
5372float128 float128_add( float128 a, float128 b STATUS_PARAM )
5373{
5374 flag aSign, bSign;
5375
5376 aSign = extractFloat128Sign( a );
5377 bSign = extractFloat128Sign( b );
5378 if ( aSign == bSign ) {
5379 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5380 }
5381 else {
5382 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5383 }
5384
5385}
5386
5387/*----------------------------------------------------------------------------
5388| Returns the result of subtracting the quadruple-precision floating-point
5389| values `a' and `b'. The operation is performed according to the IEC/IEEE
5390| Standard for Binary Floating-Point Arithmetic.
5391*----------------------------------------------------------------------------*/
5392
5393float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5394{
5395 flag aSign, bSign;
5396
5397 aSign = extractFloat128Sign( a );
5398 bSign = extractFloat128Sign( b );
5399 if ( aSign == bSign ) {
5400 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5401 }
5402 else {
5403 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5404 }
5405
5406}
5407
5408/*----------------------------------------------------------------------------
5409| Returns the result of multiplying the quadruple-precision floating-point
5410| values `a' and `b'. The operation is performed according to the IEC/IEEE
5411| Standard for Binary Floating-Point Arithmetic.
5412*----------------------------------------------------------------------------*/
5413
5414float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5415{
5416 flag aSign, bSign, zSign;
5417 int32 aExp, bExp, zExp;
bb98fe42 5418 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
5419 float128 z;
5420
5421 aSig1 = extractFloat128Frac1( a );
5422 aSig0 = extractFloat128Frac0( a );
5423 aExp = extractFloat128Exp( a );
5424 aSign = extractFloat128Sign( a );
5425 bSig1 = extractFloat128Frac1( b );
5426 bSig0 = extractFloat128Frac0( b );
5427 bExp = extractFloat128Exp( b );
5428 bSign = extractFloat128Sign( b );
5429 zSign = aSign ^ bSign;
5430 if ( aExp == 0x7FFF ) {
5431 if ( ( aSig0 | aSig1 )
5432 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5433 return propagateFloat128NaN( a, b STATUS_VAR );
5434 }
5435 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5436 return packFloat128( zSign, 0x7FFF, 0, 0 );
5437 }
5438 if ( bExp == 0x7FFF ) {
5439 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5440 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5441 invalid:
5442 float_raise( float_flag_invalid STATUS_VAR);
5443 z.low = float128_default_nan_low;
5444 z.high = float128_default_nan_high;
5445 return z;
5446 }
5447 return packFloat128( zSign, 0x7FFF, 0, 0 );
5448 }
5449 if ( aExp == 0 ) {
5450 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5451 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5452 }
5453 if ( bExp == 0 ) {
5454 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5455 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5456 }
5457 zExp = aExp + bExp - 0x4000;
5458 aSig0 |= LIT64( 0x0001000000000000 );
5459 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5460 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5461 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5462 zSig2 |= ( zSig3 != 0 );
5463 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5464 shift128ExtraRightJamming(
5465 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5466 ++zExp;
5467 }
5468 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5469
5470}
5471
5472/*----------------------------------------------------------------------------
5473| Returns the result of dividing the quadruple-precision floating-point value
5474| `a' by the corresponding value `b'. The operation is performed according to
5475| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5476*----------------------------------------------------------------------------*/
5477
5478float128 float128_div( float128 a, float128 b STATUS_PARAM )
5479{
5480 flag aSign, bSign, zSign;
5481 int32 aExp, bExp, zExp;
bb98fe42
AF
5482 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5483 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5484 float128 z;
5485
5486 aSig1 = extractFloat128Frac1( a );
5487 aSig0 = extractFloat128Frac0( a );
5488 aExp = extractFloat128Exp( a );
5489 aSign = extractFloat128Sign( a );
5490 bSig1 = extractFloat128Frac1( b );
5491 bSig0 = extractFloat128Frac0( b );
5492 bExp = extractFloat128Exp( b );
5493 bSign = extractFloat128Sign( b );
5494 zSign = aSign ^ bSign;
5495 if ( aExp == 0x7FFF ) {
5496 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5497 if ( bExp == 0x7FFF ) {
5498 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5499 goto invalid;
5500 }
5501 return packFloat128( zSign, 0x7FFF, 0, 0 );
5502 }
5503 if ( bExp == 0x7FFF ) {
5504 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5505 return packFloat128( zSign, 0, 0, 0 );
5506 }
5507 if ( bExp == 0 ) {
5508 if ( ( bSig0 | bSig1 ) == 0 ) {
5509 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5510 invalid:
5511 float_raise( float_flag_invalid STATUS_VAR);
5512 z.low = float128_default_nan_low;
5513 z.high = float128_default_nan_high;
5514 return z;
5515 }
5516 float_raise( float_flag_divbyzero STATUS_VAR);
5517 return packFloat128( zSign, 0x7FFF, 0, 0 );
5518 }
5519 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5520 }
5521 if ( aExp == 0 ) {
5522 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5523 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5524 }
5525 zExp = aExp - bExp + 0x3FFD;
5526 shortShift128Left(
5527 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5528 shortShift128Left(
5529 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5530 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5531 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5532 ++zExp;
5533 }
5534 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5535 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5536 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 5537 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5538 --zSig0;
5539 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5540 }
5541 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5542 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5543 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5544 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5545 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5546 --zSig1;
5547 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5548 }
5549 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5550 }
5551 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5552 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5553
5554}
5555
5556/*----------------------------------------------------------------------------
5557| Returns the remainder of the quadruple-precision floating-point value `a'
5558| with respect to the corresponding value `b'. The operation is performed
5559| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5560*----------------------------------------------------------------------------*/
5561
5562float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5563{
ed086f3d 5564 flag aSign, zSign;
158142c2 5565 int32 aExp, bExp, expDiff;
bb98fe42
AF
5566 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5567 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5568 int64_t sigMean0;
158142c2
FB
5569 float128 z;
5570
5571 aSig1 = extractFloat128Frac1( a );
5572 aSig0 = extractFloat128Frac0( a );
5573 aExp = extractFloat128Exp( a );
5574 aSign = extractFloat128Sign( a );
5575 bSig1 = extractFloat128Frac1( b );
5576 bSig0 = extractFloat128Frac0( b );
5577 bExp = extractFloat128Exp( b );
158142c2
FB
5578 if ( aExp == 0x7FFF ) {
5579 if ( ( aSig0 | aSig1 )
5580 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5581 return propagateFloat128NaN( a, b STATUS_VAR );
5582 }
5583 goto invalid;
5584 }
5585 if ( bExp == 0x7FFF ) {
5586 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5587 return a;
5588 }
5589 if ( bExp == 0 ) {
5590 if ( ( bSig0 | bSig1 ) == 0 ) {
5591 invalid:
5592 float_raise( float_flag_invalid STATUS_VAR);
5593 z.low = float128_default_nan_low;
5594 z.high = float128_default_nan_high;
5595 return z;
5596 }
5597 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5598 }
5599 if ( aExp == 0 ) {
5600 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5601 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5602 }
5603 expDiff = aExp - bExp;
5604 if ( expDiff < -1 ) return a;
5605 shortShift128Left(
5606 aSig0 | LIT64( 0x0001000000000000 ),
5607 aSig1,
5608 15 - ( expDiff < 0 ),
5609 &aSig0,
5610 &aSig1
5611 );
5612 shortShift128Left(
5613 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5614 q = le128( bSig0, bSig1, aSig0, aSig1 );
5615 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5616 expDiff -= 64;
5617 while ( 0 < expDiff ) {
5618 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5619 q = ( 4 < q ) ? q - 4 : 0;
5620 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5621 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5622 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5623 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5624 expDiff -= 61;
5625 }
5626 if ( -64 < expDiff ) {
5627 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5628 q = ( 4 < q ) ? q - 4 : 0;
5629 q >>= - expDiff;
5630 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5631 expDiff += 52;
5632 if ( expDiff < 0 ) {
5633 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5634 }
5635 else {
5636 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5637 }
5638 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5639 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5640 }
5641 else {
5642 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5643 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5644 }
5645 do {
5646 alternateASig0 = aSig0;
5647 alternateASig1 = aSig1;
5648 ++q;
5649 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 5650 } while ( 0 <= (int64_t) aSig0 );
158142c2 5651 add128(
bb98fe42 5652 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
5653 if ( ( sigMean0 < 0 )
5654 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5655 aSig0 = alternateASig0;
5656 aSig1 = alternateASig1;
5657 }
bb98fe42 5658 zSign = ( (int64_t) aSig0 < 0 );
158142c2
FB
5659 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5660 return
5661 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5662
5663}
5664
5665/*----------------------------------------------------------------------------
5666| Returns the square root of the quadruple-precision floating-point value `a'.
5667| The operation is performed according to the IEC/IEEE Standard for Binary
5668| Floating-Point Arithmetic.
5669*----------------------------------------------------------------------------*/
5670
5671float128 float128_sqrt( float128 a STATUS_PARAM )
5672{
5673 flag aSign;
5674 int32 aExp, zExp;
bb98fe42
AF
5675 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5676 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5677 float128 z;
5678
5679 aSig1 = extractFloat128Frac1( a );
5680 aSig0 = extractFloat128Frac0( a );
5681 aExp = extractFloat128Exp( a );
5682 aSign = extractFloat128Sign( a );
5683 if ( aExp == 0x7FFF ) {
5684 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5685 if ( ! aSign ) return a;
5686 goto invalid;
5687 }
5688 if ( aSign ) {
5689 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5690 invalid:
5691 float_raise( float_flag_invalid STATUS_VAR);
5692 z.low = float128_default_nan_low;
5693 z.high = float128_default_nan_high;
5694 return z;
5695 }
5696 if ( aExp == 0 ) {
5697 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5698 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5699 }
5700 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5701 aSig0 |= LIT64( 0x0001000000000000 );
5702 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5703 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5704 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5705 doubleZSig0 = zSig0<<1;
5706 mul64To128( zSig0, zSig0, &term0, &term1 );
5707 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5708 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5709 --zSig0;
5710 doubleZSig0 -= 2;
5711 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5712 }
5713 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5714 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5715 if ( zSig1 == 0 ) zSig1 = 1;
5716 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5717 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5718 mul64To128( zSig1, zSig1, &term2, &term3 );
5719 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5720 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5721 --zSig1;
5722 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5723 term3 |= 1;
5724 term2 |= doubleZSig0;
5725 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5726 }
5727 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5728 }
5729 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5730 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5731
5732}
5733
5734/*----------------------------------------------------------------------------
5735| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
5736| the corresponding value `b', and 0 otherwise. The invalid exception is
5737| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
5738| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5739*----------------------------------------------------------------------------*/
5740
b689362d 5741int float128_eq( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5742{
5743
5744 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5745 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5746 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5747 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5748 ) {
b689362d 5749 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
5750 return 0;
5751 }
5752 return
5753 ( a.low == b.low )
5754 && ( ( a.high == b.high )
5755 || ( ( a.low == 0 )
bb98fe42 5756 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5757 );
5758
5759}
5760
5761/*----------------------------------------------------------------------------
5762| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
5763| or equal to the corresponding value `b', and 0 otherwise. The invalid
5764| exception is raised if either operand is a NaN. The comparison is performed
5765| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5766*----------------------------------------------------------------------------*/
5767
750afe93 5768int float128_le( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5769{
5770 flag aSign, bSign;
5771
5772 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5773 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5774 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5775 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5776 ) {
5777 float_raise( float_flag_invalid STATUS_VAR);
5778 return 0;
5779 }
5780 aSign = extractFloat128Sign( a );
5781 bSign = extractFloat128Sign( b );
5782 if ( aSign != bSign ) {
5783 return
5784 aSign
bb98fe42 5785 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5786 == 0 );
5787 }
5788 return
5789 aSign ? le128( b.high, b.low, a.high, a.low )
5790 : le128( a.high, a.low, b.high, b.low );
5791
5792}
5793
5794/*----------------------------------------------------------------------------
5795| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
5796| the corresponding value `b', and 0 otherwise. The invalid exception is
5797| raised if either operand is a NaN. The comparison is performed according
5798| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5799*----------------------------------------------------------------------------*/
5800
750afe93 5801int float128_lt( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5802{
5803 flag aSign, bSign;
5804
5805 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5806 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5807 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5808 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5809 ) {
5810 float_raise( float_flag_invalid STATUS_VAR);
5811 return 0;
5812 }
5813 aSign = extractFloat128Sign( a );
5814 bSign = extractFloat128Sign( b );
5815 if ( aSign != bSign ) {
5816 return
5817 aSign
bb98fe42 5818 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5819 != 0 );
5820 }
5821 return
5822 aSign ? lt128( b.high, b.low, a.high, a.low )
5823 : lt128( a.high, a.low, b.high, b.low );
5824
5825}
5826
67b7861d
AJ
5827/*----------------------------------------------------------------------------
5828| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
5829| be compared, and 0 otherwise. The invalid exception is raised if either
5830| operand is a NaN. The comparison is performed according to the IEC/IEEE
5831| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
5832*----------------------------------------------------------------------------*/
5833
5834int float128_unordered( float128 a, float128 b STATUS_PARAM )
5835{
5836 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5837 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5838 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5839 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5840 ) {
5841 float_raise( float_flag_invalid STATUS_VAR);
5842 return 1;
5843 }
5844 return 0;
5845}
5846
158142c2
FB
5847/*----------------------------------------------------------------------------
5848| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
5849| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5850| exception. The comparison is performed according to the IEC/IEEE Standard
5851| for Binary Floating-Point Arithmetic.
158142c2
FB
5852*----------------------------------------------------------------------------*/
5853
b689362d 5854int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5855{
5856
5857 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5858 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5859 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5860 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5861 ) {
b689362d
AJ
5862 if ( float128_is_signaling_nan( a )
5863 || float128_is_signaling_nan( b ) ) {
5864 float_raise( float_flag_invalid STATUS_VAR);
5865 }
158142c2
FB
5866 return 0;
5867 }
5868 return
5869 ( a.low == b.low )
5870 && ( ( a.high == b.high )
5871 || ( ( a.low == 0 )
bb98fe42 5872 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5873 );
5874
5875}
5876
5877/*----------------------------------------------------------------------------
5878| Returns 1 if the quadruple-precision floating-point value `a' is less than
5879| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5880| cause an exception. Otherwise, the comparison is performed according to the
5881| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5882*----------------------------------------------------------------------------*/
5883
750afe93 5884int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5885{
5886 flag aSign, bSign;
5887
5888 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5889 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5890 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5891 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5892 ) {
5893 if ( float128_is_signaling_nan( a )
5894 || float128_is_signaling_nan( b ) ) {
5895 float_raise( float_flag_invalid STATUS_VAR);
5896 }
5897 return 0;
5898 }
5899 aSign = extractFloat128Sign( a );
5900 bSign = extractFloat128Sign( b );
5901 if ( aSign != bSign ) {
5902 return
5903 aSign
bb98fe42 5904 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5905 == 0 );
5906 }
5907 return
5908 aSign ? le128( b.high, b.low, a.high, a.low )
5909 : le128( a.high, a.low, b.high, b.low );
5910
5911}
5912
5913/*----------------------------------------------------------------------------
5914| Returns 1 if the quadruple-precision floating-point value `a' is less than
5915| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5916| exception. Otherwise, the comparison is performed according to the IEC/IEEE
5917| Standard for Binary Floating-Point Arithmetic.
5918*----------------------------------------------------------------------------*/
5919
750afe93 5920int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
5921{
5922 flag aSign, bSign;
5923
5924 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5925 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5926 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5927 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5928 ) {
5929 if ( float128_is_signaling_nan( a )
5930 || float128_is_signaling_nan( b ) ) {
5931 float_raise( float_flag_invalid STATUS_VAR);
5932 }
5933 return 0;
5934 }
5935 aSign = extractFloat128Sign( a );
5936 bSign = extractFloat128Sign( b );
5937 if ( aSign != bSign ) {
5938 return
5939 aSign
bb98fe42 5940 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5941 != 0 );
5942 }
5943 return
5944 aSign ? lt128( b.high, b.low, a.high, a.low )
5945 : lt128( a.high, a.low, b.high, b.low );
5946
5947}
5948
67b7861d
AJ
5949/*----------------------------------------------------------------------------
5950| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5951| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5952| comparison is performed according to the IEC/IEEE Standard for Binary
5953| Floating-Point Arithmetic.
5954*----------------------------------------------------------------------------*/
5955
5956int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5957{
5958 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5959 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5960 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5961 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5962 ) {
5963 if ( float128_is_signaling_nan( a )
5964 || float128_is_signaling_nan( b ) ) {
5965 float_raise( float_flag_invalid STATUS_VAR);
5966 }
5967 return 1;
5968 }
5969 return 0;
5970}
5971
1d6bda35 5972/* misc functions */
9f8d2a09 5973float32 uint32_to_float32( uint32 a STATUS_PARAM )
1d6bda35
FB
5974{
5975 return int64_to_float32(a STATUS_VAR);
5976}
5977
9f8d2a09 5978float64 uint32_to_float64( uint32 a STATUS_PARAM )
1d6bda35
FB
5979{
5980 return int64_to_float64(a STATUS_VAR);
5981}
5982
9f8d2a09 5983uint32 float32_to_uint32( float32 a STATUS_PARAM )
1d6bda35
FB
5984{
5985 int64_t v;
9f8d2a09 5986 uint32 res;
1d6bda35
FB
5987
5988 v = float32_to_int64(a STATUS_VAR);
5989 if (v < 0) {
5990 res = 0;
5991 float_raise( float_flag_invalid STATUS_VAR);
5992 } else if (v > 0xffffffff) {
5993 res = 0xffffffff;
5994 float_raise( float_flag_invalid STATUS_VAR);
5995 } else {
5996 res = v;
5997 }
5998 return res;
5999}
6000
9f8d2a09 6001uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
1d6bda35
FB
6002{
6003 int64_t v;
9f8d2a09 6004 uint32 res;
1d6bda35
FB
6005
6006 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6007 if (v < 0) {
6008 res = 0;
6009 float_raise( float_flag_invalid STATUS_VAR);
6010 } else if (v > 0xffffffff) {
6011 res = 0xffffffff;
6012 float_raise( float_flag_invalid STATUS_VAR);
6013 } else {
6014 res = v;
6015 }
6016 return res;
6017}
6018
38641f8f 6019uint16 float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
cbcef455
PM
6020{
6021 int64_t v;
38641f8f 6022 uint16 res;
cbcef455
PM
6023
6024 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6025 if (v < 0) {
6026 res = 0;
6027 float_raise( float_flag_invalid STATUS_VAR);
6028 } else if (v > 0xffff) {
6029 res = 0xffff;
6030 float_raise( float_flag_invalid STATUS_VAR);
6031 } else {
6032 res = v;
6033 }
6034 return res;
6035}
6036
9f8d2a09 6037uint32 float64_to_uint32( float64 a STATUS_PARAM )
1d6bda35
FB
6038{
6039 int64_t v;
9f8d2a09 6040 uint32 res;
1d6bda35
FB
6041
6042 v = float64_to_int64(a STATUS_VAR);
6043 if (v < 0) {
6044 res = 0;
6045 float_raise( float_flag_invalid STATUS_VAR);
6046 } else if (v > 0xffffffff) {
6047 res = 0xffffffff;
6048 float_raise( float_flag_invalid STATUS_VAR);
6049 } else {
6050 res = v;
6051 }
6052 return res;
6053}
6054
9f8d2a09 6055uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
1d6bda35
FB
6056{
6057 int64_t v;
9f8d2a09 6058 uint32 res;
1d6bda35
FB
6059
6060 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6061 if (v < 0) {
6062 res = 0;
6063 float_raise( float_flag_invalid STATUS_VAR);
6064 } else if (v > 0xffffffff) {
6065 res = 0xffffffff;
6066 float_raise( float_flag_invalid STATUS_VAR);
6067 } else {
6068 res = v;
6069 }
6070 return res;
6071}
6072
38641f8f 6073uint16 float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
cbcef455
PM
6074{
6075 int64_t v;
38641f8f 6076 uint16 res;
cbcef455
PM
6077
6078 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6079 if (v < 0) {
6080 res = 0;
6081 float_raise( float_flag_invalid STATUS_VAR);
6082 } else if (v > 0xffff) {
6083 res = 0xffff;
6084 float_raise( float_flag_invalid STATUS_VAR);
6085 } else {
6086 res = v;
6087 }
6088 return res;
6089}
6090
f090c9d4 6091/* FIXME: This looks broken. */
75d62a58
JM
6092uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6093{
6094 int64_t v;
6095
f090c9d4
PB
6096 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6097 v += float64_val(a);
6098 v = float64_to_int64(make_float64(v) STATUS_VAR);
75d62a58
JM
6099
6100 return v - INT64_MIN;
6101}
6102
6103uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6104{
6105 int64_t v;
6106
f090c9d4
PB
6107 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6108 v += float64_val(a);
6109 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
75d62a58
JM
6110
6111 return v - INT64_MIN;
6112}
6113
1d6bda35 6114#define COMPARE(s, nan_exp) \
750afe93 6115INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
1d6bda35
FB
6116 int is_quiet STATUS_PARAM ) \
6117{ \
6118 flag aSign, bSign; \
bb98fe42 6119 uint ## s ## _t av, bv; \
37d18660
PM
6120 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6121 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
1d6bda35
FB
6122 \
6123 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6124 extractFloat ## s ## Frac( a ) ) || \
6125 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6126 extractFloat ## s ## Frac( b ) )) { \
6127 if (!is_quiet || \
6128 float ## s ## _is_signaling_nan( a ) || \
6129 float ## s ## _is_signaling_nan( b ) ) { \
6130 float_raise( float_flag_invalid STATUS_VAR); \
6131 } \
6132 return float_relation_unordered; \
6133 } \
6134 aSign = extractFloat ## s ## Sign( a ); \
6135 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 6136 av = float ## s ## _val(a); \
cd8a2533 6137 bv = float ## s ## _val(b); \
1d6bda35 6138 if ( aSign != bSign ) { \
bb98fe42 6139 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
6140 /* zero case */ \
6141 return float_relation_equal; \
6142 } else { \
6143 return 1 - (2 * aSign); \
6144 } \
6145 } else { \
f090c9d4 6146 if (av == bv) { \
1d6bda35
FB
6147 return float_relation_equal; \
6148 } else { \
f090c9d4 6149 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
6150 } \
6151 } \
6152} \
6153 \
750afe93 6154int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6155{ \
6156 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6157} \
6158 \
750afe93 6159int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6160{ \
6161 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6162}
6163
6164COMPARE(32, 0xff)
6165COMPARE(64, 0x7ff)
9ee6e8bb 6166
f6714d36
AJ
6167INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6168 int is_quiet STATUS_PARAM )
6169{
6170 flag aSign, bSign;
6171
6172 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6173 ( extractFloatx80Frac( a )<<1 ) ) ||
6174 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6175 ( extractFloatx80Frac( b )<<1 ) )) {
6176 if (!is_quiet ||
6177 floatx80_is_signaling_nan( a ) ||
6178 floatx80_is_signaling_nan( b ) ) {
6179 float_raise( float_flag_invalid STATUS_VAR);
6180 }
6181 return float_relation_unordered;
6182 }
6183 aSign = extractFloatx80Sign( a );
6184 bSign = extractFloatx80Sign( b );
6185 if ( aSign != bSign ) {
6186
6187 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6188 ( ( a.low | b.low ) == 0 ) ) {
6189 /* zero case */
6190 return float_relation_equal;
6191 } else {
6192 return 1 - (2 * aSign);
6193 }
6194 } else {
6195 if (a.low == b.low && a.high == b.high) {
6196 return float_relation_equal;
6197 } else {
6198 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6199 }
6200 }
6201}
6202
6203int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6204{
6205 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6206}
6207
6208int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6209{
6210 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6211}
6212
1f587329
BS
6213INLINE int float128_compare_internal( float128 a, float128 b,
6214 int is_quiet STATUS_PARAM )
6215{
6216 flag aSign, bSign;
6217
6218 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6219 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6220 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6221 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6222 if (!is_quiet ||
6223 float128_is_signaling_nan( a ) ||
6224 float128_is_signaling_nan( b ) ) {
6225 float_raise( float_flag_invalid STATUS_VAR);
6226 }
6227 return float_relation_unordered;
6228 }
6229 aSign = extractFloat128Sign( a );
6230 bSign = extractFloat128Sign( b );
6231 if ( aSign != bSign ) {
6232 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6233 /* zero case */
6234 return float_relation_equal;
6235 } else {
6236 return 1 - (2 * aSign);
6237 }
6238 } else {
6239 if (a.low == b.low && a.high == b.high) {
6240 return float_relation_equal;
6241 } else {
6242 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6243 }
6244 }
6245}
6246
6247int float128_compare( float128 a, float128 b STATUS_PARAM )
6248{
6249 return float128_compare_internal(a, b, 0 STATUS_VAR);
6250}
6251
6252int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6253{
6254 return float128_compare_internal(a, b, 1 STATUS_VAR);
6255}
6256
274f1b04
PM
6257/* min() and max() functions. These can't be implemented as
6258 * 'compare and pick one input' because that would mishandle
6259 * NaNs and +0 vs -0.
6260 */
6261#define MINMAX(s, nan_exp) \
6262INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6263 int ismin STATUS_PARAM ) \
6264{ \
6265 flag aSign, bSign; \
6266 uint ## s ## _t av, bv; \
6267 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6268 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6269 if (float ## s ## _is_any_nan(a) || \
6270 float ## s ## _is_any_nan(b)) { \
6271 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6272 } \
6273 aSign = extractFloat ## s ## Sign(a); \
6274 bSign = extractFloat ## s ## Sign(b); \
6275 av = float ## s ## _val(a); \
6276 bv = float ## s ## _val(b); \
6277 if (aSign != bSign) { \
6278 if (ismin) { \
6279 return aSign ? a : b; \
6280 } else { \
6281 return aSign ? b : a; \
6282 } \
6283 } else { \
6284 if (ismin) { \
6285 return (aSign ^ (av < bv)) ? a : b; \
6286 } else { \
6287 return (aSign ^ (av < bv)) ? b : a; \
6288 } \
6289 } \
6290} \
6291 \
6292float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6293{ \
6294 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6295} \
6296 \
6297float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6298{ \
6299 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6300}
6301
6302MINMAX(32, 0xff)
6303MINMAX(64, 0x7ff)
6304
6305
9ee6e8bb
PB
6306/* Multiply A by 2 raised to the power N. */
6307float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6308{
6309 flag aSign;
326b9e98 6310 int16_t aExp;
bb98fe42 6311 uint32_t aSig;
9ee6e8bb 6312
37d18660 6313 a = float32_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
6314 aSig = extractFloat32Frac( a );
6315 aExp = extractFloat32Exp( a );
6316 aSign = extractFloat32Sign( a );
6317
6318 if ( aExp == 0xFF ) {
326b9e98
AJ
6319 if ( aSig ) {
6320 return propagateFloat32NaN( a, a STATUS_VAR );
6321 }
9ee6e8bb
PB
6322 return a;
6323 }
69397542
PB
6324 if ( aExp != 0 )
6325 aSig |= 0x00800000;
6326 else if ( aSig == 0 )
6327 return a;
6328
326b9e98
AJ
6329 if (n > 0x200) {
6330 n = 0x200;
6331 } else if (n < -0x200) {
6332 n = -0x200;
6333 }
6334
69397542
PB
6335 aExp += n - 1;
6336 aSig <<= 7;
6337 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
6338}
6339
6340float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6341{
6342 flag aSign;
326b9e98 6343 int16_t aExp;
bb98fe42 6344 uint64_t aSig;
9ee6e8bb 6345
37d18660 6346 a = float64_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
6347 aSig = extractFloat64Frac( a );
6348 aExp = extractFloat64Exp( a );
6349 aSign = extractFloat64Sign( a );
6350
6351 if ( aExp == 0x7FF ) {
326b9e98
AJ
6352 if ( aSig ) {
6353 return propagateFloat64NaN( a, a STATUS_VAR );
6354 }
9ee6e8bb
PB
6355 return a;
6356 }
69397542
PB
6357 if ( aExp != 0 )
6358 aSig |= LIT64( 0x0010000000000000 );
6359 else if ( aSig == 0 )
6360 return a;
6361
326b9e98
AJ
6362 if (n > 0x1000) {
6363 n = 0x1000;
6364 } else if (n < -0x1000) {
6365 n = -0x1000;
6366 }
6367
69397542
PB
6368 aExp += n - 1;
6369 aSig <<= 10;
6370 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
6371}
6372
9ee6e8bb
PB
6373floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6374{
6375 flag aSign;
326b9e98 6376 int32_t aExp;
bb98fe42 6377 uint64_t aSig;
9ee6e8bb
PB
6378
6379 aSig = extractFloatx80Frac( a );
6380 aExp = extractFloatx80Exp( a );
6381 aSign = extractFloatx80Sign( a );
6382
326b9e98
AJ
6383 if ( aExp == 0x7FFF ) {
6384 if ( aSig<<1 ) {
6385 return propagateFloatx80NaN( a, a STATUS_VAR );
6386 }
9ee6e8bb
PB
6387 return a;
6388 }
326b9e98 6389
69397542
PB
6390 if (aExp == 0 && aSig == 0)
6391 return a;
6392
326b9e98
AJ
6393 if (n > 0x10000) {
6394 n = 0x10000;
6395 } else if (n < -0x10000) {
6396 n = -0x10000;
6397 }
6398
9ee6e8bb 6399 aExp += n;
69397542
PB
6400 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6401 aSign, aExp, aSig, 0 STATUS_VAR );
9ee6e8bb 6402}
9ee6e8bb 6403
9ee6e8bb
PB
6404float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6405{
6406 flag aSign;
326b9e98 6407 int32_t aExp;
bb98fe42 6408 uint64_t aSig0, aSig1;
9ee6e8bb
PB
6409
6410 aSig1 = extractFloat128Frac1( a );
6411 aSig0 = extractFloat128Frac0( a );
6412 aExp = extractFloat128Exp( a );
6413 aSign = extractFloat128Sign( a );
6414 if ( aExp == 0x7FFF ) {
326b9e98
AJ
6415 if ( aSig0 | aSig1 ) {
6416 return propagateFloat128NaN( a, a STATUS_VAR );
6417 }
9ee6e8bb
PB
6418 return a;
6419 }
69397542
PB
6420 if ( aExp != 0 )
6421 aSig0 |= LIT64( 0x0001000000000000 );
6422 else if ( aSig0 == 0 && aSig1 == 0 )
6423 return a;
6424
326b9e98
AJ
6425 if (n > 0x10000) {
6426 n = 0x10000;
6427 } else if (n < -0x10000) {
6428 n = -0x10000;
6429 }
6430
69397542
PB
6431 aExp += n - 1;
6432 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6433 STATUS_VAR );
9ee6e8bb
PB
6434
6435}