]> git.proxmox.com Git - qemu.git/blame - fpu/softfloat.c
usb-host: add usb_host_full_speed_compat
[qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
158142c2
FB
6
7/*============================================================================
8
9This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10Package, Release 2b.
11
12Written by John R. Hauser. This work was made possible in part by the
13International Computer Science Institute, located at Suite 600, 1947 Center
14Street, Berkeley, California 94704. Funding was partially provided by the
15National Science Foundation under grant MIP-9311980. The original version
16of this code was written as part of a project to build a fixed-point vector
17processor in collaboration with the University of California at Berkeley,
18overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20arithmetic/SoftFloat.html'.
21
22THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31Derivative works are acceptable, even for commercial purposes, so long as
32(1) the source code for the derivative work includes prominent notice that
33the work is derivative, and (2) the source code includes prominent notice with
34these four paragraphs for those parts of this code that are retained.
35
36=============================================================================*/
37
2ac8bd03
PM
38/* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
40 */
41#include "config.h"
42
6b4c305c 43#include "fpu/softfloat.h"
158142c2
FB
44
45/*----------------------------------------------------------------------------
46| Primitive arithmetic functions, including multi-word arithmetic, and
47| division and square root approximations. (Can be specialized to target if
48| desired.)
49*----------------------------------------------------------------------------*/
50#include "softfloat-macros.h"
51
52/*----------------------------------------------------------------------------
53| Functions and definitions to determine: (1) whether tininess for underflow
54| is detected before or after rounding by default, (2) what (if anything)
55| happens when exceptions are raised, (3) how signaling NaNs are distinguished
56| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57| are propagated from function inputs to output. These details are target-
58| specific.
59*----------------------------------------------------------------------------*/
60#include "softfloat-specialize.h"
61
62void set_float_rounding_mode(int val STATUS_PARAM)
63{
64 STATUS(float_rounding_mode) = val;
65}
66
1d6bda35
FB
67void set_float_exception_flags(int val STATUS_PARAM)
68{
69 STATUS(float_exception_flags) = val;
70}
71
158142c2
FB
72void set_floatx80_rounding_precision(int val STATUS_PARAM)
73{
74 STATUS(floatx80_rounding_precision) = val;
75}
158142c2 76
bb4d4bb3
PM
77/*----------------------------------------------------------------------------
78| Returns the fraction bits of the half-precision floating-point value `a'.
79*----------------------------------------------------------------------------*/
80
81INLINE uint32_t extractFloat16Frac(float16 a)
82{
83 return float16_val(a) & 0x3ff;
84}
85
86/*----------------------------------------------------------------------------
87| Returns the exponent bits of the half-precision floating-point value `a'.
88*----------------------------------------------------------------------------*/
89
94a49d86 90INLINE int_fast16_t extractFloat16Exp(float16 a)
bb4d4bb3
PM
91{
92 return (float16_val(a) >> 10) & 0x1f;
93}
94
95/*----------------------------------------------------------------------------
96| Returns the sign bit of the half-precision floating-point value `a'.
97*----------------------------------------------------------------------------*/
98
99INLINE flag extractFloat16Sign(float16 a)
100{
101 return float16_val(a)>>15;
102}
103
158142c2
FB
104/*----------------------------------------------------------------------------
105| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106| and 7, and returns the properly rounded 32-bit integer corresponding to the
107| input. If `zSign' is 1, the input is negated before being converted to an
108| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
109| is simply rounded to an integer, with the inexact exception raised if the
110| input cannot be represented exactly as an integer. However, if the fixed-
111| point input is too large, the invalid exception is raised and the largest
112| positive or negative integer is returned.
113*----------------------------------------------------------------------------*/
114
bb98fe42 115static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
158142c2
FB
116{
117 int8 roundingMode;
118 flag roundNearestEven;
119 int8 roundIncrement, roundBits;
760e1416 120 int32_t z;
158142c2
FB
121
122 roundingMode = STATUS(float_rounding_mode);
123 roundNearestEven = ( roundingMode == float_round_nearest_even );
124 roundIncrement = 0x40;
125 if ( ! roundNearestEven ) {
126 if ( roundingMode == float_round_to_zero ) {
127 roundIncrement = 0;
128 }
129 else {
130 roundIncrement = 0x7F;
131 if ( zSign ) {
132 if ( roundingMode == float_round_up ) roundIncrement = 0;
133 }
134 else {
135 if ( roundingMode == float_round_down ) roundIncrement = 0;
136 }
137 }
138 }
139 roundBits = absZ & 0x7F;
140 absZ = ( absZ + roundIncrement )>>7;
141 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
142 z = absZ;
143 if ( zSign ) z = - z;
144 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 146 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
147 }
148 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
149 return z;
150
151}
152
153/*----------------------------------------------------------------------------
154| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155| `absZ1', with binary point between bits 63 and 64 (between the input words),
156| and returns the properly rounded 64-bit integer corresponding to the input.
157| If `zSign' is 1, the input is negated before being converted to an integer.
158| Ordinarily, the fixed-point input is simply rounded to an integer, with
159| the inexact exception raised if the input cannot be represented exactly as
160| an integer. However, if the fixed-point input is too large, the invalid
161| exception is raised and the largest positive or negative integer is
162| returned.
163*----------------------------------------------------------------------------*/
164
bb98fe42 165static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
158142c2
FB
166{
167 int8 roundingMode;
168 flag roundNearestEven, increment;
760e1416 169 int64_t z;
158142c2
FB
170
171 roundingMode = STATUS(float_rounding_mode);
172 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 173 increment = ( (int64_t) absZ1 < 0 );
158142c2
FB
174 if ( ! roundNearestEven ) {
175 if ( roundingMode == float_round_to_zero ) {
176 increment = 0;
177 }
178 else {
179 if ( zSign ) {
180 increment = ( roundingMode == float_round_down ) && absZ1;
181 }
182 else {
183 increment = ( roundingMode == float_round_up ) && absZ1;
184 }
185 }
186 }
187 if ( increment ) {
188 ++absZ0;
189 if ( absZ0 == 0 ) goto overflow;
bb98fe42 190 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
191 }
192 z = absZ0;
193 if ( zSign ) z = - z;
194 if ( z && ( ( z < 0 ) ^ zSign ) ) {
195 overflow:
196 float_raise( float_flag_invalid STATUS_VAR);
197 return
bb98fe42 198 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
199 : LIT64( 0x7FFFFFFFFFFFFFFF );
200 }
201 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
202 return z;
203
204}
205
206/*----------------------------------------------------------------------------
207| Returns the fraction bits of the single-precision floating-point value `a'.
208*----------------------------------------------------------------------------*/
209
bb98fe42 210INLINE uint32_t extractFloat32Frac( float32 a )
158142c2
FB
211{
212
f090c9d4 213 return float32_val(a) & 0x007FFFFF;
158142c2
FB
214
215}
216
217/*----------------------------------------------------------------------------
218| Returns the exponent bits of the single-precision floating-point value `a'.
219*----------------------------------------------------------------------------*/
220
94a49d86 221INLINE int_fast16_t extractFloat32Exp(float32 a)
158142c2
FB
222{
223
f090c9d4 224 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
225
226}
227
228/*----------------------------------------------------------------------------
229| Returns the sign bit of the single-precision floating-point value `a'.
230*----------------------------------------------------------------------------*/
231
232INLINE flag extractFloat32Sign( float32 a )
233{
234
f090c9d4 235 return float32_val(a)>>31;
158142c2
FB
236
237}
238
37d18660
PM
239/*----------------------------------------------------------------------------
240| If `a' is denormal and we are in flush-to-zero mode then set the
241| input-denormal exception and return zero. Otherwise just return the value.
242*----------------------------------------------------------------------------*/
243static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
244{
245 if (STATUS(flush_inputs_to_zero)) {
246 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
247 float_raise(float_flag_input_denormal STATUS_VAR);
248 return make_float32(float32_val(a) & 0x80000000);
249 }
250 }
251 return a;
252}
253
158142c2
FB
254/*----------------------------------------------------------------------------
255| Normalizes the subnormal single-precision floating-point value represented
256| by the denormalized significand `aSig'. The normalized exponent and
257| significand are stored at the locations pointed to by `zExpPtr' and
258| `zSigPtr', respectively.
259*----------------------------------------------------------------------------*/
260
261static void
94a49d86 262 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
158142c2
FB
263{
264 int8 shiftCount;
265
266 shiftCount = countLeadingZeros32( aSig ) - 8;
267 *zSigPtr = aSig<<shiftCount;
268 *zExpPtr = 1 - shiftCount;
269
270}
271
272/*----------------------------------------------------------------------------
273| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274| single-precision floating-point value, returning the result. After being
275| shifted into the proper positions, the three fields are simply added
276| together to form the result. This means that any integer portion of `zSig'
277| will be added into the exponent. Since a properly normalized significand
278| will have an integer portion equal to 1, the `zExp' input should be 1 less
279| than the desired result exponent whenever `zSig' is a complete, normalized
280| significand.
281*----------------------------------------------------------------------------*/
282
94a49d86 283INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
158142c2
FB
284{
285
f090c9d4 286 return make_float32(
bb98fe42 287 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
288
289}
290
291/*----------------------------------------------------------------------------
292| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293| and significand `zSig', and returns the proper single-precision floating-
294| point value corresponding to the abstract input. Ordinarily, the abstract
295| value is simply rounded and packed into the single-precision format, with
296| the inexact exception raised if the abstract input cannot be represented
297| exactly. However, if the abstract value is too large, the overflow and
298| inexact exceptions are raised and an infinity or maximal finite value is
299| returned. If the abstract value is too small, the input value is rounded to
300| a subnormal number, and the underflow and inexact exceptions are raised if
301| the abstract input cannot be represented exactly as a subnormal single-
302| precision floating-point number.
303| The input significand `zSig' has its binary point between bits 30
304| and 29, which is 7 bits to the left of the usual location. This shifted
305| significand must be normalized or smaller. If `zSig' is not normalized,
306| `zExp' must be 0; in that case, the result returned is a subnormal number,
307| and it must not require rounding. In the usual case that `zSig' is
308| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309| The handling of underflow and overflow follows the IEC/IEEE Standard for
310| Binary Floating-Point Arithmetic.
311*----------------------------------------------------------------------------*/
312
94a49d86 313static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
314{
315 int8 roundingMode;
316 flag roundNearestEven;
317 int8 roundIncrement, roundBits;
318 flag isTiny;
319
320 roundingMode = STATUS(float_rounding_mode);
321 roundNearestEven = ( roundingMode == float_round_nearest_even );
322 roundIncrement = 0x40;
323 if ( ! roundNearestEven ) {
324 if ( roundingMode == float_round_to_zero ) {
325 roundIncrement = 0;
326 }
327 else {
328 roundIncrement = 0x7F;
329 if ( zSign ) {
330 if ( roundingMode == float_round_up ) roundIncrement = 0;
331 }
332 else {
333 if ( roundingMode == float_round_down ) roundIncrement = 0;
334 }
335 }
336 }
337 roundBits = zSig & 0x7F;
bb98fe42 338 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
339 if ( ( 0xFD < zExp )
340 || ( ( zExp == 0xFD )
bb98fe42 341 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
342 ) {
343 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 344 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
345 }
346 if ( zExp < 0 ) {
e6afc87f
PM
347 if (STATUS(flush_to_zero)) {
348 float_raise(float_flag_output_denormal STATUS_VAR);
349 return packFloat32(zSign, 0, 0);
350 }
158142c2
FB
351 isTiny =
352 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
353 || ( zExp < -1 )
354 || ( zSig + roundIncrement < 0x80000000 );
355 shift32RightJamming( zSig, - zExp, &zSig );
356 zExp = 0;
357 roundBits = zSig & 0x7F;
358 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
359 }
360 }
361 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
362 zSig = ( zSig + roundIncrement )>>7;
363 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
364 if ( zSig == 0 ) zExp = 0;
365 return packFloat32( zSign, zExp, zSig );
366
367}
368
369/*----------------------------------------------------------------------------
370| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371| and significand `zSig', and returns the proper single-precision floating-
372| point value corresponding to the abstract input. This routine is just like
373| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
374| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
375| floating-point exponent.
376*----------------------------------------------------------------------------*/
377
378static float32
94a49d86 379 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
380{
381 int8 shiftCount;
382
383 shiftCount = countLeadingZeros32( zSig ) - 1;
384 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
385
386}
387
388/*----------------------------------------------------------------------------
389| Returns the fraction bits of the double-precision floating-point value `a'.
390*----------------------------------------------------------------------------*/
391
bb98fe42 392INLINE uint64_t extractFloat64Frac( float64 a )
158142c2
FB
393{
394
f090c9d4 395 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
396
397}
398
399/*----------------------------------------------------------------------------
400| Returns the exponent bits of the double-precision floating-point value `a'.
401*----------------------------------------------------------------------------*/
402
94a49d86 403INLINE int_fast16_t extractFloat64Exp(float64 a)
158142c2
FB
404{
405
f090c9d4 406 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
407
408}
409
410/*----------------------------------------------------------------------------
411| Returns the sign bit of the double-precision floating-point value `a'.
412*----------------------------------------------------------------------------*/
413
414INLINE flag extractFloat64Sign( float64 a )
415{
416
f090c9d4 417 return float64_val(a)>>63;
158142c2
FB
418
419}
420
37d18660
PM
421/*----------------------------------------------------------------------------
422| If `a' is denormal and we are in flush-to-zero mode then set the
423| input-denormal exception and return zero. Otherwise just return the value.
424*----------------------------------------------------------------------------*/
425static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
426{
427 if (STATUS(flush_inputs_to_zero)) {
428 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
429 float_raise(float_flag_input_denormal STATUS_VAR);
430 return make_float64(float64_val(a) & (1ULL << 63));
431 }
432 }
433 return a;
434}
435
158142c2
FB
436/*----------------------------------------------------------------------------
437| Normalizes the subnormal double-precision floating-point value represented
438| by the denormalized significand `aSig'. The normalized exponent and
439| significand are stored at the locations pointed to by `zExpPtr' and
440| `zSigPtr', respectively.
441*----------------------------------------------------------------------------*/
442
443static void
94a49d86 444 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
158142c2
FB
445{
446 int8 shiftCount;
447
448 shiftCount = countLeadingZeros64( aSig ) - 11;
449 *zSigPtr = aSig<<shiftCount;
450 *zExpPtr = 1 - shiftCount;
451
452}
453
454/*----------------------------------------------------------------------------
455| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
456| double-precision floating-point value, returning the result. After being
457| shifted into the proper positions, the three fields are simply added
458| together to form the result. This means that any integer portion of `zSig'
459| will be added into the exponent. Since a properly normalized significand
460| will have an integer portion equal to 1, the `zExp' input should be 1 less
461| than the desired result exponent whenever `zSig' is a complete, normalized
462| significand.
463*----------------------------------------------------------------------------*/
464
94a49d86 465INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
158142c2
FB
466{
467
f090c9d4 468 return make_float64(
bb98fe42 469 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
470
471}
472
473/*----------------------------------------------------------------------------
474| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
475| and significand `zSig', and returns the proper double-precision floating-
476| point value corresponding to the abstract input. Ordinarily, the abstract
477| value is simply rounded and packed into the double-precision format, with
478| the inexact exception raised if the abstract input cannot be represented
479| exactly. However, if the abstract value is too large, the overflow and
480| inexact exceptions are raised and an infinity or maximal finite value is
481| returned. If the abstract value is too small, the input value is rounded
482| to a subnormal number, and the underflow and inexact exceptions are raised
483| if the abstract input cannot be represented exactly as a subnormal double-
484| precision floating-point number.
485| The input significand `zSig' has its binary point between bits 62
486| and 61, which is 10 bits to the left of the usual location. This shifted
487| significand must be normalized or smaller. If `zSig' is not normalized,
488| `zExp' must be 0; in that case, the result returned is a subnormal number,
489| and it must not require rounding. In the usual case that `zSig' is
490| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
491| The handling of underflow and overflow follows the IEC/IEEE Standard for
492| Binary Floating-Point Arithmetic.
493*----------------------------------------------------------------------------*/
494
94a49d86 495static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
496{
497 int8 roundingMode;
498 flag roundNearestEven;
94a49d86 499 int_fast16_t roundIncrement, roundBits;
158142c2
FB
500 flag isTiny;
501
502 roundingMode = STATUS(float_rounding_mode);
503 roundNearestEven = ( roundingMode == float_round_nearest_even );
504 roundIncrement = 0x200;
505 if ( ! roundNearestEven ) {
506 if ( roundingMode == float_round_to_zero ) {
507 roundIncrement = 0;
508 }
509 else {
510 roundIncrement = 0x3FF;
511 if ( zSign ) {
512 if ( roundingMode == float_round_up ) roundIncrement = 0;
513 }
514 else {
515 if ( roundingMode == float_round_down ) roundIncrement = 0;
516 }
517 }
518 }
519 roundBits = zSig & 0x3FF;
bb98fe42 520 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
521 if ( ( 0x7FD < zExp )
522 || ( ( zExp == 0x7FD )
bb98fe42 523 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
524 ) {
525 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 526 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
527 }
528 if ( zExp < 0 ) {
e6afc87f
PM
529 if (STATUS(flush_to_zero)) {
530 float_raise(float_flag_output_denormal STATUS_VAR);
531 return packFloat64(zSign, 0, 0);
532 }
158142c2
FB
533 isTiny =
534 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
535 || ( zExp < -1 )
536 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
537 shift64RightJamming( zSig, - zExp, &zSig );
538 zExp = 0;
539 roundBits = zSig & 0x3FF;
540 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
541 }
542 }
543 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
544 zSig = ( zSig + roundIncrement )>>10;
545 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
546 if ( zSig == 0 ) zExp = 0;
547 return packFloat64( zSign, zExp, zSig );
548
549}
550
551/*----------------------------------------------------------------------------
552| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
553| and significand `zSig', and returns the proper double-precision floating-
554| point value corresponding to the abstract input. This routine is just like
555| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
556| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
557| floating-point exponent.
558*----------------------------------------------------------------------------*/
559
560static float64
94a49d86 561 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
562{
563 int8 shiftCount;
564
565 shiftCount = countLeadingZeros64( zSig ) - 1;
566 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
567
568}
569
158142c2
FB
570/*----------------------------------------------------------------------------
571| Returns the fraction bits of the extended double-precision floating-point
572| value `a'.
573*----------------------------------------------------------------------------*/
574
bb98fe42 575INLINE uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
576{
577
578 return a.low;
579
580}
581
582/*----------------------------------------------------------------------------
583| Returns the exponent bits of the extended double-precision floating-point
584| value `a'.
585*----------------------------------------------------------------------------*/
586
587INLINE int32 extractFloatx80Exp( floatx80 a )
588{
589
590 return a.high & 0x7FFF;
591
592}
593
594/*----------------------------------------------------------------------------
595| Returns the sign bit of the extended double-precision floating-point value
596| `a'.
597*----------------------------------------------------------------------------*/
598
599INLINE flag extractFloatx80Sign( floatx80 a )
600{
601
602 return a.high>>15;
603
604}
605
606/*----------------------------------------------------------------------------
607| Normalizes the subnormal extended double-precision floating-point value
608| represented by the denormalized significand `aSig'. The normalized exponent
609| and significand are stored at the locations pointed to by `zExpPtr' and
610| `zSigPtr', respectively.
611*----------------------------------------------------------------------------*/
612
613static void
bb98fe42 614 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
615{
616 int8 shiftCount;
617
618 shiftCount = countLeadingZeros64( aSig );
619 *zSigPtr = aSig<<shiftCount;
620 *zExpPtr = 1 - shiftCount;
621
622}
623
624/*----------------------------------------------------------------------------
625| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
626| extended double-precision floating-point value, returning the result.
627*----------------------------------------------------------------------------*/
628
bb98fe42 629INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
158142c2
FB
630{
631 floatx80 z;
632
633 z.low = zSig;
bb98fe42 634 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
635 return z;
636
637}
638
639/*----------------------------------------------------------------------------
640| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
641| and extended significand formed by the concatenation of `zSig0' and `zSig1',
642| and returns the proper extended double-precision floating-point value
643| corresponding to the abstract input. Ordinarily, the abstract value is
644| rounded and packed into the extended double-precision format, with the
645| inexact exception raised if the abstract input cannot be represented
646| exactly. However, if the abstract value is too large, the overflow and
647| inexact exceptions are raised and an infinity or maximal finite value is
648| returned. If the abstract value is too small, the input value is rounded to
649| a subnormal number, and the underflow and inexact exceptions are raised if
650| the abstract input cannot be represented exactly as a subnormal extended
651| double-precision floating-point number.
652| If `roundingPrecision' is 32 or 64, the result is rounded to the same
653| number of bits as single or double precision, respectively. Otherwise, the
654| result is rounded to the full precision of the extended double-precision
655| format.
656| The input significand must be normalized or smaller. If the input
657| significand is not normalized, `zExp' must be 0; in that case, the result
658| returned is a subnormal number, and it must not require rounding. The
659| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
660| Floating-Point Arithmetic.
661*----------------------------------------------------------------------------*/
662
663static floatx80
664 roundAndPackFloatx80(
bb98fe42 665 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
666 STATUS_PARAM)
667{
/* Rounds the 128-bit significand zSig0:zSig1 and packs it with zSign and
 * zExp into a floatx80.  There are two paths: a reduced-precision path for
 * roundingPrecision 32 or 64, which rounds inside zSig0 using a mask, and
 * the `precision80' path, which rounds zSig0 based on zSig1.  Both share
 * the `overflow' label inside the precision80 path. */
668 int8 roundingMode;
669 flag roundNearestEven, increment, isTiny;
670 int64 roundIncrement, roundMask, roundBits;
671
672 roundingMode = STATUS(float_rounding_mode);
673 roundNearestEven = ( roundingMode == float_round_nearest_even );
/* Any roundingPrecision other than 32 or 64 (including 80) takes the
 * full-precision path. */
674 if ( roundingPrecision == 80 ) goto precision80;
/* roundMask selects the low bits of zSig0 that are discarded;
 * roundIncrement starts out as half of ( roundMask + 1 ). */
675 if ( roundingPrecision == 64 ) {
676 roundIncrement = LIT64( 0x0000000000000400 );
677 roundMask = LIT64( 0x00000000000007FF );
678 }
679 else if ( roundingPrecision == 32 ) {
680 roundIncrement = LIT64( 0x0000008000000000 );
681 roundMask = LIT64( 0x000000FFFFFFFFFF );
682 }
683 else {
684 goto precision80;
685 }
/* Fold any nonzero zSig1 into bit 0 of zSig0 as a sticky bit. */
686 zSig0 |= ( zSig1 != 0 );
/* Directed modes: add nothing when rounding toward zero for this sign,
 * otherwise add the full mask so any discarded bit carries upward. */
687 if ( ! roundNearestEven ) {
688 if ( roundingMode == float_round_to_zero ) {
689 roundIncrement = 0;
690 }
691 else {
692 roundIncrement = roundMask;
693 if ( zSign ) {
694 if ( roundingMode == float_round_up ) roundIncrement = 0;
695 }
696 else {
697 if ( roundingMode == float_round_down ) roundIncrement = 0;
698 }
699 }
700 }
701 roundBits = zSig0 & roundMask;
/* Unsigned compare: true when zExp <= 0 or zExp >= 0x7FFE. */
bb98fe42 702 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
/* Overflow if the exponent is already too large, or if it is at the
 * maximum and adding the increment wraps zSig0. */
703 if ( ( 0x7FFE < zExp )
704 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
705 ) {
706 goto overflow;
707 }
708 if ( zExp <= 0 ) {
e6afc87f
PM
/* In flush-to-zero mode, return a signed zero and raise
 * float_flag_output_denormal instead of producing a subnormal. */
709 if (STATUS(flush_to_zero)) {
710 float_raise(float_flag_output_denormal STATUS_VAR);
711 return packFloatx80(zSign, 0, 0);
712 }
158142c2
FB
713 isTiny =
714 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
715 || ( zExp < 0 )
716 || ( zSig0 <= zSig0 + roundIncrement );
/* Denormalize; bits shifted out are kept as a sticky bit. */
717 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
718 zExp = 0;
719 roundBits = zSig0 & roundMask;
720 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
721 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
722 zSig0 += roundIncrement;
/* Rounding set bit 63 of the significand: use exponent 1. */
bb98fe42 723 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
/* On an exact tie in nearest-even mode, widen the mask by one bit so the
 * lowest kept bit is cleared as well. */
724 roundIncrement = roundMask + 1;
725 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
726 roundMask |= roundIncrement;
727 }
728 zSig0 &= ~ roundMask;
729 return packFloatx80( zSign, zExp, zSig0 );
730 }
731 }
732 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
733 zSig0 += roundIncrement;
/* Unsigned wrap-around: the addition carried out of bit 63, so bump the
 * exponent and reset the significand to just its top bit. */
734 if ( zSig0 < roundIncrement ) {
735 ++zExp;
736 zSig0 = LIT64( 0x8000000000000000 );
737 }
/* Same tie handling as above: widen the mask by one bit on an exact tie. */
738 roundIncrement = roundMask + 1;
739 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
740 roundMask |= roundIncrement;
741 }
742 zSig0 &= ~ roundMask;
743 if ( zSig0 == 0 ) zExp = 0;
744 return packFloatx80( zSign, zExp, zSig0 );
745 precision80:
/* Full-precision path.  In nearest-even mode, increment zSig0 when the
 * top bit of zSig1 is set; directed modes increment on any nonzero zSig1
 * when rounding away from zero for this sign. */
bb98fe42 746 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
747 if ( ! roundNearestEven ) {
748 if ( roundingMode == float_round_to_zero ) {
749 increment = 0;
750 }
751 else {
752 if ( zSign ) {
753 increment = ( roundingMode == float_round_down ) && zSig1;
754 }
755 else {
756 increment = ( roundingMode == float_round_up ) && zSig1;
757 }
758 }
759 }
/* Unsigned compare: true when zExp <= 0 or zExp >= 0x7FFE. */
bb98fe42 760 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
761 if ( ( 0x7FFE < zExp )
762 || ( ( zExp == 0x7FFE )
763 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
764 && increment
765 )
766 ) {
/* With roundMask cleared, ~ roundMask below is an all-ones significand. */
767 roundMask = 0;
/* Also reached by goto from the reduced-precision path, where roundMask
 * still holds that precision's mask. */
768 overflow:
769 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
/* Modes that round toward zero for this sign return a finite value with
 * exponent 0x7FFE; the others return exponent 0x7FFF with only the top
 * significand bit set. */
770 if ( ( roundingMode == float_round_to_zero )
771 || ( zSign && ( roundingMode == float_round_up ) )
772 || ( ! zSign && ( roundingMode == float_round_down ) )
773 ) {
774 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
775 }
776 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
777 }
/* NOTE(review): unlike the reduced-precision path, this branch does not
 * check STATUS(flush_to_zero) -- confirm whether that is intentional. */
778 if ( zExp <= 0 ) {
779 isTiny =
780 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
781 || ( zExp < 0 )
782 || ! increment
783 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
/* Denormalize the 128-bit value; the increment decision is recomputed
 * below from the shifted zSig1. */
784 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
785 zExp = 0;
786 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
787 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
788 if ( roundNearestEven ) {
bb98fe42 789 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
790 }
791 else {
792 if ( zSign ) {
793 increment = ( roundingMode == float_round_down ) && zSig1;
794 }
795 else {
796 increment = ( roundingMode == float_round_up ) && zSig1;
797 }
798 }
799 if ( increment ) {
800 ++zSig0;
/* Exact tie (zSig1 is only its top bit) in nearest-even mode: clear bit 0
 * of zSig0 so the result is even. */
801 zSig0 &=
bb98fe42
AF
802 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
803 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
804 }
805 return packFloatx80( zSign, zExp, zSig0 );
806 }
807 }
808 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
809 if ( increment ) {
810 ++zSig0;
/* Wrapped to zero: carry out of bit 63, so bump the exponent and reset
 * the significand to just its top bit. */
811 if ( zSig0 == 0 ) {
812 ++zExp;
813 zSig0 = LIT64( 0x8000000000000000 );
814 }
815 else {
/* Exact tie in nearest-even mode: clear bit 0 so the result is even. */
bb98fe42 816 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
817 }
818 }
819 else {
820 if ( zSig0 == 0 ) zExp = 0;
821 }
822 return packFloatx80( zSign, zExp, zSig0 );
823
824}
825
826/*----------------------------------------------------------------------------
827| Takes an abstract floating-point value having sign `zSign', exponent
828| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
829| and returns the proper extended double-precision floating-point value
830| corresponding to the abstract input. This routine is just like
831| `roundAndPackFloatx80' except that the input significand does not have to be
832| normalized.
833*----------------------------------------------------------------------------*/
834
835static floatx80
836 normalizeRoundAndPackFloatx80(
bb98fe42 837 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
838 STATUS_PARAM)
839{
840 int8 shiftCount;
841
842 if ( zSig0 == 0 ) {
843 zSig0 = zSig1;
844 zSig1 = 0;
845 zExp -= 64;
846 }
847 shiftCount = countLeadingZeros64( zSig0 );
848 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
849 zExp -= shiftCount;
850 return
851 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
852
853}
854
158142c2
FB
855/*----------------------------------------------------------------------------
856| Returns the least-significant 64 fraction bits of the quadruple-precision
857| floating-point value `a'.
858*----------------------------------------------------------------------------*/
859
bb98fe42 860INLINE uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
861{
862
863 return a.low;
864
865}
866
867/*----------------------------------------------------------------------------
868| Returns the most-significant 48 fraction bits of the quadruple-precision
869| floating-point value `a'.
870*----------------------------------------------------------------------------*/
871
bb98fe42 872INLINE uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
873{
874
875 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
876
877}
878
879/*----------------------------------------------------------------------------
880| Returns the exponent bits of the quadruple-precision floating-point value
881| `a'.
882*----------------------------------------------------------------------------*/
883
884INLINE int32 extractFloat128Exp( float128 a )
885{
886
887 return ( a.high>>48 ) & 0x7FFF;
888
889}
890
891/*----------------------------------------------------------------------------
892| Returns the sign bit of the quadruple-precision floating-point value `a'.
893*----------------------------------------------------------------------------*/
894
895INLINE flag extractFloat128Sign( float128 a )
896{
897
898 return a.high>>63;
899
900}
901
902/*----------------------------------------------------------------------------
903| Normalizes the subnormal quadruple-precision floating-point value
904| represented by the denormalized significand formed by the concatenation of
905| `aSig0' and `aSig1'. The normalized exponent is stored at the location
906| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
907| significand are stored at the location pointed to by `zSig0Ptr', and the
908| least significant 64 bits of the normalized significand are stored at the
909| location pointed to by `zSig1Ptr'.
910*----------------------------------------------------------------------------*/
911
912static void
913 normalizeFloat128Subnormal(
bb98fe42
AF
914 uint64_t aSig0,
915 uint64_t aSig1,
158142c2 916 int32 *zExpPtr,
bb98fe42
AF
917 uint64_t *zSig0Ptr,
918 uint64_t *zSig1Ptr
158142c2
FB
919 )
920{
921 int8 shiftCount;
922
923 if ( aSig0 == 0 ) {
924 shiftCount = countLeadingZeros64( aSig1 ) - 15;
925 if ( shiftCount < 0 ) {
926 *zSig0Ptr = aSig1>>( - shiftCount );
927 *zSig1Ptr = aSig1<<( shiftCount & 63 );
928 }
929 else {
930 *zSig0Ptr = aSig1<<shiftCount;
931 *zSig1Ptr = 0;
932 }
933 *zExpPtr = - shiftCount - 63;
934 }
935 else {
936 shiftCount = countLeadingZeros64( aSig0 ) - 15;
937 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
938 *zExpPtr = 1 - shiftCount;
939 }
940
941}
942
943/*----------------------------------------------------------------------------
944| Packs the sign `zSign', the exponent `zExp', and the significand formed
945| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
946| floating-point value, returning the result. After being shifted into the
947| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
948| added together to form the most significant 32 bits of the result. This
949| means that any integer portion of `zSig0' will be added into the exponent.
950| Since a properly normalized significand will have an integer portion equal
951| to 1, the `zExp' input should be 1 less than the desired result exponent
952| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
953| significand.
954*----------------------------------------------------------------------------*/
955
956INLINE float128
bb98fe42 957 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
958{
959 float128 z;
960
961 z.low = zSig1;
bb98fe42 962 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
963 return z;
964
965}
966
967/*----------------------------------------------------------------------------
968| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
969| and extended significand formed by the concatenation of `zSig0', `zSig1',
970| and `zSig2', and returns the proper quadruple-precision floating-point value
971| corresponding to the abstract input. Ordinarily, the abstract value is
972| simply rounded and packed into the quadruple-precision format, with the
973| inexact exception raised if the abstract input cannot be represented
974| exactly. However, if the abstract value is too large, the overflow and
975| inexact exceptions are raised and an infinity or maximal finite value is
976| returned. If the abstract value is too small, the input value is rounded to
977| a subnormal number, and the underflow and inexact exceptions are raised if
978| the abstract input cannot be represented exactly as a subnormal quadruple-
979| precision floating-point number.
980| The input significand must be normalized or smaller. If the input
981| significand is not normalized, `zExp' must be 0; in that case, the result
982| returned is a subnormal number, and it must not require rounding. In the
983| usual case that the input significand is normalized, `zExp' must be 1 less
984| than the ``true'' floating-point exponent. The handling of underflow and
985| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
986*----------------------------------------------------------------------------*/
987
988static float128
989 roundAndPackFloat128(
bb98fe42 990 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
158142c2
FB
991{
992 int8 roundingMode;
993 flag roundNearestEven, increment, isTiny;
994
995 roundingMode = STATUS(float_rounding_mode);
996 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 997 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
998 if ( ! roundNearestEven ) {
999 if ( roundingMode == float_round_to_zero ) {
1000 increment = 0;
1001 }
1002 else {
1003 if ( zSign ) {
1004 increment = ( roundingMode == float_round_down ) && zSig2;
1005 }
1006 else {
1007 increment = ( roundingMode == float_round_up ) && zSig2;
1008 }
1009 }
1010 }
bb98fe42 1011 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1012 if ( ( 0x7FFD < zExp )
1013 || ( ( zExp == 0x7FFD )
1014 && eq128(
1015 LIT64( 0x0001FFFFFFFFFFFF ),
1016 LIT64( 0xFFFFFFFFFFFFFFFF ),
1017 zSig0,
1018 zSig1
1019 )
1020 && increment
1021 )
1022 ) {
1023 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024 if ( ( roundingMode == float_round_to_zero )
1025 || ( zSign && ( roundingMode == float_round_up ) )
1026 || ( ! zSign && ( roundingMode == float_round_down ) )
1027 ) {
1028 return
1029 packFloat128(
1030 zSign,
1031 0x7FFE,
1032 LIT64( 0x0000FFFFFFFFFFFF ),
1033 LIT64( 0xFFFFFFFFFFFFFFFF )
1034 );
1035 }
1036 return packFloat128( zSign, 0x7FFF, 0, 0 );
1037 }
1038 if ( zExp < 0 ) {
e6afc87f
PM
1039 if (STATUS(flush_to_zero)) {
1040 float_raise(float_flag_output_denormal STATUS_VAR);
1041 return packFloat128(zSign, 0, 0, 0);
1042 }
158142c2
FB
1043 isTiny =
1044 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045 || ( zExp < -1 )
1046 || ! increment
1047 || lt128(
1048 zSig0,
1049 zSig1,
1050 LIT64( 0x0001FFFFFFFFFFFF ),
1051 LIT64( 0xFFFFFFFFFFFFFFFF )
1052 );
1053 shift128ExtraRightJamming(
1054 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055 zExp = 0;
1056 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057 if ( roundNearestEven ) {
bb98fe42 1058 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
1059 }
1060 else {
1061 if ( zSign ) {
1062 increment = ( roundingMode == float_round_down ) && zSig2;
1063 }
1064 else {
1065 increment = ( roundingMode == float_round_up ) && zSig2;
1066 }
1067 }
1068 }
1069 }
1070 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071 if ( increment ) {
1072 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074 }
1075 else {
1076 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077 }
1078 return packFloat128( zSign, zExp, zSig0, zSig1 );
1079
1080}
1081
1082/*----------------------------------------------------------------------------
1083| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084| and significand formed by the concatenation of `zSig0' and `zSig1', and
1085| returns the proper quadruple-precision floating-point value corresponding
1086| to the abstract input. This routine is just like `roundAndPackFloat128'
1087| except that the input significand has fewer bits and does not have to be
1088| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1089| point exponent.
1090*----------------------------------------------------------------------------*/
1091
1092static float128
1093 normalizeRoundAndPackFloat128(
bb98fe42 1094 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
158142c2
FB
1095{
1096 int8 shiftCount;
bb98fe42 1097 uint64_t zSig2;
158142c2
FB
1098
1099 if ( zSig0 == 0 ) {
1100 zSig0 = zSig1;
1101 zSig1 = 0;
1102 zExp -= 64;
1103 }
1104 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105 if ( 0 <= shiftCount ) {
1106 zSig2 = 0;
1107 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108 }
1109 else {
1110 shift128ExtraRightJamming(
1111 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112 }
1113 zExp -= shiftCount;
1114 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1115
1116}
1117
158142c2
FB
1118/*----------------------------------------------------------------------------
1119| Returns the result of converting the 32-bit two's complement integer `a'
1120| to the single-precision floating-point format. The conversion is performed
1121| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122*----------------------------------------------------------------------------*/
1123
1124float32 int32_to_float32( int32 a STATUS_PARAM )
1125{
1126 flag zSign;
1127
f090c9d4 1128 if ( a == 0 ) return float32_zero;
bb98fe42 1129 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2
FB
1130 zSign = ( a < 0 );
1131 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1132
1133}
1134
1135/*----------------------------------------------------------------------------
1136| Returns the result of converting the 32-bit two's complement integer `a'
1137| to the double-precision floating-point format. The conversion is performed
1138| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139*----------------------------------------------------------------------------*/
1140
1141float64 int32_to_float64( int32 a STATUS_PARAM )
1142{
1143 flag zSign;
1144 uint32 absA;
1145 int8 shiftCount;
bb98fe42 1146 uint64_t zSig;
158142c2 1147
f090c9d4 1148 if ( a == 0 ) return float64_zero;
158142c2
FB
1149 zSign = ( a < 0 );
1150 absA = zSign ? - a : a;
1151 shiftCount = countLeadingZeros32( absA ) + 21;
1152 zSig = absA;
1153 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1154
1155}
1156
158142c2
FB
1157/*----------------------------------------------------------------------------
1158| Returns the result of converting the 32-bit two's complement integer `a'
1159| to the extended double-precision floating-point format. The conversion
1160| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161| Arithmetic.
1162*----------------------------------------------------------------------------*/
1163
1164floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1165{
1166 flag zSign;
1167 uint32 absA;
1168 int8 shiftCount;
bb98fe42 1169 uint64_t zSig;
158142c2
FB
1170
1171 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172 zSign = ( a < 0 );
1173 absA = zSign ? - a : a;
1174 shiftCount = countLeadingZeros32( absA ) + 32;
1175 zSig = absA;
1176 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177
1178}
1179
158142c2
FB
1180/*----------------------------------------------------------------------------
1181| Returns the result of converting the 32-bit two's complement integer `a' to
1182| the quadruple-precision floating-point format. The conversion is performed
1183| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184*----------------------------------------------------------------------------*/
1185
1186float128 int32_to_float128( int32 a STATUS_PARAM )
1187{
1188 flag zSign;
1189 uint32 absA;
1190 int8 shiftCount;
bb98fe42 1191 uint64_t zSig0;
158142c2
FB
1192
1193 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194 zSign = ( a < 0 );
1195 absA = zSign ? - a : a;
1196 shiftCount = countLeadingZeros32( absA ) + 17;
1197 zSig0 = absA;
1198 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199
1200}
1201
158142c2
FB
1202/*----------------------------------------------------------------------------
1203| Returns the result of converting the 64-bit two's complement integer `a'
1204| to the single-precision floating-point format. The conversion is performed
1205| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206*----------------------------------------------------------------------------*/
1207
1208float32 int64_to_float32( int64 a STATUS_PARAM )
1209{
1210 flag zSign;
1211 uint64 absA;
1212 int8 shiftCount;
1213
f090c9d4 1214 if ( a == 0 ) return float32_zero;
158142c2
FB
1215 zSign = ( a < 0 );
1216 absA = zSign ? - a : a;
1217 shiftCount = countLeadingZeros64( absA ) - 40;
1218 if ( 0 <= shiftCount ) {
1219 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1220 }
1221 else {
1222 shiftCount += 7;
1223 if ( shiftCount < 0 ) {
1224 shift64RightJamming( absA, - shiftCount, &absA );
1225 }
1226 else {
1227 absA <<= shiftCount;
1228 }
1229 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230 }
1231
1232}
1233
3430b0be 1234float32 uint64_to_float32( uint64 a STATUS_PARAM )
75d62a58
JM
1235{
1236 int8 shiftCount;
1237
f090c9d4 1238 if ( a == 0 ) return float32_zero;
75d62a58
JM
1239 shiftCount = countLeadingZeros64( a ) - 40;
1240 if ( 0 <= shiftCount ) {
e744c06f 1241 return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
75d62a58
JM
1242 }
1243 else {
1244 shiftCount += 7;
1245 if ( shiftCount < 0 ) {
1246 shift64RightJamming( a, - shiftCount, &a );
1247 }
1248 else {
1249 a <<= shiftCount;
1250 }
e744c06f 1251 return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
75d62a58
JM
1252 }
1253}
1254
158142c2
FB
1255/*----------------------------------------------------------------------------
1256| Returns the result of converting the 64-bit two's complement integer `a'
1257| to the double-precision floating-point format. The conversion is performed
1258| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259*----------------------------------------------------------------------------*/
1260
1261float64 int64_to_float64( int64 a STATUS_PARAM )
1262{
1263 flag zSign;
1264
f090c9d4 1265 if ( a == 0 ) return float64_zero;
bb98fe42 1266 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1267 return packFloat64( 1, 0x43E, 0 );
1268 }
1269 zSign = ( a < 0 );
1270 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1271
1272}
1273
17ed2293 1274float64 uint64_to_float64(uint64 a STATUS_PARAM)
75d62a58 1275{
17ed2293 1276 int exp = 0x43C;
75d62a58 1277
17ed2293
RH
1278 if (a == 0) {
1279 return float64_zero;
1280 }
1281 if ((int64_t)a < 0) {
1282 shift64RightJamming(a, 1, &a);
1283 exp += 1;
1284 }
1285 return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
75d62a58
JM
1286}
1287
158142c2
FB
1288/*----------------------------------------------------------------------------
1289| Returns the result of converting the 64-bit two's complement integer `a'
1290| to the extended double-precision floating-point format. The conversion
1291| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1292| Arithmetic.
1293*----------------------------------------------------------------------------*/
1294
1295floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1296{
1297 flag zSign;
1298 uint64 absA;
1299 int8 shiftCount;
1300
1301 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1302 zSign = ( a < 0 );
1303 absA = zSign ? - a : a;
1304 shiftCount = countLeadingZeros64( absA );
1305 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1306
1307}
1308
158142c2
FB
1309/*----------------------------------------------------------------------------
1310| Returns the result of converting the 64-bit two's complement integer `a' to
1311| the quadruple-precision floating-point format. The conversion is performed
1312| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313*----------------------------------------------------------------------------*/
1314
1315float128 int64_to_float128( int64 a STATUS_PARAM )
1316{
1317 flag zSign;
1318 uint64 absA;
1319 int8 shiftCount;
1320 int32 zExp;
bb98fe42 1321 uint64_t zSig0, zSig1;
158142c2
FB
1322
1323 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324 zSign = ( a < 0 );
1325 absA = zSign ? - a : a;
1326 shiftCount = countLeadingZeros64( absA ) + 49;
1327 zExp = 0x406E - shiftCount;
1328 if ( 64 <= shiftCount ) {
1329 zSig1 = 0;
1330 zSig0 = absA;
1331 shiftCount -= 64;
1332 }
1333 else {
1334 zSig1 = absA;
1335 zSig0 = 0;
1336 }
1337 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338 return packFloat128( zSign, zExp, zSig0, zSig1 );
1339
1340}
1341
1e397ead
RH
1342float128 uint64_to_float128(uint64 a STATUS_PARAM)
1343{
1344 if (a == 0) {
1345 return float128_zero;
1346 }
1347 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1348}
1349
158142c2
FB
1350/*----------------------------------------------------------------------------
1351| Returns the result of converting the single-precision floating-point value
1352| `a' to the 32-bit two's complement integer format. The conversion is
1353| performed according to the IEC/IEEE Standard for Binary Floating-Point
1354| Arithmetic---which means in particular that the conversion is rounded
1355| according to the current rounding mode. If `a' is a NaN, the largest
1356| positive integer is returned. Otherwise, if the conversion overflows, the
1357| largest integer with the same sign as `a' is returned.
1358*----------------------------------------------------------------------------*/
1359
1360int32 float32_to_int32( float32 a STATUS_PARAM )
1361{
1362 flag aSign;
94a49d86 1363 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1364 uint32_t aSig;
1365 uint64_t aSig64;
158142c2 1366
37d18660 1367 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1368 aSig = extractFloat32Frac( a );
1369 aExp = extractFloat32Exp( a );
1370 aSign = extractFloat32Sign( a );
1371 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1372 if ( aExp ) aSig |= 0x00800000;
1373 shiftCount = 0xAF - aExp;
1374 aSig64 = aSig;
1375 aSig64 <<= 32;
1376 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1377 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1378
1379}
1380
1381/*----------------------------------------------------------------------------
1382| Returns the result of converting the single-precision floating-point value
1383| `a' to the 32-bit two's complement integer format. The conversion is
1384| performed according to the IEC/IEEE Standard for Binary Floating-Point
1385| Arithmetic, except that the conversion is always rounded toward zero.
1386| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1387| the conversion overflows, the largest integer with the same sign as `a' is
1388| returned.
1389*----------------------------------------------------------------------------*/
1390
1391int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1392{
1393 flag aSign;
94a49d86 1394 int_fast16_t aExp, shiftCount;
bb98fe42 1395 uint32_t aSig;
b3a6a2e0 1396 int32_t z;
37d18660 1397 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1398
1399 aSig = extractFloat32Frac( a );
1400 aExp = extractFloat32Exp( a );
1401 aSign = extractFloat32Sign( a );
1402 shiftCount = aExp - 0x9E;
1403 if ( 0 <= shiftCount ) {
f090c9d4 1404 if ( float32_val(a) != 0xCF000000 ) {
158142c2
FB
1405 float_raise( float_flag_invalid STATUS_VAR);
1406 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1407 }
bb98fe42 1408 return (int32_t) 0x80000000;
158142c2
FB
1409 }
1410 else if ( aExp <= 0x7E ) {
1411 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1412 return 0;
1413 }
1414 aSig = ( aSig | 0x00800000 )<<8;
1415 z = aSig>>( - shiftCount );
bb98fe42 1416 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
158142c2
FB
1417 STATUS(float_exception_flags) |= float_flag_inexact;
1418 }
1419 if ( aSign ) z = - z;
1420 return z;
1421
1422}
1423
cbcef455
PM
1424/*----------------------------------------------------------------------------
1425| Returns the result of converting the single-precision floating-point value
1426| `a' to the 16-bit two's complement integer format. The conversion is
1427| performed according to the IEC/IEEE Standard for Binary Floating-Point
1428| Arithmetic, except that the conversion is always rounded toward zero.
1429| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1430| the conversion overflows, the largest integer with the same sign as `a' is
1431| returned.
1432*----------------------------------------------------------------------------*/
1433
94a49d86 1434int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
1435{
1436 flag aSign;
94a49d86 1437 int_fast16_t aExp, shiftCount;
bb98fe42 1438 uint32_t aSig;
cbcef455
PM
1439 int32 z;
1440
1441 aSig = extractFloat32Frac( a );
1442 aExp = extractFloat32Exp( a );
1443 aSign = extractFloat32Sign( a );
1444 shiftCount = aExp - 0x8E;
1445 if ( 0 <= shiftCount ) {
1446 if ( float32_val(a) != 0xC7000000 ) {
1447 float_raise( float_flag_invalid STATUS_VAR);
1448 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1449 return 0x7FFF;
1450 }
1451 }
bb98fe42 1452 return (int32_t) 0xffff8000;
cbcef455
PM
1453 }
1454 else if ( aExp <= 0x7E ) {
1455 if ( aExp | aSig ) {
1456 STATUS(float_exception_flags) |= float_flag_inexact;
1457 }
1458 return 0;
1459 }
1460 shiftCount -= 0x10;
1461 aSig = ( aSig | 0x00800000 )<<8;
1462 z = aSig>>( - shiftCount );
bb98fe42 1463 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
cbcef455
PM
1464 STATUS(float_exception_flags) |= float_flag_inexact;
1465 }
1466 if ( aSign ) {
1467 z = - z;
1468 }
1469 return z;
1470
1471}
1472
158142c2
FB
1473/*----------------------------------------------------------------------------
1474| Returns the result of converting the single-precision floating-point value
1475| `a' to the 64-bit two's complement integer format. The conversion is
1476| performed according to the IEC/IEEE Standard for Binary Floating-Point
1477| Arithmetic---which means in particular that the conversion is rounded
1478| according to the current rounding mode. If `a' is a NaN, the largest
1479| positive integer is returned. Otherwise, if the conversion overflows, the
1480| largest integer with the same sign as `a' is returned.
1481*----------------------------------------------------------------------------*/
1482
1483int64 float32_to_int64( float32 a STATUS_PARAM )
1484{
1485 flag aSign;
94a49d86 1486 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1487 uint32_t aSig;
1488 uint64_t aSig64, aSigExtra;
37d18660 1489 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1490
1491 aSig = extractFloat32Frac( a );
1492 aExp = extractFloat32Exp( a );
1493 aSign = extractFloat32Sign( a );
1494 shiftCount = 0xBE - aExp;
1495 if ( shiftCount < 0 ) {
1496 float_raise( float_flag_invalid STATUS_VAR);
1497 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1498 return LIT64( 0x7FFFFFFFFFFFFFFF );
1499 }
bb98fe42 1500 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1501 }
1502 if ( aExp ) aSig |= 0x00800000;
1503 aSig64 = aSig;
1504 aSig64 <<= 40;
1505 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1506 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1507
1508}
1509
1510/*----------------------------------------------------------------------------
1511| Returns the result of converting the single-precision floating-point value
1512| `a' to the 64-bit two's complement integer format. The conversion is
1513| performed according to the IEC/IEEE Standard for Binary Floating-Point
1514| Arithmetic, except that the conversion is always rounded toward zero. If
1515| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1516| conversion overflows, the largest integer with the same sign as `a' is
1517| returned.
1518*----------------------------------------------------------------------------*/
1519
1520int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1521{
1522 flag aSign;
94a49d86 1523 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1524 uint32_t aSig;
1525 uint64_t aSig64;
158142c2 1526 int64 z;
37d18660 1527 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1528
1529 aSig = extractFloat32Frac( a );
1530 aExp = extractFloat32Exp( a );
1531 aSign = extractFloat32Sign( a );
1532 shiftCount = aExp - 0xBE;
1533 if ( 0 <= shiftCount ) {
f090c9d4 1534 if ( float32_val(a) != 0xDF000000 ) {
158142c2
FB
1535 float_raise( float_flag_invalid STATUS_VAR);
1536 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1537 return LIT64( 0x7FFFFFFFFFFFFFFF );
1538 }
1539 }
bb98fe42 1540 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1541 }
1542 else if ( aExp <= 0x7E ) {
1543 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1544 return 0;
1545 }
1546 aSig64 = aSig | 0x00800000;
1547 aSig64 <<= 40;
1548 z = aSig64>>( - shiftCount );
bb98fe42 1549 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
158142c2
FB
1550 STATUS(float_exception_flags) |= float_flag_inexact;
1551 }
1552 if ( aSign ) z = - z;
1553 return z;
1554
1555}
1556
1557/*----------------------------------------------------------------------------
1558| Returns the result of converting the single-precision floating-point value
1559| `a' to the double-precision floating-point format. The conversion is
1560| performed according to the IEC/IEEE Standard for Binary Floating-Point
1561| Arithmetic.
1562*----------------------------------------------------------------------------*/
1563
1564float64 float32_to_float64( float32 a STATUS_PARAM )
1565{
1566 flag aSign;
94a49d86 1567 int_fast16_t aExp;
bb98fe42 1568 uint32_t aSig;
37d18660 1569 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1570
1571 aSig = extractFloat32Frac( a );
1572 aExp = extractFloat32Exp( a );
1573 aSign = extractFloat32Sign( a );
1574 if ( aExp == 0xFF ) {
bcd4d9af 1575 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1576 return packFloat64( aSign, 0x7FF, 0 );
1577 }
1578 if ( aExp == 0 ) {
1579 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1580 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1581 --aExp;
1582 }
bb98fe42 1583 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1584
1585}
1586
158142c2
FB
1587/*----------------------------------------------------------------------------
1588| Returns the result of converting the single-precision floating-point value
1589| `a' to the extended double-precision floating-point format. The conversion
1590| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1591| Arithmetic.
1592*----------------------------------------------------------------------------*/
1593
1594floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1595{
1596 flag aSign;
94a49d86 1597 int_fast16_t aExp;
bb98fe42 1598 uint32_t aSig;
158142c2 1599
37d18660 1600 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1601 aSig = extractFloat32Frac( a );
1602 aExp = extractFloat32Exp( a );
1603 aSign = extractFloat32Sign( a );
1604 if ( aExp == 0xFF ) {
bcd4d9af 1605 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1606 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1607 }
1608 if ( aExp == 0 ) {
1609 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1610 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1611 }
1612 aSig |= 0x00800000;
bb98fe42 1613 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1614
1615}
1616
158142c2
FB
1617/*----------------------------------------------------------------------------
1618| Returns the result of converting the single-precision floating-point value
1619| `a' to the double-precision floating-point format. The conversion is
1620| performed according to the IEC/IEEE Standard for Binary Floating-Point
1621| Arithmetic.
1622*----------------------------------------------------------------------------*/
1623
1624float128 float32_to_float128( float32 a STATUS_PARAM )
1625{
1626 flag aSign;
94a49d86 1627 int_fast16_t aExp;
bb98fe42 1628 uint32_t aSig;
158142c2 1629
37d18660 1630 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1631 aSig = extractFloat32Frac( a );
1632 aExp = extractFloat32Exp( a );
1633 aSign = extractFloat32Sign( a );
1634 if ( aExp == 0xFF ) {
bcd4d9af 1635 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1636 return packFloat128( aSign, 0x7FFF, 0, 0 );
1637 }
1638 if ( aExp == 0 ) {
1639 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641 --aExp;
1642 }
bb98fe42 1643 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1644
1645}
1646
158142c2
FB
1647/*----------------------------------------------------------------------------
1648| Rounds the single-precision floating-point value `a' to an integer, and
1649| returns the result as a single-precision floating-point value. The
1650| operation is performed according to the IEC/IEEE Standard for Binary
1651| Floating-Point Arithmetic.
1652*----------------------------------------------------------------------------*/
1653
1654float32 float32_round_to_int( float32 a STATUS_PARAM)
1655{
1656 flag aSign;
94a49d86 1657 int_fast16_t aExp;
bb98fe42 1658 uint32_t lastBitMask, roundBitsMask;
158142c2 1659 int8 roundingMode;
bb98fe42 1660 uint32_t z;
37d18660 1661 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1662
1663 aExp = extractFloat32Exp( a );
1664 if ( 0x96 <= aExp ) {
1665 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1666 return propagateFloat32NaN( a, a STATUS_VAR );
1667 }
1668 return a;
1669 }
1670 if ( aExp <= 0x7E ) {
bb98fe42 1671 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
158142c2
FB
1672 STATUS(float_exception_flags) |= float_flag_inexact;
1673 aSign = extractFloat32Sign( a );
1674 switch ( STATUS(float_rounding_mode) ) {
1675 case float_round_nearest_even:
1676 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1677 return packFloat32( aSign, 0x7F, 0 );
1678 }
1679 break;
1680 case float_round_down:
f090c9d4 1681 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1682 case float_round_up:
f090c9d4 1683 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1684 }
1685 return packFloat32( aSign, 0, 0 );
1686 }
1687 lastBitMask = 1;
1688 lastBitMask <<= 0x96 - aExp;
1689 roundBitsMask = lastBitMask - 1;
f090c9d4 1690 z = float32_val(a);
158142c2
FB
1691 roundingMode = STATUS(float_rounding_mode);
1692 if ( roundingMode == float_round_nearest_even ) {
1693 z += lastBitMask>>1;
1694 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1695 }
1696 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 1697 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
1698 z += roundBitsMask;
1699 }
1700 }
1701 z &= ~ roundBitsMask;
f090c9d4
PB
1702 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1703 return make_float32(z);
158142c2
FB
1704
1705}
1706
1707/*----------------------------------------------------------------------------
1708| Returns the result of adding the absolute values of the single-precision
1709| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1710| before being returned. `zSign' is ignored if the result is a NaN.
1711| The addition is performed according to the IEC/IEEE Standard for Binary
1712| Floating-Point Arithmetic.
1713*----------------------------------------------------------------------------*/
1714
1715static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1716{
94a49d86 1717 int_fast16_t aExp, bExp, zExp;
bb98fe42 1718 uint32_t aSig, bSig, zSig;
94a49d86 1719 int_fast16_t expDiff;
158142c2
FB
1720
1721 aSig = extractFloat32Frac( a );
1722 aExp = extractFloat32Exp( a );
1723 bSig = extractFloat32Frac( b );
1724 bExp = extractFloat32Exp( b );
1725 expDiff = aExp - bExp;
1726 aSig <<= 6;
1727 bSig <<= 6;
1728 if ( 0 < expDiff ) {
1729 if ( aExp == 0xFF ) {
1730 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1731 return a;
1732 }
1733 if ( bExp == 0 ) {
1734 --expDiff;
1735 }
1736 else {
1737 bSig |= 0x20000000;
1738 }
1739 shift32RightJamming( bSig, expDiff, &bSig );
1740 zExp = aExp;
1741 }
1742 else if ( expDiff < 0 ) {
1743 if ( bExp == 0xFF ) {
1744 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1745 return packFloat32( zSign, 0xFF, 0 );
1746 }
1747 if ( aExp == 0 ) {
1748 ++expDiff;
1749 }
1750 else {
1751 aSig |= 0x20000000;
1752 }
1753 shift32RightJamming( aSig, - expDiff, &aSig );
1754 zExp = bExp;
1755 }
1756 else {
1757 if ( aExp == 0xFF ) {
1758 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1759 return a;
1760 }
fe76d976 1761 if ( aExp == 0 ) {
e6afc87f
PM
1762 if (STATUS(flush_to_zero)) {
1763 if (aSig | bSig) {
1764 float_raise(float_flag_output_denormal STATUS_VAR);
1765 }
1766 return packFloat32(zSign, 0, 0);
1767 }
fe76d976
PB
1768 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1769 }
158142c2
FB
1770 zSig = 0x40000000 + aSig + bSig;
1771 zExp = aExp;
1772 goto roundAndPack;
1773 }
1774 aSig |= 0x20000000;
1775 zSig = ( aSig + bSig )<<1;
1776 --zExp;
bb98fe42 1777 if ( (int32_t) zSig < 0 ) {
158142c2
FB
1778 zSig = aSig + bSig;
1779 ++zExp;
1780 }
1781 roundAndPack:
1782 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1783
1784}
1785
1786/*----------------------------------------------------------------------------
1787| Returns the result of subtracting the absolute values of the single-
1788| precision floating-point values `a' and `b'. If `zSign' is 1, the
1789| difference is negated before being returned. `zSign' is ignored if the
1790| result is a NaN. The subtraction is performed according to the IEC/IEEE
1791| Standard for Binary Floating-Point Arithmetic.
1792*----------------------------------------------------------------------------*/
1793
1794static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1795{
94a49d86 1796 int_fast16_t aExp, bExp, zExp;
bb98fe42 1797 uint32_t aSig, bSig, zSig;
94a49d86 1798 int_fast16_t expDiff;
158142c2
FB
1799
1800 aSig = extractFloat32Frac( a );
1801 aExp = extractFloat32Exp( a );
1802 bSig = extractFloat32Frac( b );
1803 bExp = extractFloat32Exp( b );
1804 expDiff = aExp - bExp;
1805 aSig <<= 7;
1806 bSig <<= 7;
1807 if ( 0 < expDiff ) goto aExpBigger;
1808 if ( expDiff < 0 ) goto bExpBigger;
1809 if ( aExp == 0xFF ) {
1810 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1811 float_raise( float_flag_invalid STATUS_VAR);
1812 return float32_default_nan;
1813 }
1814 if ( aExp == 0 ) {
1815 aExp = 1;
1816 bExp = 1;
1817 }
1818 if ( bSig < aSig ) goto aBigger;
1819 if ( aSig < bSig ) goto bBigger;
1820 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1821 bExpBigger:
1822 if ( bExp == 0xFF ) {
1823 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1824 return packFloat32( zSign ^ 1, 0xFF, 0 );
1825 }
1826 if ( aExp == 0 ) {
1827 ++expDiff;
1828 }
1829 else {
1830 aSig |= 0x40000000;
1831 }
1832 shift32RightJamming( aSig, - expDiff, &aSig );
1833 bSig |= 0x40000000;
1834 bBigger:
1835 zSig = bSig - aSig;
1836 zExp = bExp;
1837 zSign ^= 1;
1838 goto normalizeRoundAndPack;
1839 aExpBigger:
1840 if ( aExp == 0xFF ) {
1841 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1842 return a;
1843 }
1844 if ( bExp == 0 ) {
1845 --expDiff;
1846 }
1847 else {
1848 bSig |= 0x40000000;
1849 }
1850 shift32RightJamming( bSig, expDiff, &bSig );
1851 aSig |= 0x40000000;
1852 aBigger:
1853 zSig = aSig - bSig;
1854 zExp = aExp;
1855 normalizeRoundAndPack:
1856 --zExp;
1857 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1858
1859}
1860
1861/*----------------------------------------------------------------------------
1862| Returns the result of adding the single-precision floating-point values `a'
1863| and `b'. The operation is performed according to the IEC/IEEE Standard for
1864| Binary Floating-Point Arithmetic.
1865*----------------------------------------------------------------------------*/
1866
1867float32 float32_add( float32 a, float32 b STATUS_PARAM )
1868{
1869 flag aSign, bSign;
37d18660
PM
1870 a = float32_squash_input_denormal(a STATUS_VAR);
1871 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1872
1873 aSign = extractFloat32Sign( a );
1874 bSign = extractFloat32Sign( b );
1875 if ( aSign == bSign ) {
1876 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1877 }
1878 else {
1879 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1880 }
1881
1882}
1883
1884/*----------------------------------------------------------------------------
1885| Returns the result of subtracting the single-precision floating-point values
1886| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1887| for Binary Floating-Point Arithmetic.
1888*----------------------------------------------------------------------------*/
1889
1890float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1891{
1892 flag aSign, bSign;
37d18660
PM
1893 a = float32_squash_input_denormal(a STATUS_VAR);
1894 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1895
1896 aSign = extractFloat32Sign( a );
1897 bSign = extractFloat32Sign( b );
1898 if ( aSign == bSign ) {
1899 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1900 }
1901 else {
1902 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1903 }
1904
1905}
1906
1907/*----------------------------------------------------------------------------
1908| Returns the result of multiplying the single-precision floating-point values
1909| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1910| for Binary Floating-Point Arithmetic.
1911*----------------------------------------------------------------------------*/
1912
1913float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1914{
1915 flag aSign, bSign, zSign;
94a49d86 1916 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
1917 uint32_t aSig, bSig;
1918 uint64_t zSig64;
1919 uint32_t zSig;
158142c2 1920
37d18660
PM
1921 a = float32_squash_input_denormal(a STATUS_VAR);
1922 b = float32_squash_input_denormal(b STATUS_VAR);
1923
158142c2
FB
1924 aSig = extractFloat32Frac( a );
1925 aExp = extractFloat32Exp( a );
1926 aSign = extractFloat32Sign( a );
1927 bSig = extractFloat32Frac( b );
1928 bExp = extractFloat32Exp( b );
1929 bSign = extractFloat32Sign( b );
1930 zSign = aSign ^ bSign;
1931 if ( aExp == 0xFF ) {
1932 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1933 return propagateFloat32NaN( a, b STATUS_VAR );
1934 }
1935 if ( ( bExp | bSig ) == 0 ) {
1936 float_raise( float_flag_invalid STATUS_VAR);
1937 return float32_default_nan;
1938 }
1939 return packFloat32( zSign, 0xFF, 0 );
1940 }
1941 if ( bExp == 0xFF ) {
1942 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1943 if ( ( aExp | aSig ) == 0 ) {
1944 float_raise( float_flag_invalid STATUS_VAR);
1945 return float32_default_nan;
1946 }
1947 return packFloat32( zSign, 0xFF, 0 );
1948 }
1949 if ( aExp == 0 ) {
1950 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1951 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1952 }
1953 if ( bExp == 0 ) {
1954 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1955 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1956 }
1957 zExp = aExp + bExp - 0x7F;
1958 aSig = ( aSig | 0x00800000 )<<7;
1959 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 1960 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 1961 zSig = zSig64;
bb98fe42 1962 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
1963 zSig <<= 1;
1964 --zExp;
1965 }
1966 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1967
1968}
1969
1970/*----------------------------------------------------------------------------
1971| Returns the result of dividing the single-precision floating-point value `a'
1972| by the corresponding value `b'. The operation is performed according to the
1973| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1974*----------------------------------------------------------------------------*/
1975
1976float32 float32_div( float32 a, float32 b STATUS_PARAM )
1977{
1978 flag aSign, bSign, zSign;
94a49d86 1979 int_fast16_t aExp, bExp, zExp;
bb98fe42 1980 uint32_t aSig, bSig, zSig;
37d18660
PM
1981 a = float32_squash_input_denormal(a STATUS_VAR);
1982 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1983
1984 aSig = extractFloat32Frac( a );
1985 aExp = extractFloat32Exp( a );
1986 aSign = extractFloat32Sign( a );
1987 bSig = extractFloat32Frac( b );
1988 bExp = extractFloat32Exp( b );
1989 bSign = extractFloat32Sign( b );
1990 zSign = aSign ^ bSign;
1991 if ( aExp == 0xFF ) {
1992 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1993 if ( bExp == 0xFF ) {
1994 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1995 float_raise( float_flag_invalid STATUS_VAR);
1996 return float32_default_nan;
1997 }
1998 return packFloat32( zSign, 0xFF, 0 );
1999 }
2000 if ( bExp == 0xFF ) {
2001 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2002 return packFloat32( zSign, 0, 0 );
2003 }
2004 if ( bExp == 0 ) {
2005 if ( bSig == 0 ) {
2006 if ( ( aExp | aSig ) == 0 ) {
2007 float_raise( float_flag_invalid STATUS_VAR);
2008 return float32_default_nan;
2009 }
2010 float_raise( float_flag_divbyzero STATUS_VAR);
2011 return packFloat32( zSign, 0xFF, 0 );
2012 }
2013 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2014 }
2015 if ( aExp == 0 ) {
2016 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2017 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2018 }
2019 zExp = aExp - bExp + 0x7D;
2020 aSig = ( aSig | 0x00800000 )<<7;
2021 bSig = ( bSig | 0x00800000 )<<8;
2022 if ( bSig <= ( aSig + aSig ) ) {
2023 aSig >>= 1;
2024 ++zExp;
2025 }
bb98fe42 2026 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2027 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2028 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2
FB
2029 }
2030 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2031
2032}
2033
2034/*----------------------------------------------------------------------------
2035| Returns the remainder of the single-precision floating-point value `a'
2036| with respect to the corresponding value `b'. The operation is performed
2037| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2038*----------------------------------------------------------------------------*/
2039
2040float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2041{
ed086f3d 2042 flag aSign, zSign;
94a49d86 2043 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
2044 uint32_t aSig, bSig;
2045 uint32_t q;
2046 uint64_t aSig64, bSig64, q64;
2047 uint32_t alternateASig;
2048 int32_t sigMean;
37d18660
PM
2049 a = float32_squash_input_denormal(a STATUS_VAR);
2050 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2051
2052 aSig = extractFloat32Frac( a );
2053 aExp = extractFloat32Exp( a );
2054 aSign = extractFloat32Sign( a );
2055 bSig = extractFloat32Frac( b );
2056 bExp = extractFloat32Exp( b );
158142c2
FB
2057 if ( aExp == 0xFF ) {
2058 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2059 return propagateFloat32NaN( a, b STATUS_VAR );
2060 }
2061 float_raise( float_flag_invalid STATUS_VAR);
2062 return float32_default_nan;
2063 }
2064 if ( bExp == 0xFF ) {
2065 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2066 return a;
2067 }
2068 if ( bExp == 0 ) {
2069 if ( bSig == 0 ) {
2070 float_raise( float_flag_invalid STATUS_VAR);
2071 return float32_default_nan;
2072 }
2073 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2074 }
2075 if ( aExp == 0 ) {
2076 if ( aSig == 0 ) return a;
2077 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2078 }
2079 expDiff = aExp - bExp;
2080 aSig |= 0x00800000;
2081 bSig |= 0x00800000;
2082 if ( expDiff < 32 ) {
2083 aSig <<= 8;
2084 bSig <<= 8;
2085 if ( expDiff < 0 ) {
2086 if ( expDiff < -1 ) return a;
2087 aSig >>= 1;
2088 }
2089 q = ( bSig <= aSig );
2090 if ( q ) aSig -= bSig;
2091 if ( 0 < expDiff ) {
bb98fe42 2092 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2093 q >>= 32 - expDiff;
2094 bSig >>= 2;
2095 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2096 }
2097 else {
2098 aSig >>= 2;
2099 bSig >>= 2;
2100 }
2101 }
2102 else {
2103 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2104 aSig64 = ( (uint64_t) aSig )<<40;
2105 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2106 expDiff -= 64;
2107 while ( 0 < expDiff ) {
2108 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2109 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2110 aSig64 = - ( ( bSig * q64 )<<38 );
2111 expDiff -= 62;
2112 }
2113 expDiff += 64;
2114 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2115 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2116 q = q64>>( 64 - expDiff );
2117 bSig <<= 6;
2118 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2119 }
2120 do {
2121 alternateASig = aSig;
2122 ++q;
2123 aSig -= bSig;
bb98fe42 2124 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2125 sigMean = aSig + alternateASig;
2126 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2127 aSig = alternateASig;
2128 }
bb98fe42 2129 zSign = ( (int32_t) aSig < 0 );
158142c2
FB
2130 if ( zSign ) aSig = - aSig;
2131 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2132
2133}
2134
369be8f6
PM
2135/*----------------------------------------------------------------------------
2136| Returns the result of multiplying the single-precision floating-point values
2137| `a' and `b' then adding 'c', with no intermediate rounding step after the
2138| multiplication. The operation is performed according to the IEC/IEEE
2139| Standard for Binary Floating-Point Arithmetic 754-2008.
2140| The flags argument allows the caller to select negation of the
2141| addend, the intermediate product, or the final result. (The difference
2142| between this and having the caller do a separate negation is that negating
2143| externally will flip the sign bit on NaNs.)
2144*----------------------------------------------------------------------------*/
2145
2146float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2147{
2148 flag aSign, bSign, cSign, zSign;
94a49d86 2149 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2150 uint32_t aSig, bSig, cSig;
2151 flag pInf, pZero, pSign;
2152 uint64_t pSig64, cSig64, zSig64;
2153 uint32_t pSig;
2154 int shiftcount;
2155 flag signflip, infzero;
2156
2157 a = float32_squash_input_denormal(a STATUS_VAR);
2158 b = float32_squash_input_denormal(b STATUS_VAR);
2159 c = float32_squash_input_denormal(c STATUS_VAR);
2160 aSig = extractFloat32Frac(a);
2161 aExp = extractFloat32Exp(a);
2162 aSign = extractFloat32Sign(a);
2163 bSig = extractFloat32Frac(b);
2164 bExp = extractFloat32Exp(b);
2165 bSign = extractFloat32Sign(b);
2166 cSig = extractFloat32Frac(c);
2167 cExp = extractFloat32Exp(c);
2168 cSign = extractFloat32Sign(c);
2169
2170 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2171 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2172
2173 /* It is implementation-defined whether the cases of (0,inf,qnan)
2174 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2175 * they return if they do), so we have to hand this information
2176 * off to the target-specific pick-a-NaN routine.
2177 */
2178 if (((aExp == 0xff) && aSig) ||
2179 ((bExp == 0xff) && bSig) ||
2180 ((cExp == 0xff) && cSig)) {
2181 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2182 }
2183
2184 if (infzero) {
2185 float_raise(float_flag_invalid STATUS_VAR);
2186 return float32_default_nan;
2187 }
2188
2189 if (flags & float_muladd_negate_c) {
2190 cSign ^= 1;
2191 }
2192
2193 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2194
2195 /* Work out the sign and type of the product */
2196 pSign = aSign ^ bSign;
2197 if (flags & float_muladd_negate_product) {
2198 pSign ^= 1;
2199 }
2200 pInf = (aExp == 0xff) || (bExp == 0xff);
2201 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2202
2203 if (cExp == 0xff) {
2204 if (pInf && (pSign ^ cSign)) {
2205 /* addition of opposite-signed infinities => InvalidOperation */
2206 float_raise(float_flag_invalid STATUS_VAR);
2207 return float32_default_nan;
2208 }
2209 /* Otherwise generate an infinity of the same sign */
2210 return packFloat32(cSign ^ signflip, 0xff, 0);
2211 }
2212
2213 if (pInf) {
2214 return packFloat32(pSign ^ signflip, 0xff, 0);
2215 }
2216
2217 if (pZero) {
2218 if (cExp == 0) {
2219 if (cSig == 0) {
2220 /* Adding two exact zeroes */
2221 if (pSign == cSign) {
2222 zSign = pSign;
2223 } else if (STATUS(float_rounding_mode) == float_round_down) {
2224 zSign = 1;
2225 } else {
2226 zSign = 0;
2227 }
2228 return packFloat32(zSign ^ signflip, 0, 0);
2229 }
2230 /* Exact zero plus a denorm */
2231 if (STATUS(flush_to_zero)) {
2232 float_raise(float_flag_output_denormal STATUS_VAR);
2233 return packFloat32(cSign ^ signflip, 0, 0);
2234 }
2235 }
2236 /* Zero plus something non-zero : just return the something */
a6e7c184 2237 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2238 }
2239
2240 if (aExp == 0) {
2241 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2242 }
2243 if (bExp == 0) {
2244 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2245 }
2246
2247 /* Calculate the actual result a * b + c */
2248
2249 /* Multiply first; this is easy. */
2250 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2251 * because we want the true exponent, not the "one-less-than"
2252 * flavour that roundAndPackFloat32() takes.
2253 */
2254 pExp = aExp + bExp - 0x7e;
2255 aSig = (aSig | 0x00800000) << 7;
2256 bSig = (bSig | 0x00800000) << 8;
2257 pSig64 = (uint64_t)aSig * bSig;
2258 if ((int64_t)(pSig64 << 1) >= 0) {
2259 pSig64 <<= 1;
2260 pExp--;
2261 }
2262
2263 zSign = pSign ^ signflip;
2264
2265 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2266 * position 62.
2267 */
2268 if (cExp == 0) {
2269 if (!cSig) {
2270 /* Throw out the special case of c being an exact zero now */
2271 shift64RightJamming(pSig64, 32, &pSig64);
2272 pSig = pSig64;
2273 return roundAndPackFloat32(zSign, pExp - 1,
2274 pSig STATUS_VAR);
2275 }
2276 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2277 }
2278
2279 cSig64 = (uint64_t)cSig << (62 - 23);
2280 cSig64 |= LIT64(0x4000000000000000);
2281 expDiff = pExp - cExp;
2282
2283 if (pSign == cSign) {
2284 /* Addition */
2285 if (expDiff > 0) {
2286 /* scale c to match p */
2287 shift64RightJamming(cSig64, expDiff, &cSig64);
2288 zExp = pExp;
2289 } else if (expDiff < 0) {
2290 /* scale p to match c */
2291 shift64RightJamming(pSig64, -expDiff, &pSig64);
2292 zExp = cExp;
2293 } else {
2294 /* no scaling needed */
2295 zExp = cExp;
2296 }
2297 /* Add significands and make sure explicit bit ends up in posn 62 */
2298 zSig64 = pSig64 + cSig64;
2299 if ((int64_t)zSig64 < 0) {
2300 shift64RightJamming(zSig64, 1, &zSig64);
2301 } else {
2302 zExp--;
2303 }
2304 } else {
2305 /* Subtraction */
2306 if (expDiff > 0) {
2307 shift64RightJamming(cSig64, expDiff, &cSig64);
2308 zSig64 = pSig64 - cSig64;
2309 zExp = pExp;
2310 } else if (expDiff < 0) {
2311 shift64RightJamming(pSig64, -expDiff, &pSig64);
2312 zSig64 = cSig64 - pSig64;
2313 zExp = cExp;
2314 zSign ^= 1;
2315 } else {
2316 zExp = pExp;
2317 if (cSig64 < pSig64) {
2318 zSig64 = pSig64 - cSig64;
2319 } else if (pSig64 < cSig64) {
2320 zSig64 = cSig64 - pSig64;
2321 zSign ^= 1;
2322 } else {
2323 /* Exact zero */
2324 zSign = signflip;
2325 if (STATUS(float_rounding_mode) == float_round_down) {
2326 zSign ^= 1;
2327 }
2328 return packFloat32(zSign, 0, 0);
2329 }
2330 }
2331 --zExp;
2332 /* Normalize to put the explicit bit back into bit 62. */
2333 shiftcount = countLeadingZeros64(zSig64) - 1;
2334 zSig64 <<= shiftcount;
2335 zExp -= shiftcount;
2336 }
2337 shift64RightJamming(zSig64, 32, &zSig64);
2338 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2339}
2340
2341
158142c2
FB
2342/*----------------------------------------------------------------------------
2343| Returns the square root of the single-precision floating-point value `a'.
2344| The operation is performed according to the IEC/IEEE Standard for Binary
2345| Floating-Point Arithmetic.
2346*----------------------------------------------------------------------------*/
2347
2348float32 float32_sqrt( float32 a STATUS_PARAM )
2349{
2350 flag aSign;
94a49d86 2351 int_fast16_t aExp, zExp;
bb98fe42
AF
2352 uint32_t aSig, zSig;
2353 uint64_t rem, term;
37d18660 2354 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2355
2356 aSig = extractFloat32Frac( a );
2357 aExp = extractFloat32Exp( a );
2358 aSign = extractFloat32Sign( a );
2359 if ( aExp == 0xFF ) {
f090c9d4 2360 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
158142c2
FB
2361 if ( ! aSign ) return a;
2362 float_raise( float_flag_invalid STATUS_VAR);
2363 return float32_default_nan;
2364 }
2365 if ( aSign ) {
2366 if ( ( aExp | aSig ) == 0 ) return a;
2367 float_raise( float_flag_invalid STATUS_VAR);
2368 return float32_default_nan;
2369 }
2370 if ( aExp == 0 ) {
f090c9d4 2371 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2372 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2373 }
2374 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2375 aSig = ( aSig | 0x00800000 )<<8;
2376 zSig = estimateSqrt32( aExp, aSig ) + 2;
2377 if ( ( zSig & 0x7F ) <= 5 ) {
2378 if ( zSig < 2 ) {
2379 zSig = 0x7FFFFFFF;
2380 goto roundAndPack;
2381 }
2382 aSig >>= aExp & 1;
bb98fe42
AF
2383 term = ( (uint64_t) zSig ) * zSig;
2384 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2385 while ( (int64_t) rem < 0 ) {
158142c2 2386 --zSig;
bb98fe42 2387 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2388 }
2389 zSig |= ( rem != 0 );
2390 }
2391 shift32RightJamming( zSig, 1, &zSig );
2392 roundAndPack:
2393 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2394
2395}
2396
8229c991
AJ
/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/
2414
2415static const float64 float32_exp2_coefficients[15] =
2416{
d5138cf4
PM
2417 const_float64( 0x3ff0000000000000ll ), /* 1 */
2418 const_float64( 0x3fe0000000000000ll ), /* 2 */
2419 const_float64( 0x3fc5555555555555ll ), /* 3 */
2420 const_float64( 0x3fa5555555555555ll ), /* 4 */
2421 const_float64( 0x3f81111111111111ll ), /* 5 */
2422 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2423 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2424 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2425 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2426 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2427 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2428 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2429 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2430 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2431 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2432};
2433
2434float32 float32_exp2( float32 a STATUS_PARAM )
2435{
2436 flag aSign;
94a49d86 2437 int_fast16_t aExp;
bb98fe42 2438 uint32_t aSig;
8229c991
AJ
2439 float64 r, x, xn;
2440 int i;
37d18660 2441 a = float32_squash_input_denormal(a STATUS_VAR);
8229c991
AJ
2442
2443 aSig = extractFloat32Frac( a );
2444 aExp = extractFloat32Exp( a );
2445 aSign = extractFloat32Sign( a );
2446
2447 if ( aExp == 0xFF) {
2448 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2449 return (aSign) ? float32_zero : a;
2450 }
2451 if (aExp == 0) {
2452 if (aSig == 0) return float32_one;
2453 }
2454
2455 float_raise( float_flag_inexact STATUS_VAR);
2456
2457 /* ******************************* */
2458 /* using float64 for approximation */
2459 /* ******************************* */
2460 x = float32_to_float64(a STATUS_VAR);
2461 x = float64_mul(x, float64_ln2 STATUS_VAR);
2462
2463 xn = x;
2464 r = float64_one;
2465 for (i = 0 ; i < 15 ; i++) {
2466 float64 f;
2467
2468 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2469 r = float64_add(r, f STATUS_VAR);
2470
2471 xn = float64_mul(xn, x STATUS_VAR);
2472 }
2473
2474 return float64_to_float32(r, status);
2475}
2476
374dfc33
AJ
2477/*----------------------------------------------------------------------------
2478| Returns the binary log of the single-precision floating-point value `a'.
2479| The operation is performed according to the IEC/IEEE Standard for Binary
2480| Floating-Point Arithmetic.
2481*----------------------------------------------------------------------------*/
2482float32 float32_log2( float32 a STATUS_PARAM )
2483{
2484 flag aSign, zSign;
94a49d86 2485 int_fast16_t aExp;
bb98fe42 2486 uint32_t aSig, zSig, i;
374dfc33 2487
37d18660 2488 a = float32_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
2489 aSig = extractFloat32Frac( a );
2490 aExp = extractFloat32Exp( a );
2491 aSign = extractFloat32Sign( a );
2492
2493 if ( aExp == 0 ) {
2494 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2495 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2496 }
2497 if ( aSign ) {
2498 float_raise( float_flag_invalid STATUS_VAR);
2499 return float32_default_nan;
2500 }
2501 if ( aExp == 0xFF ) {
2502 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2503 return a;
2504 }
2505
2506 aExp -= 0x7F;
2507 aSig |= 0x00800000;
2508 zSign = aExp < 0;
2509 zSig = aExp << 23;
2510
2511 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2512 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2513 if ( aSig & 0x01000000 ) {
2514 aSig >>= 1;
2515 zSig |= i;
2516 }
2517 }
2518
2519 if ( zSign )
2520 zSig = -zSig;
2521
2522 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2523}
2524
158142c2
FB
2525/*----------------------------------------------------------------------------
2526| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2527| the corresponding value `b', and 0 otherwise. The invalid exception is
2528| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2529| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2530*----------------------------------------------------------------------------*/
2531
b689362d 2532int float32_eq( float32 a, float32 b STATUS_PARAM )
158142c2 2533{
b689362d 2534 uint32_t av, bv;
37d18660
PM
2535 a = float32_squash_input_denormal(a STATUS_VAR);
2536 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2537
2538 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2539 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2540 ) {
b689362d 2541 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
2542 return 0;
2543 }
b689362d
AJ
2544 av = float32_val(a);
2545 bv = float32_val(b);
2546 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2547}
2548
2549/*----------------------------------------------------------------------------
2550| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2551| or equal to the corresponding value `b', and 0 otherwise. The invalid
2552| exception is raised if either operand is a NaN. The comparison is performed
2553| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2554*----------------------------------------------------------------------------*/
2555
750afe93 2556int float32_le( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2557{
2558 flag aSign, bSign;
bb98fe42 2559 uint32_t av, bv;
37d18660
PM
2560 a = float32_squash_input_denormal(a STATUS_VAR);
2561 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2562
2563 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2564 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2565 ) {
2566 float_raise( float_flag_invalid STATUS_VAR);
2567 return 0;
2568 }
2569 aSign = extractFloat32Sign( a );
2570 bSign = extractFloat32Sign( b );
f090c9d4
PB
2571 av = float32_val(a);
2572 bv = float32_val(b);
bb98fe42 2573 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2574 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2575
2576}
2577
2578/*----------------------------------------------------------------------------
2579| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2580| the corresponding value `b', and 0 otherwise. The invalid exception is
2581| raised if either operand is a NaN. The comparison is performed according
2582| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2583*----------------------------------------------------------------------------*/
2584
750afe93 2585int float32_lt( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2586{
2587 flag aSign, bSign;
bb98fe42 2588 uint32_t av, bv;
37d18660
PM
2589 a = float32_squash_input_denormal(a STATUS_VAR);
2590 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2591
2592 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2593 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2594 ) {
2595 float_raise( float_flag_invalid STATUS_VAR);
2596 return 0;
2597 }
2598 aSign = extractFloat32Sign( a );
2599 bSign = extractFloat32Sign( b );
f090c9d4
PB
2600 av = float32_val(a);
2601 bv = float32_val(b);
bb98fe42 2602 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2603 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2604
2605}
2606
67b7861d
AJ
2607/*----------------------------------------------------------------------------
2608| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2609| be compared, and 0 otherwise. The invalid exception is raised if either
2610| operand is a NaN. The comparison is performed according to the IEC/IEEE
2611| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2612*----------------------------------------------------------------------------*/
2613
2614int float32_unordered( float32 a, float32 b STATUS_PARAM )
2615{
2616 a = float32_squash_input_denormal(a STATUS_VAR);
2617 b = float32_squash_input_denormal(b STATUS_VAR);
2618
2619 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2620 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2621 ) {
2622 float_raise( float_flag_invalid STATUS_VAR);
2623 return 1;
2624 }
2625 return 0;
2626}
b689362d 2627
158142c2
FB
2628/*----------------------------------------------------------------------------
2629| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2630| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2631| exception. The comparison is performed according to the IEC/IEEE Standard
2632| for Binary Floating-Point Arithmetic.
158142c2
FB
2633*----------------------------------------------------------------------------*/
2634
b689362d 2635int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
158142c2 2636{
37d18660
PM
2637 a = float32_squash_input_denormal(a STATUS_VAR);
2638 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2639
2640 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2641 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2642 ) {
b689362d
AJ
2643 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2644 float_raise( float_flag_invalid STATUS_VAR);
2645 }
158142c2
FB
2646 return 0;
2647 }
b689362d
AJ
2648 return ( float32_val(a) == float32_val(b) ) ||
2649 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2650}
2651
2652/*----------------------------------------------------------------------------
2653| Returns 1 if the single-precision floating-point value `a' is less than or
2654| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2655| cause an exception. Otherwise, the comparison is performed according to the
2656| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2657*----------------------------------------------------------------------------*/
2658
750afe93 2659int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2660{
2661 flag aSign, bSign;
bb98fe42 2662 uint32_t av, bv;
37d18660
PM
2663 a = float32_squash_input_denormal(a STATUS_VAR);
2664 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2665
2666 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2667 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2668 ) {
2669 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2670 float_raise( float_flag_invalid STATUS_VAR);
2671 }
2672 return 0;
2673 }
2674 aSign = extractFloat32Sign( a );
2675 bSign = extractFloat32Sign( b );
f090c9d4
PB
2676 av = float32_val(a);
2677 bv = float32_val(b);
bb98fe42 2678 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2679 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2680
2681}
2682
2683/*----------------------------------------------------------------------------
2684| Returns 1 if the single-precision floating-point value `a' is less than
2685| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2686| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2687| Standard for Binary Floating-Point Arithmetic.
2688*----------------------------------------------------------------------------*/
2689
750afe93 2690int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2691{
2692 flag aSign, bSign;
bb98fe42 2693 uint32_t av, bv;
37d18660
PM
2694 a = float32_squash_input_denormal(a STATUS_VAR);
2695 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2696
2697 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2698 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2699 ) {
2700 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2701 float_raise( float_flag_invalid STATUS_VAR);
2702 }
2703 return 0;
2704 }
2705 aSign = extractFloat32Sign( a );
2706 bSign = extractFloat32Sign( b );
f090c9d4
PB
2707 av = float32_val(a);
2708 bv = float32_val(b);
bb98fe42 2709 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2710 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2711
2712}
2713
67b7861d
AJ
2714/*----------------------------------------------------------------------------
2715| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2716| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2717| comparison is performed according to the IEC/IEEE Standard for Binary
2718| Floating-Point Arithmetic.
2719*----------------------------------------------------------------------------*/
2720
2721int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2722{
2723 a = float32_squash_input_denormal(a STATUS_VAR);
2724 b = float32_squash_input_denormal(b STATUS_VAR);
2725
2726 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2727 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2728 ) {
2729 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2730 float_raise( float_flag_invalid STATUS_VAR);
2731 }
2732 return 1;
2733 }
2734 return 0;
2735}
2736
158142c2
FB
2737/*----------------------------------------------------------------------------
2738| Returns the result of converting the double-precision floating-point value
2739| `a' to the 32-bit two's complement integer format. The conversion is
2740| performed according to the IEC/IEEE Standard for Binary Floating-Point
2741| Arithmetic---which means in particular that the conversion is rounded
2742| according to the current rounding mode. If `a' is a NaN, the largest
2743| positive integer is returned. Otherwise, if the conversion overflows, the
2744| largest integer with the same sign as `a' is returned.
2745*----------------------------------------------------------------------------*/
2746
2747int32 float64_to_int32( float64 a STATUS_PARAM )
2748{
2749 flag aSign;
94a49d86 2750 int_fast16_t aExp, shiftCount;
bb98fe42 2751 uint64_t aSig;
37d18660 2752 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2753
2754 aSig = extractFloat64Frac( a );
2755 aExp = extractFloat64Exp( a );
2756 aSign = extractFloat64Sign( a );
2757 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2758 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2759 shiftCount = 0x42C - aExp;
2760 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2761 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2762
2763}
2764
2765/*----------------------------------------------------------------------------
2766| Returns the result of converting the double-precision floating-point value
2767| `a' to the 32-bit two's complement integer format. The conversion is
2768| performed according to the IEC/IEEE Standard for Binary Floating-Point
2769| Arithmetic, except that the conversion is always rounded toward zero.
2770| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2771| the conversion overflows, the largest integer with the same sign as `a' is
2772| returned.
2773*----------------------------------------------------------------------------*/
2774
2775int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2776{
2777 flag aSign;
94a49d86 2778 int_fast16_t aExp, shiftCount;
bb98fe42 2779 uint64_t aSig, savedASig;
b3a6a2e0 2780 int32_t z;
37d18660 2781 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2782
2783 aSig = extractFloat64Frac( a );
2784 aExp = extractFloat64Exp( a );
2785 aSign = extractFloat64Sign( a );
2786 if ( 0x41E < aExp ) {
2787 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2788 goto invalid;
2789 }
2790 else if ( aExp < 0x3FF ) {
2791 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2792 return 0;
2793 }
2794 aSig |= LIT64( 0x0010000000000000 );
2795 shiftCount = 0x433 - aExp;
2796 savedASig = aSig;
2797 aSig >>= shiftCount;
2798 z = aSig;
2799 if ( aSign ) z = - z;
2800 if ( ( z < 0 ) ^ aSign ) {
2801 invalid:
2802 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2803 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
2804 }
2805 if ( ( aSig<<shiftCount ) != savedASig ) {
2806 STATUS(float_exception_flags) |= float_flag_inexact;
2807 }
2808 return z;
2809
2810}
2811
cbcef455
PM
2812/*----------------------------------------------------------------------------
2813| Returns the result of converting the double-precision floating-point value
2814| `a' to the 16-bit two's complement integer format. The conversion is
2815| performed according to the IEC/IEEE Standard for Binary Floating-Point
2816| Arithmetic, except that the conversion is always rounded toward zero.
2817| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2818| the conversion overflows, the largest integer with the same sign as `a' is
2819| returned.
2820*----------------------------------------------------------------------------*/
2821
94a49d86 2822int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
2823{
2824 flag aSign;
94a49d86 2825 int_fast16_t aExp, shiftCount;
bb98fe42 2826 uint64_t aSig, savedASig;
cbcef455
PM
2827 int32 z;
2828
2829 aSig = extractFloat64Frac( a );
2830 aExp = extractFloat64Exp( a );
2831 aSign = extractFloat64Sign( a );
2832 if ( 0x40E < aExp ) {
2833 if ( ( aExp == 0x7FF ) && aSig ) {
2834 aSign = 0;
2835 }
2836 goto invalid;
2837 }
2838 else if ( aExp < 0x3FF ) {
2839 if ( aExp || aSig ) {
2840 STATUS(float_exception_flags) |= float_flag_inexact;
2841 }
2842 return 0;
2843 }
2844 aSig |= LIT64( 0x0010000000000000 );
2845 shiftCount = 0x433 - aExp;
2846 savedASig = aSig;
2847 aSig >>= shiftCount;
2848 z = aSig;
2849 if ( aSign ) {
2850 z = - z;
2851 }
2852 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2853 invalid:
2854 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2855 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
2856 }
2857 if ( ( aSig<<shiftCount ) != savedASig ) {
2858 STATUS(float_exception_flags) |= float_flag_inexact;
2859 }
2860 return z;
2861}
2862
158142c2
FB
2863/*----------------------------------------------------------------------------
2864| Returns the result of converting the double-precision floating-point value
2865| `a' to the 64-bit two's complement integer format. The conversion is
2866| performed according to the IEC/IEEE Standard for Binary Floating-Point
2867| Arithmetic---which means in particular that the conversion is rounded
2868| according to the current rounding mode. If `a' is a NaN, the largest
2869| positive integer is returned. Otherwise, if the conversion overflows, the
2870| largest integer with the same sign as `a' is returned.
2871*----------------------------------------------------------------------------*/
2872
2873int64 float64_to_int64( float64 a STATUS_PARAM )
2874{
2875 flag aSign;
94a49d86 2876 int_fast16_t aExp, shiftCount;
bb98fe42 2877 uint64_t aSig, aSigExtra;
37d18660 2878 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2879
2880 aSig = extractFloat64Frac( a );
2881 aExp = extractFloat64Exp( a );
2882 aSign = extractFloat64Sign( a );
2883 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2884 shiftCount = 0x433 - aExp;
2885 if ( shiftCount <= 0 ) {
2886 if ( 0x43E < aExp ) {
2887 float_raise( float_flag_invalid STATUS_VAR);
2888 if ( ! aSign
2889 || ( ( aExp == 0x7FF )
2890 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2891 ) {
2892 return LIT64( 0x7FFFFFFFFFFFFFFF );
2893 }
bb98fe42 2894 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2895 }
2896 aSigExtra = 0;
2897 aSig <<= - shiftCount;
2898 }
2899 else {
2900 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2901 }
2902 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2903
2904}
2905
2906/*----------------------------------------------------------------------------
2907| Returns the result of converting the double-precision floating-point value
2908| `a' to the 64-bit two's complement integer format. The conversion is
2909| performed according to the IEC/IEEE Standard for Binary Floating-Point
2910| Arithmetic, except that the conversion is always rounded toward zero.
2911| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2912| the conversion overflows, the largest integer with the same sign as `a' is
2913| returned.
2914*----------------------------------------------------------------------------*/
2915
2916int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2917{
2918 flag aSign;
94a49d86 2919 int_fast16_t aExp, shiftCount;
bb98fe42 2920 uint64_t aSig;
158142c2 2921 int64 z;
37d18660 2922 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2923
2924 aSig = extractFloat64Frac( a );
2925 aExp = extractFloat64Exp( a );
2926 aSign = extractFloat64Sign( a );
2927 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2928 shiftCount = aExp - 0x433;
2929 if ( 0 <= shiftCount ) {
2930 if ( 0x43E <= aExp ) {
f090c9d4 2931 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
158142c2
FB
2932 float_raise( float_flag_invalid STATUS_VAR);
2933 if ( ! aSign
2934 || ( ( aExp == 0x7FF )
2935 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2936 ) {
2937 return LIT64( 0x7FFFFFFFFFFFFFFF );
2938 }
2939 }
bb98fe42 2940 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2941 }
2942 z = aSig<<shiftCount;
2943 }
2944 else {
2945 if ( aExp < 0x3FE ) {
2946 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2947 return 0;
2948 }
2949 z = aSig>>( - shiftCount );
bb98fe42 2950 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
2951 STATUS(float_exception_flags) |= float_flag_inexact;
2952 }
2953 }
2954 if ( aSign ) z = - z;
2955 return z;
2956
2957}
2958
2959/*----------------------------------------------------------------------------
2960| Returns the result of converting the double-precision floating-point value
2961| `a' to the single-precision floating-point format. The conversion is
2962| performed according to the IEC/IEEE Standard for Binary Floating-Point
2963| Arithmetic.
2964*----------------------------------------------------------------------------*/
2965
2966float32 float64_to_float32( float64 a STATUS_PARAM )
2967{
2968 flag aSign;
94a49d86 2969 int_fast16_t aExp;
bb98fe42
AF
2970 uint64_t aSig;
2971 uint32_t zSig;
37d18660 2972 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2973
2974 aSig = extractFloat64Frac( a );
2975 aExp = extractFloat64Exp( a );
2976 aSign = extractFloat64Sign( a );
2977 if ( aExp == 0x7FF ) {
bcd4d9af 2978 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
2979 return packFloat32( aSign, 0xFF, 0 );
2980 }
2981 shift64RightJamming( aSig, 22, &aSig );
2982 zSig = aSig;
2983 if ( aExp || zSig ) {
2984 zSig |= 0x40000000;
2985 aExp -= 0x381;
2986 }
2987 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2988
2989}
2990
60011498
PB
2991
2992/*----------------------------------------------------------------------------
2993| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2994| half-precision floating-point value, returning the result. After being
2995| shifted into the proper positions, the three fields are simply added
2996| together to form the result. This means that any integer portion of `zSig'
2997| will be added into the exponent. Since a properly normalized significand
2998| will have an integer portion equal to 1, the `zExp' input should be 1 less
2999| than the desired result exponent whenever `zSig' is a complete, normalized
3000| significand.
3001*----------------------------------------------------------------------------*/
94a49d86 3002static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
60011498 3003{
bb4d4bb3 3004 return make_float16(
bb98fe42 3005 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3006}
3007
3008/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3009 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3
PM
3010
3011float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
60011498
PB
3012{
3013 flag aSign;
94a49d86 3014 int_fast16_t aExp;
bb98fe42 3015 uint32_t aSig;
60011498 3016
bb4d4bb3
PM
3017 aSign = extractFloat16Sign(a);
3018 aExp = extractFloat16Exp(a);
3019 aSig = extractFloat16Frac(a);
60011498
PB
3020
3021 if (aExp == 0x1f && ieee) {
3022 if (aSig) {
f591e1be 3023 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3024 }
4be8eeac 3025 return packFloat32(aSign, 0xff, 0);
60011498
PB
3026 }
3027 if (aExp == 0) {
3028 int8 shiftCount;
3029
3030 if (aSig == 0) {
3031 return packFloat32(aSign, 0, 0);
3032 }
3033
3034 shiftCount = countLeadingZeros32( aSig ) - 21;
3035 aSig = aSig << shiftCount;
3036 aExp = -shiftCount;
3037 }
3038 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3039}
3040
bb4d4bb3 3041float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
60011498
PB
3042{
3043 flag aSign;
94a49d86 3044 int_fast16_t aExp;
bb98fe42
AF
3045 uint32_t aSig;
3046 uint32_t mask;
3047 uint32_t increment;
60011498 3048 int8 roundingMode;
37d18660 3049 a = float32_squash_input_denormal(a STATUS_VAR);
60011498
PB
3050
3051 aSig = extractFloat32Frac( a );
3052 aExp = extractFloat32Exp( a );
3053 aSign = extractFloat32Sign( a );
3054 if ( aExp == 0xFF ) {
3055 if (aSig) {
600e30d2
PM
3056 /* Input is a NaN */
3057 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3058 if (!ieee) {
3059 return packFloat16(aSign, 0, 0);
3060 }
3061 return r;
60011498 3062 }
600e30d2
PM
3063 /* Infinity */
3064 if (!ieee) {
3065 float_raise(float_flag_invalid STATUS_VAR);
3066 return packFloat16(aSign, 0x1f, 0x3ff);
3067 }
3068 return packFloat16(aSign, 0x1f, 0);
60011498 3069 }
600e30d2 3070 if (aExp == 0 && aSig == 0) {
60011498
PB
3071 return packFloat16(aSign, 0, 0);
3072 }
3073 /* Decimal point between bits 22 and 23. */
3074 aSig |= 0x00800000;
3075 aExp -= 0x7f;
3076 if (aExp < -14) {
600e30d2
PM
3077 mask = 0x00ffffff;
3078 if (aExp >= -24) {
3079 mask >>= 25 + aExp;
60011498
PB
3080 }
3081 } else {
3082 mask = 0x00001fff;
3083 }
3084 if (aSig & mask) {
3085 float_raise( float_flag_underflow STATUS_VAR );
3086 roundingMode = STATUS(float_rounding_mode);
3087 switch (roundingMode) {
3088 case float_round_nearest_even:
3089 increment = (mask + 1) >> 1;
3090 if ((aSig & mask) == increment) {
3091 increment = aSig & (increment << 1);
3092 }
3093 break;
3094 case float_round_up:
3095 increment = aSign ? 0 : mask;
3096 break;
3097 case float_round_down:
3098 increment = aSign ? mask : 0;
3099 break;
3100 default: /* round_to_zero */
3101 increment = 0;
3102 break;
3103 }
3104 aSig += increment;
3105 if (aSig >= 0x01000000) {
3106 aSig >>= 1;
3107 aExp++;
3108 }
3109 } else if (aExp < -14
3110 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
3111 float_raise( float_flag_underflow STATUS_VAR);
3112 }
3113
3114 if (ieee) {
3115 if (aExp > 15) {
3116 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
3117 return packFloat16(aSign, 0x1f, 0);
3118 }
3119 } else {
3120 if (aExp > 16) {
600e30d2 3121 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
60011498
PB
3122 return packFloat16(aSign, 0x1f, 0x3ff);
3123 }
3124 }
3125 if (aExp < -24) {
3126 return packFloat16(aSign, 0, 0);
3127 }
3128 if (aExp < -14) {
3129 aSig >>= -14 - aExp;
3130 aExp = -14;
3131 }
3132 return packFloat16(aSign, aExp + 14, aSig >> 13);
3133}
3134
158142c2
FB
3135/*----------------------------------------------------------------------------
3136| Returns the result of converting the double-precision floating-point value
3137| `a' to the extended double-precision floating-point format. The conversion
3138| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3139| Arithmetic.
3140*----------------------------------------------------------------------------*/
3141
3142floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3143{
3144 flag aSign;
94a49d86 3145 int_fast16_t aExp;
bb98fe42 3146 uint64_t aSig;
158142c2 3147
37d18660 3148 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3149 aSig = extractFloat64Frac( a );
3150 aExp = extractFloat64Exp( a );
3151 aSign = extractFloat64Sign( a );
3152 if ( aExp == 0x7FF ) {
bcd4d9af 3153 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3154 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3155 }
3156 if ( aExp == 0 ) {
3157 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3158 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3159 }
3160 return
3161 packFloatx80(
3162 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3163
3164}
3165
158142c2
FB
3166/*----------------------------------------------------------------------------
3167| Returns the result of converting the double-precision floating-point value
3168| `a' to the quadruple-precision floating-point format. The conversion is
3169| performed according to the IEC/IEEE Standard for Binary Floating-Point
3170| Arithmetic.
3171*----------------------------------------------------------------------------*/
3172
3173float128 float64_to_float128( float64 a STATUS_PARAM )
3174{
3175 flag aSign;
94a49d86 3176 int_fast16_t aExp;
bb98fe42 3177 uint64_t aSig, zSig0, zSig1;
158142c2 3178
37d18660 3179 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3180 aSig = extractFloat64Frac( a );
3181 aExp = extractFloat64Exp( a );
3182 aSign = extractFloat64Sign( a );
3183 if ( aExp == 0x7FF ) {
bcd4d9af 3184 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3185 return packFloat128( aSign, 0x7FFF, 0, 0 );
3186 }
3187 if ( aExp == 0 ) {
3188 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3189 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3190 --aExp;
3191 }
3192 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3193 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3194
3195}
3196
158142c2
FB
3197/*----------------------------------------------------------------------------
3198| Rounds the double-precision floating-point value `a' to an integer, and
3199| returns the result as a double-precision floating-point value. The
3200| operation is performed according to the IEC/IEEE Standard for Binary
3201| Floating-Point Arithmetic.
3202*----------------------------------------------------------------------------*/
3203
3204float64 float64_round_to_int( float64 a STATUS_PARAM )
3205{
3206 flag aSign;
94a49d86 3207 int_fast16_t aExp;
bb98fe42 3208 uint64_t lastBitMask, roundBitsMask;
158142c2 3209 int8 roundingMode;
bb98fe42 3210 uint64_t z;
37d18660 3211 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3212
3213 aExp = extractFloat64Exp( a );
3214 if ( 0x433 <= aExp ) {
3215 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3216 return propagateFloat64NaN( a, a STATUS_VAR );
3217 }
3218 return a;
3219 }
3220 if ( aExp < 0x3FF ) {
bb98fe42 3221 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
158142c2
FB
3222 STATUS(float_exception_flags) |= float_flag_inexact;
3223 aSign = extractFloat64Sign( a );
3224 switch ( STATUS(float_rounding_mode) ) {
3225 case float_round_nearest_even:
3226 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3227 return packFloat64( aSign, 0x3FF, 0 );
3228 }
3229 break;
3230 case float_round_down:
f090c9d4 3231 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3232 case float_round_up:
f090c9d4
PB
3233 return make_float64(
3234 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3235 }
3236 return packFloat64( aSign, 0, 0 );
3237 }
3238 lastBitMask = 1;
3239 lastBitMask <<= 0x433 - aExp;
3240 roundBitsMask = lastBitMask - 1;
f090c9d4 3241 z = float64_val(a);
158142c2
FB
3242 roundingMode = STATUS(float_rounding_mode);
3243 if ( roundingMode == float_round_nearest_even ) {
3244 z += lastBitMask>>1;
3245 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3246 }
3247 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 3248 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
3249 z += roundBitsMask;
3250 }
3251 }
3252 z &= ~ roundBitsMask;
f090c9d4
PB
3253 if ( z != float64_val(a) )
3254 STATUS(float_exception_flags) |= float_flag_inexact;
3255 return make_float64(z);
158142c2
FB
3256
3257}
3258
e6e5906b
PB
3259float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3260{
3261 int oldmode;
3262 float64 res;
3263 oldmode = STATUS(float_rounding_mode);
3264 STATUS(float_rounding_mode) = float_round_to_zero;
3265 res = float64_round_to_int(a STATUS_VAR);
3266 STATUS(float_rounding_mode) = oldmode;
3267 return res;
3268}
3269
158142c2
FB
3270/*----------------------------------------------------------------------------
3271| Returns the result of adding the absolute values of the double-precision
3272| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3273| before being returned. `zSign' is ignored if the result is a NaN.
3274| The addition is performed according to the IEC/IEEE Standard for Binary
3275| Floating-Point Arithmetic.
3276*----------------------------------------------------------------------------*/
3277
3278static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3279{
94a49d86 3280 int_fast16_t aExp, bExp, zExp;
bb98fe42 3281 uint64_t aSig, bSig, zSig;
94a49d86 3282 int_fast16_t expDiff;
158142c2
FB
3283
3284 aSig = extractFloat64Frac( a );
3285 aExp = extractFloat64Exp( a );
3286 bSig = extractFloat64Frac( b );
3287 bExp = extractFloat64Exp( b );
3288 expDiff = aExp - bExp;
3289 aSig <<= 9;
3290 bSig <<= 9;
3291 if ( 0 < expDiff ) {
3292 if ( aExp == 0x7FF ) {
3293 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3294 return a;
3295 }
3296 if ( bExp == 0 ) {
3297 --expDiff;
3298 }
3299 else {
3300 bSig |= LIT64( 0x2000000000000000 );
3301 }
3302 shift64RightJamming( bSig, expDiff, &bSig );
3303 zExp = aExp;
3304 }
3305 else if ( expDiff < 0 ) {
3306 if ( bExp == 0x7FF ) {
3307 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3308 return packFloat64( zSign, 0x7FF, 0 );
3309 }
3310 if ( aExp == 0 ) {
3311 ++expDiff;
3312 }
3313 else {
3314 aSig |= LIT64( 0x2000000000000000 );
3315 }
3316 shift64RightJamming( aSig, - expDiff, &aSig );
3317 zExp = bExp;
3318 }
3319 else {
3320 if ( aExp == 0x7FF ) {
3321 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3322 return a;
3323 }
fe76d976 3324 if ( aExp == 0 ) {
e6afc87f
PM
3325 if (STATUS(flush_to_zero)) {
3326 if (aSig | bSig) {
3327 float_raise(float_flag_output_denormal STATUS_VAR);
3328 }
3329 return packFloat64(zSign, 0, 0);
3330 }
fe76d976
PB
3331 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3332 }
158142c2
FB
3333 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3334 zExp = aExp;
3335 goto roundAndPack;
3336 }
3337 aSig |= LIT64( 0x2000000000000000 );
3338 zSig = ( aSig + bSig )<<1;
3339 --zExp;
bb98fe42 3340 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3341 zSig = aSig + bSig;
3342 ++zExp;
3343 }
3344 roundAndPack:
3345 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3346
3347}
3348
3349/*----------------------------------------------------------------------------
3350| Returns the result of subtracting the absolute values of the double-
3351| precision floating-point values `a' and `b'. If `zSign' is 1, the
3352| difference is negated before being returned. `zSign' is ignored if the
3353| result is a NaN. The subtraction is performed according to the IEC/IEEE
3354| Standard for Binary Floating-Point Arithmetic.
3355*----------------------------------------------------------------------------*/
3356
3357static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3358{
94a49d86 3359 int_fast16_t aExp, bExp, zExp;
bb98fe42 3360 uint64_t aSig, bSig, zSig;
94a49d86 3361 int_fast16_t expDiff;
158142c2
FB
3362
3363 aSig = extractFloat64Frac( a );
3364 aExp = extractFloat64Exp( a );
3365 bSig = extractFloat64Frac( b );
3366 bExp = extractFloat64Exp( b );
3367 expDiff = aExp - bExp;
3368 aSig <<= 10;
3369 bSig <<= 10;
3370 if ( 0 < expDiff ) goto aExpBigger;
3371 if ( expDiff < 0 ) goto bExpBigger;
3372 if ( aExp == 0x7FF ) {
3373 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3374 float_raise( float_flag_invalid STATUS_VAR);
3375 return float64_default_nan;
3376 }
3377 if ( aExp == 0 ) {
3378 aExp = 1;
3379 bExp = 1;
3380 }
3381 if ( bSig < aSig ) goto aBigger;
3382 if ( aSig < bSig ) goto bBigger;
3383 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3384 bExpBigger:
3385 if ( bExp == 0x7FF ) {
3386 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3387 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3388 }
3389 if ( aExp == 0 ) {
3390 ++expDiff;
3391 }
3392 else {
3393 aSig |= LIT64( 0x4000000000000000 );
3394 }
3395 shift64RightJamming( aSig, - expDiff, &aSig );
3396 bSig |= LIT64( 0x4000000000000000 );
3397 bBigger:
3398 zSig = bSig - aSig;
3399 zExp = bExp;
3400 zSign ^= 1;
3401 goto normalizeRoundAndPack;
3402 aExpBigger:
3403 if ( aExp == 0x7FF ) {
3404 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3405 return a;
3406 }
3407 if ( bExp == 0 ) {
3408 --expDiff;
3409 }
3410 else {
3411 bSig |= LIT64( 0x4000000000000000 );
3412 }
3413 shift64RightJamming( bSig, expDiff, &bSig );
3414 aSig |= LIT64( 0x4000000000000000 );
3415 aBigger:
3416 zSig = aSig - bSig;
3417 zExp = aExp;
3418 normalizeRoundAndPack:
3419 --zExp;
3420 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3421
3422}
3423
3424/*----------------------------------------------------------------------------
3425| Returns the result of adding the double-precision floating-point values `a'
3426| and `b'. The operation is performed according to the IEC/IEEE Standard for
3427| Binary Floating-Point Arithmetic.
3428*----------------------------------------------------------------------------*/
3429
3430float64 float64_add( float64 a, float64 b STATUS_PARAM )
3431{
3432 flag aSign, bSign;
37d18660
PM
3433 a = float64_squash_input_denormal(a STATUS_VAR);
3434 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3435
3436 aSign = extractFloat64Sign( a );
3437 bSign = extractFloat64Sign( b );
3438 if ( aSign == bSign ) {
3439 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3440 }
3441 else {
3442 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3443 }
3444
3445}
3446
3447/*----------------------------------------------------------------------------
3448| Returns the result of subtracting the double-precision floating-point values
3449| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3450| for Binary Floating-Point Arithmetic.
3451*----------------------------------------------------------------------------*/
3452
3453float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3454{
3455 flag aSign, bSign;
37d18660
PM
3456 a = float64_squash_input_denormal(a STATUS_VAR);
3457 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3458
3459 aSign = extractFloat64Sign( a );
3460 bSign = extractFloat64Sign( b );
3461 if ( aSign == bSign ) {
3462 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3463 }
3464 else {
3465 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3466 }
3467
3468}
3469
3470/*----------------------------------------------------------------------------
3471| Returns the result of multiplying the double-precision floating-point values
3472| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3473| for Binary Floating-Point Arithmetic.
3474*----------------------------------------------------------------------------*/
3475
3476float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3477{
3478 flag aSign, bSign, zSign;
94a49d86 3479 int_fast16_t aExp, bExp, zExp;
bb98fe42 3480 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3481
37d18660
PM
3482 a = float64_squash_input_denormal(a STATUS_VAR);
3483 b = float64_squash_input_denormal(b STATUS_VAR);
3484
158142c2
FB
3485 aSig = extractFloat64Frac( a );
3486 aExp = extractFloat64Exp( a );
3487 aSign = extractFloat64Sign( a );
3488 bSig = extractFloat64Frac( b );
3489 bExp = extractFloat64Exp( b );
3490 bSign = extractFloat64Sign( b );
3491 zSign = aSign ^ bSign;
3492 if ( aExp == 0x7FF ) {
3493 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3494 return propagateFloat64NaN( a, b STATUS_VAR );
3495 }
3496 if ( ( bExp | bSig ) == 0 ) {
3497 float_raise( float_flag_invalid STATUS_VAR);
3498 return float64_default_nan;
3499 }
3500 return packFloat64( zSign, 0x7FF, 0 );
3501 }
3502 if ( bExp == 0x7FF ) {
3503 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3504 if ( ( aExp | aSig ) == 0 ) {
3505 float_raise( float_flag_invalid STATUS_VAR);
3506 return float64_default_nan;
3507 }
3508 return packFloat64( zSign, 0x7FF, 0 );
3509 }
3510 if ( aExp == 0 ) {
3511 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3512 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3513 }
3514 if ( bExp == 0 ) {
3515 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3516 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3517 }
3518 zExp = aExp + bExp - 0x3FF;
3519 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3520 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3521 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3522 zSig0 |= ( zSig1 != 0 );
bb98fe42 3523 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
3524 zSig0 <<= 1;
3525 --zExp;
3526 }
3527 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3528
3529}
3530
3531/*----------------------------------------------------------------------------
3532| Returns the result of dividing the double-precision floating-point value `a'
3533| by the corresponding value `b'. The operation is performed according to
3534| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3535*----------------------------------------------------------------------------*/
3536
3537float64 float64_div( float64 a, float64 b STATUS_PARAM )
3538{
3539 flag aSign, bSign, zSign;
94a49d86 3540 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
3541 uint64_t aSig, bSig, zSig;
3542 uint64_t rem0, rem1;
3543 uint64_t term0, term1;
37d18660
PM
3544 a = float64_squash_input_denormal(a STATUS_VAR);
3545 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3546
3547 aSig = extractFloat64Frac( a );
3548 aExp = extractFloat64Exp( a );
3549 aSign = extractFloat64Sign( a );
3550 bSig = extractFloat64Frac( b );
3551 bExp = extractFloat64Exp( b );
3552 bSign = extractFloat64Sign( b );
3553 zSign = aSign ^ bSign;
3554 if ( aExp == 0x7FF ) {
3555 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3556 if ( bExp == 0x7FF ) {
3557 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3558 float_raise( float_flag_invalid STATUS_VAR);
3559 return float64_default_nan;
3560 }
3561 return packFloat64( zSign, 0x7FF, 0 );
3562 }
3563 if ( bExp == 0x7FF ) {
3564 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3565 return packFloat64( zSign, 0, 0 );
3566 }
3567 if ( bExp == 0 ) {
3568 if ( bSig == 0 ) {
3569 if ( ( aExp | aSig ) == 0 ) {
3570 float_raise( float_flag_invalid STATUS_VAR);
3571 return float64_default_nan;
3572 }
3573 float_raise( float_flag_divbyzero STATUS_VAR);
3574 return packFloat64( zSign, 0x7FF, 0 );
3575 }
3576 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3577 }
3578 if ( aExp == 0 ) {
3579 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3580 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3581 }
3582 zExp = aExp - bExp + 0x3FD;
3583 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3584 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3585 if ( bSig <= ( aSig + aSig ) ) {
3586 aSig >>= 1;
3587 ++zExp;
3588 }
3589 zSig = estimateDiv128To64( aSig, 0, bSig );
3590 if ( ( zSig & 0x1FF ) <= 2 ) {
3591 mul64To128( bSig, zSig, &term0, &term1 );
3592 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3593 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3594 --zSig;
3595 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3596 }
3597 zSig |= ( rem1 != 0 );
3598 }
3599 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3600
3601}
3602
3603/*----------------------------------------------------------------------------
3604| Returns the remainder of the double-precision floating-point value `a'
3605| with respect to the corresponding value `b'. The operation is performed
3606| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3607*----------------------------------------------------------------------------*/
3608
3609float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3610{
ed086f3d 3611 flag aSign, zSign;
94a49d86 3612 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
3613 uint64_t aSig, bSig;
3614 uint64_t q, alternateASig;
3615 int64_t sigMean;
158142c2 3616
37d18660
PM
3617 a = float64_squash_input_denormal(a STATUS_VAR);
3618 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3619 aSig = extractFloat64Frac( a );
3620 aExp = extractFloat64Exp( a );
3621 aSign = extractFloat64Sign( a );
3622 bSig = extractFloat64Frac( b );
3623 bExp = extractFloat64Exp( b );
158142c2
FB
3624 if ( aExp == 0x7FF ) {
3625 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3626 return propagateFloat64NaN( a, b STATUS_VAR );
3627 }
3628 float_raise( float_flag_invalid STATUS_VAR);
3629 return float64_default_nan;
3630 }
3631 if ( bExp == 0x7FF ) {
3632 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3633 return a;
3634 }
3635 if ( bExp == 0 ) {
3636 if ( bSig == 0 ) {
3637 float_raise( float_flag_invalid STATUS_VAR);
3638 return float64_default_nan;
3639 }
3640 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3641 }
3642 if ( aExp == 0 ) {
3643 if ( aSig == 0 ) return a;
3644 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3645 }
3646 expDiff = aExp - bExp;
3647 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3648 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3649 if ( expDiff < 0 ) {
3650 if ( expDiff < -1 ) return a;
3651 aSig >>= 1;
3652 }
3653 q = ( bSig <= aSig );
3654 if ( q ) aSig -= bSig;
3655 expDiff -= 64;
3656 while ( 0 < expDiff ) {
3657 q = estimateDiv128To64( aSig, 0, bSig );
3658 q = ( 2 < q ) ? q - 2 : 0;
3659 aSig = - ( ( bSig>>2 ) * q );
3660 expDiff -= 62;
3661 }
3662 expDiff += 64;
3663 if ( 0 < expDiff ) {
3664 q = estimateDiv128To64( aSig, 0, bSig );
3665 q = ( 2 < q ) ? q - 2 : 0;
3666 q >>= 64 - expDiff;
3667 bSig >>= 2;
3668 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3669 }
3670 else {
3671 aSig >>= 2;
3672 bSig >>= 2;
3673 }
3674 do {
3675 alternateASig = aSig;
3676 ++q;
3677 aSig -= bSig;
bb98fe42 3678 } while ( 0 <= (int64_t) aSig );
158142c2
FB
3679 sigMean = aSig + alternateASig;
3680 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3681 aSig = alternateASig;
3682 }
bb98fe42 3683 zSign = ( (int64_t) aSig < 0 );
158142c2
FB
3684 if ( zSign ) aSig = - aSig;
3685 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3686
3687}
3688
369be8f6
PM
3689/*----------------------------------------------------------------------------
3690| Returns the result of multiplying the double-precision floating-point values
3691| `a' and `b' then adding 'c', with no intermediate rounding step after the
3692| multiplication. The operation is performed according to the IEC/IEEE
3693| Standard for Binary Floating-Point Arithmetic 754-2008.
3694| The flags argument allows the caller to select negation of the
3695| addend, the intermediate product, or the final result. (The difference
3696| between this and having the caller do a separate negation is that negating
3697| externally will flip the sign bit on NaNs.)
3698*----------------------------------------------------------------------------*/
3699
3700float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3701{
3702 flag aSign, bSign, cSign, zSign;
94a49d86 3703 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
3704 uint64_t aSig, bSig, cSig;
3705 flag pInf, pZero, pSign;
3706 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3707 int shiftcount;
3708 flag signflip, infzero;
3709
3710 a = float64_squash_input_denormal(a STATUS_VAR);
3711 b = float64_squash_input_denormal(b STATUS_VAR);
3712 c = float64_squash_input_denormal(c STATUS_VAR);
3713 aSig = extractFloat64Frac(a);
3714 aExp = extractFloat64Exp(a);
3715 aSign = extractFloat64Sign(a);
3716 bSig = extractFloat64Frac(b);
3717 bExp = extractFloat64Exp(b);
3718 bSign = extractFloat64Sign(b);
3719 cSig = extractFloat64Frac(c);
3720 cExp = extractFloat64Exp(c);
3721 cSign = extractFloat64Sign(c);
3722
3723 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3724 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3725
3726 /* It is implementation-defined whether the cases of (0,inf,qnan)
3727 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3728 * they return if they do), so we have to hand this information
3729 * off to the target-specific pick-a-NaN routine.
3730 */
3731 if (((aExp == 0x7ff) && aSig) ||
3732 ((bExp == 0x7ff) && bSig) ||
3733 ((cExp == 0x7ff) && cSig)) {
3734 return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3735 }
3736
3737 if (infzero) {
3738 float_raise(float_flag_invalid STATUS_VAR);
3739 return float64_default_nan;
3740 }
3741
3742 if (flags & float_muladd_negate_c) {
3743 cSign ^= 1;
3744 }
3745
3746 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3747
3748 /* Work out the sign and type of the product */
3749 pSign = aSign ^ bSign;
3750 if (flags & float_muladd_negate_product) {
3751 pSign ^= 1;
3752 }
3753 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3754 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3755
3756 if (cExp == 0x7ff) {
3757 if (pInf && (pSign ^ cSign)) {
3758 /* addition of opposite-signed infinities => InvalidOperation */
3759 float_raise(float_flag_invalid STATUS_VAR);
3760 return float64_default_nan;
3761 }
3762 /* Otherwise generate an infinity of the same sign */
3763 return packFloat64(cSign ^ signflip, 0x7ff, 0);
3764 }
3765
3766 if (pInf) {
3767 return packFloat64(pSign ^ signflip, 0x7ff, 0);
3768 }
3769
3770 if (pZero) {
3771 if (cExp == 0) {
3772 if (cSig == 0) {
3773 /* Adding two exact zeroes */
3774 if (pSign == cSign) {
3775 zSign = pSign;
3776 } else if (STATUS(float_rounding_mode) == float_round_down) {
3777 zSign = 1;
3778 } else {
3779 zSign = 0;
3780 }
3781 return packFloat64(zSign ^ signflip, 0, 0);
3782 }
3783 /* Exact zero plus a denorm */
3784 if (STATUS(flush_to_zero)) {
3785 float_raise(float_flag_output_denormal STATUS_VAR);
3786 return packFloat64(cSign ^ signflip, 0, 0);
3787 }
3788 }
3789 /* Zero plus something non-zero : just return the something */
a6e7c184 3790 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
3791 }
3792
3793 if (aExp == 0) {
3794 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3795 }
3796 if (bExp == 0) {
3797 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3798 }
3799
3800 /* Calculate the actual result a * b + c */
3801
3802 /* Multiply first; this is easy. */
3803 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3804 * because we want the true exponent, not the "one-less-than"
3805 * flavour that roundAndPackFloat64() takes.
3806 */
3807 pExp = aExp + bExp - 0x3fe;
3808 aSig = (aSig | LIT64(0x0010000000000000))<<10;
3809 bSig = (bSig | LIT64(0x0010000000000000))<<11;
3810 mul64To128(aSig, bSig, &pSig0, &pSig1);
3811 if ((int64_t)(pSig0 << 1) >= 0) {
3812 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3813 pExp--;
3814 }
3815
3816 zSign = pSign ^ signflip;
3817
3818 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3819 * bit in position 126.
3820 */
3821 if (cExp == 0) {
3822 if (!cSig) {
3823 /* Throw out the special case of c being an exact zero now */
3824 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3825 return roundAndPackFloat64(zSign, pExp - 1,
3826 pSig1 STATUS_VAR);
3827 }
3828 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3829 }
3830
3831 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3832 * significand of the addend, with the explicit bit in position 126.
3833 */
3834 cSig0 = cSig << (126 - 64 - 52);
3835 cSig1 = 0;
3836 cSig0 |= LIT64(0x4000000000000000);
3837 expDiff = pExp - cExp;
3838
3839 if (pSign == cSign) {
3840 /* Addition */
3841 if (expDiff > 0) {
3842 /* scale c to match p */
3843 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3844 zExp = pExp;
3845 } else if (expDiff < 0) {
3846 /* scale p to match c */
3847 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3848 zExp = cExp;
3849 } else {
3850 /* no scaling needed */
3851 zExp = cExp;
3852 }
3853 /* Add significands and make sure explicit bit ends up in posn 126 */
3854 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3855 if ((int64_t)zSig0 < 0) {
3856 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3857 } else {
3858 zExp--;
3859 }
3860 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3861 return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3862 } else {
3863 /* Subtraction */
3864 if (expDiff > 0) {
3865 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3866 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3867 zExp = pExp;
3868 } else if (expDiff < 0) {
3869 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3870 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3871 zExp = cExp;
3872 zSign ^= 1;
3873 } else {
3874 zExp = pExp;
3875 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3876 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3877 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3878 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3879 zSign ^= 1;
3880 } else {
3881 /* Exact zero */
3882 zSign = signflip;
3883 if (STATUS(float_rounding_mode) == float_round_down) {
3884 zSign ^= 1;
3885 }
3886 return packFloat64(zSign, 0, 0);
3887 }
3888 }
3889 --zExp;
3890 /* Do the equivalent of normalizeRoundAndPackFloat64() but
3891 * starting with the significand in a pair of uint64_t.
3892 */
3893 if (zSig0) {
3894 shiftcount = countLeadingZeros64(zSig0) - 1;
3895 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
3896 if (zSig1) {
3897 zSig0 |= 1;
3898 }
3899 zExp -= shiftcount;
3900 } else {
e3d142d0
PM
3901 shiftcount = countLeadingZeros64(zSig1);
3902 if (shiftcount == 0) {
3903 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
3904 zExp -= 63;
3905 } else {
3906 shiftcount--;
3907 zSig0 = zSig1 << shiftcount;
3908 zExp -= (shiftcount + 64);
3909 }
369be8f6
PM
3910 }
3911 return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
3912 }
3913}
3914
158142c2
FB
3915/*----------------------------------------------------------------------------
3916| Returns the square root of the double-precision floating-point value `a'.
3917| The operation is performed according to the IEC/IEEE Standard for Binary
3918| Floating-Point Arithmetic.
3919*----------------------------------------------------------------------------*/
3920
3921float64 float64_sqrt( float64 a STATUS_PARAM )
3922{
3923 flag aSign;
94a49d86 3924 int_fast16_t aExp, zExp;
bb98fe42
AF
3925 uint64_t aSig, zSig, doubleZSig;
3926 uint64_t rem0, rem1, term0, term1;
37d18660 3927 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3928
3929 aSig = extractFloat64Frac( a );
3930 aExp = extractFloat64Exp( a );
3931 aSign = extractFloat64Sign( a );
3932 if ( aExp == 0x7FF ) {
3933 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3934 if ( ! aSign ) return a;
3935 float_raise( float_flag_invalid STATUS_VAR);
3936 return float64_default_nan;
3937 }
3938 if ( aSign ) {
3939 if ( ( aExp | aSig ) == 0 ) return a;
3940 float_raise( float_flag_invalid STATUS_VAR);
3941 return float64_default_nan;
3942 }
3943 if ( aExp == 0 ) {
f090c9d4 3944 if ( aSig == 0 ) return float64_zero;
158142c2
FB
3945 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3946 }
3947 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3948 aSig |= LIT64( 0x0010000000000000 );
3949 zSig = estimateSqrt32( aExp, aSig>>21 );
3950 aSig <<= 9 - ( aExp & 1 );
3951 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3952 if ( ( zSig & 0x1FF ) <= 5 ) {
3953 doubleZSig = zSig<<1;
3954 mul64To128( zSig, zSig, &term0, &term1 );
3955 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3956 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3957 --zSig;
3958 doubleZSig -= 2;
3959 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3960 }
3961 zSig |= ( ( rem0 | rem1 ) != 0 );
3962 }
3963 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3964
3965}
3966
374dfc33
AJ
3967/*----------------------------------------------------------------------------
3968| Returns the binary log of the double-precision floating-point value `a'.
3969| The operation is performed according to the IEC/IEEE Standard for Binary
3970| Floating-Point Arithmetic.
3971*----------------------------------------------------------------------------*/
3972float64 float64_log2( float64 a STATUS_PARAM )
3973{
3974 flag aSign, zSign;
94a49d86 3975 int_fast16_t aExp;
bb98fe42 3976 uint64_t aSig, aSig0, aSig1, zSig, i;
37d18660 3977 a = float64_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
3978
3979 aSig = extractFloat64Frac( a );
3980 aExp = extractFloat64Exp( a );
3981 aSign = extractFloat64Sign( a );
3982
3983 if ( aExp == 0 ) {
3984 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3985 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3986 }
3987 if ( aSign ) {
3988 float_raise( float_flag_invalid STATUS_VAR);
3989 return float64_default_nan;
3990 }
3991 if ( aExp == 0x7FF ) {
3992 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3993 return a;
3994 }
3995
3996 aExp -= 0x3FF;
3997 aSig |= LIT64( 0x0010000000000000 );
3998 zSign = aExp < 0;
bb98fe42 3999 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4000 for (i = 1LL << 51; i > 0; i >>= 1) {
4001 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4002 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4003 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4004 aSig >>= 1;
4005 zSig |= i;
4006 }
4007 }
4008
4009 if ( zSign )
4010 zSig = -zSig;
4011 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4012}
4013
158142c2
FB
4014/*----------------------------------------------------------------------------
4015| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4016| corresponding value `b', and 0 otherwise. The invalid exception is raised
4017| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4018| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4019*----------------------------------------------------------------------------*/
4020
b689362d 4021int float64_eq( float64 a, float64 b STATUS_PARAM )
158142c2 4022{
bb98fe42 4023 uint64_t av, bv;
37d18660
PM
4024 a = float64_squash_input_denormal(a STATUS_VAR);
4025 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4026
4027 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4028 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4029 ) {
b689362d 4030 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
4031 return 0;
4032 }
f090c9d4 4033 av = float64_val(a);
a1b91bb4 4034 bv = float64_val(b);
bb98fe42 4035 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4036
4037}
4038
4039/*----------------------------------------------------------------------------
4040| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4041| equal to the corresponding value `b', and 0 otherwise. The invalid
4042| exception is raised if either operand is a NaN. The comparison is performed
4043| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4044*----------------------------------------------------------------------------*/
4045
750afe93 4046int float64_le( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4047{
4048 flag aSign, bSign;
bb98fe42 4049 uint64_t av, bv;
37d18660
PM
4050 a = float64_squash_input_denormal(a STATUS_VAR);
4051 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4052
4053 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4054 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4055 ) {
4056 float_raise( float_flag_invalid STATUS_VAR);
4057 return 0;
4058 }
4059 aSign = extractFloat64Sign( a );
4060 bSign = extractFloat64Sign( b );
f090c9d4 4061 av = float64_val(a);
a1b91bb4 4062 bv = float64_val(b);
bb98fe42 4063 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4064 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4065
4066}
4067
4068/*----------------------------------------------------------------------------
4069| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4070| the corresponding value `b', and 0 otherwise. The invalid exception is
4071| raised if either operand is a NaN. The comparison is performed according
4072| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4073*----------------------------------------------------------------------------*/
4074
750afe93 4075int float64_lt( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4076{
4077 flag aSign, bSign;
bb98fe42 4078 uint64_t av, bv;
158142c2 4079
37d18660
PM
4080 a = float64_squash_input_denormal(a STATUS_VAR);
4081 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4082 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4083 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4084 ) {
4085 float_raise( float_flag_invalid STATUS_VAR);
4086 return 0;
4087 }
4088 aSign = extractFloat64Sign( a );
4089 bSign = extractFloat64Sign( b );
f090c9d4 4090 av = float64_val(a);
a1b91bb4 4091 bv = float64_val(b);
bb98fe42 4092 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4093 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4094
4095}
4096
67b7861d
AJ
4097/*----------------------------------------------------------------------------
4098| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4099| be compared, and 0 otherwise. The invalid exception is raised if either
4100| operand is a NaN. The comparison is performed according to the IEC/IEEE
4101| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4102*----------------------------------------------------------------------------*/
4103
4104int float64_unordered( float64 a, float64 b STATUS_PARAM )
4105{
4106 a = float64_squash_input_denormal(a STATUS_VAR);
4107 b = float64_squash_input_denormal(b STATUS_VAR);
4108
4109 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4110 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4111 ) {
4112 float_raise( float_flag_invalid STATUS_VAR);
4113 return 1;
4114 }
4115 return 0;
4116}
4117
158142c2
FB
4118/*----------------------------------------------------------------------------
4119| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4120| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4121| exception. The comparison is performed according to the IEC/IEEE Standard
4122| for Binary Floating-Point Arithmetic.
158142c2
FB
4123*----------------------------------------------------------------------------*/
4124
b689362d 4125int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
158142c2 4126{
bb98fe42 4127 uint64_t av, bv;
37d18660
PM
4128 a = float64_squash_input_denormal(a STATUS_VAR);
4129 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4130
4131 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4132 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4133 ) {
b689362d
AJ
4134 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4135 float_raise( float_flag_invalid STATUS_VAR);
4136 }
158142c2
FB
4137 return 0;
4138 }
f090c9d4 4139 av = float64_val(a);
a1b91bb4 4140 bv = float64_val(b);
bb98fe42 4141 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4142
4143}
4144
4145/*----------------------------------------------------------------------------
4146| Returns 1 if the double-precision floating-point value `a' is less than or
4147| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4148| cause an exception. Otherwise, the comparison is performed according to the
4149| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4150*----------------------------------------------------------------------------*/
4151
750afe93 4152int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4153{
4154 flag aSign, bSign;
bb98fe42 4155 uint64_t av, bv;
37d18660
PM
4156 a = float64_squash_input_denormal(a STATUS_VAR);
4157 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4158
4159 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4160 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4161 ) {
4162 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4163 float_raise( float_flag_invalid STATUS_VAR);
4164 }
4165 return 0;
4166 }
4167 aSign = extractFloat64Sign( a );
4168 bSign = extractFloat64Sign( b );
f090c9d4 4169 av = float64_val(a);
a1b91bb4 4170 bv = float64_val(b);
bb98fe42 4171 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4172 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4173
4174}
4175
4176/*----------------------------------------------------------------------------
4177| Returns 1 if the double-precision floating-point value `a' is less than
4178| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4179| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4180| Standard for Binary Floating-Point Arithmetic.
4181*----------------------------------------------------------------------------*/
4182
750afe93 4183int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4184{
4185 flag aSign, bSign;
bb98fe42 4186 uint64_t av, bv;
37d18660
PM
4187 a = float64_squash_input_denormal(a STATUS_VAR);
4188 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4189
4190 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4191 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4192 ) {
4193 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4194 float_raise( float_flag_invalid STATUS_VAR);
4195 }
4196 return 0;
4197 }
4198 aSign = extractFloat64Sign( a );
4199 bSign = extractFloat64Sign( b );
f090c9d4 4200 av = float64_val(a);
a1b91bb4 4201 bv = float64_val(b);
bb98fe42 4202 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4203 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4204
4205}
4206
67b7861d
AJ
4207/*----------------------------------------------------------------------------
4208| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4209| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4210| comparison is performed according to the IEC/IEEE Standard for Binary
4211| Floating-Point Arithmetic.
4212*----------------------------------------------------------------------------*/
4213
4214int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4215{
4216 a = float64_squash_input_denormal(a STATUS_VAR);
4217 b = float64_squash_input_denormal(b STATUS_VAR);
4218
4219 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4220 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4221 ) {
4222 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4223 float_raise( float_flag_invalid STATUS_VAR);
4224 }
4225 return 1;
4226 }
4227 return 0;
4228}
4229
158142c2
FB
4230/*----------------------------------------------------------------------------
4231| Returns the result of converting the extended double-precision floating-
4232| point value `a' to the 32-bit two's complement integer format. The
4233| conversion is performed according to the IEC/IEEE Standard for Binary
4234| Floating-Point Arithmetic---which means in particular that the conversion
4235| is rounded according to the current rounding mode. If `a' is a NaN, the
4236| largest positive integer is returned. Otherwise, if the conversion
4237| overflows, the largest integer with the same sign as `a' is returned.
4238*----------------------------------------------------------------------------*/
4239
4240int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4241{
4242 flag aSign;
4243 int32 aExp, shiftCount;
bb98fe42 4244 uint64_t aSig;
158142c2
FB
4245
4246 aSig = extractFloatx80Frac( a );
4247 aExp = extractFloatx80Exp( a );
4248 aSign = extractFloatx80Sign( a );
bb98fe42 4249 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4250 shiftCount = 0x4037 - aExp;
4251 if ( shiftCount <= 0 ) shiftCount = 1;
4252 shift64RightJamming( aSig, shiftCount, &aSig );
4253 return roundAndPackInt32( aSign, aSig STATUS_VAR );
4254
4255}
4256
4257/*----------------------------------------------------------------------------
4258| Returns the result of converting the extended double-precision floating-
4259| point value `a' to the 32-bit two's complement integer format. The
4260| conversion is performed according to the IEC/IEEE Standard for Binary
4261| Floating-Point Arithmetic, except that the conversion is always rounded
4262| toward zero. If `a' is a NaN, the largest positive integer is returned.
4263| Otherwise, if the conversion overflows, the largest integer with the same
4264| sign as `a' is returned.
4265*----------------------------------------------------------------------------*/
4266
4267int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4268{
4269 flag aSign;
4270 int32 aExp, shiftCount;
bb98fe42 4271 uint64_t aSig, savedASig;
b3a6a2e0 4272 int32_t z;
158142c2
FB
4273
4274 aSig = extractFloatx80Frac( a );
4275 aExp = extractFloatx80Exp( a );
4276 aSign = extractFloatx80Sign( a );
4277 if ( 0x401E < aExp ) {
bb98fe42 4278 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4279 goto invalid;
4280 }
4281 else if ( aExp < 0x3FFF ) {
4282 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4283 return 0;
4284 }
4285 shiftCount = 0x403E - aExp;
4286 savedASig = aSig;
4287 aSig >>= shiftCount;
4288 z = aSig;
4289 if ( aSign ) z = - z;
4290 if ( ( z < 0 ) ^ aSign ) {
4291 invalid:
4292 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 4293 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4294 }
4295 if ( ( aSig<<shiftCount ) != savedASig ) {
4296 STATUS(float_exception_flags) |= float_flag_inexact;
4297 }
4298 return z;
4299
4300}
4301
4302/*----------------------------------------------------------------------------
4303| Returns the result of converting the extended double-precision floating-
4304| point value `a' to the 64-bit two's complement integer format. The
4305| conversion is performed according to the IEC/IEEE Standard for Binary
4306| Floating-Point Arithmetic---which means in particular that the conversion
4307| is rounded according to the current rounding mode. If `a' is a NaN,
4308| the largest positive integer is returned. Otherwise, if the conversion
4309| overflows, the largest integer with the same sign as `a' is returned.
4310*----------------------------------------------------------------------------*/
4311
4312int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4313{
4314 flag aSign;
4315 int32 aExp, shiftCount;
bb98fe42 4316 uint64_t aSig, aSigExtra;
158142c2
FB
4317
4318 aSig = extractFloatx80Frac( a );
4319 aExp = extractFloatx80Exp( a );
4320 aSign = extractFloatx80Sign( a );
4321 shiftCount = 0x403E - aExp;
4322 if ( shiftCount <= 0 ) {
4323 if ( shiftCount ) {
4324 float_raise( float_flag_invalid STATUS_VAR);
4325 if ( ! aSign
4326 || ( ( aExp == 0x7FFF )
4327 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4328 ) {
4329 return LIT64( 0x7FFFFFFFFFFFFFFF );
4330 }
bb98fe42 4331 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4332 }
4333 aSigExtra = 0;
4334 }
4335 else {
4336 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4337 }
4338 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4339
4340}
4341
4342/*----------------------------------------------------------------------------
4343| Returns the result of converting the extended double-precision floating-
4344| point value `a' to the 64-bit two's complement integer format. The
4345| conversion is performed according to the IEC/IEEE Standard for Binary
4346| Floating-Point Arithmetic, except that the conversion is always rounded
4347| toward zero. If `a' is a NaN, the largest positive integer is returned.
4348| Otherwise, if the conversion overflows, the largest integer with the same
4349| sign as `a' is returned.
4350*----------------------------------------------------------------------------*/
4351
4352int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4353{
4354 flag aSign;
4355 int32 aExp, shiftCount;
bb98fe42 4356 uint64_t aSig;
158142c2
FB
4357 int64 z;
4358
4359 aSig = extractFloatx80Frac( a );
4360 aExp = extractFloatx80Exp( a );
4361 aSign = extractFloatx80Sign( a );
4362 shiftCount = aExp - 0x403E;
4363 if ( 0 <= shiftCount ) {
4364 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4365 if ( ( a.high != 0xC03E ) || aSig ) {
4366 float_raise( float_flag_invalid STATUS_VAR);
4367 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4368 return LIT64( 0x7FFFFFFFFFFFFFFF );
4369 }
4370 }
bb98fe42 4371 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4372 }
4373 else if ( aExp < 0x3FFF ) {
4374 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4375 return 0;
4376 }
4377 z = aSig>>( - shiftCount );
bb98fe42 4378 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
4379 STATUS(float_exception_flags) |= float_flag_inexact;
4380 }
4381 if ( aSign ) z = - z;
4382 return z;
4383
4384}
4385
4386/*----------------------------------------------------------------------------
4387| Returns the result of converting the extended double-precision floating-
4388| point value `a' to the single-precision floating-point format. The
4389| conversion is performed according to the IEC/IEEE Standard for Binary
4390| Floating-Point Arithmetic.
4391*----------------------------------------------------------------------------*/
4392
4393float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4394{
4395 flag aSign;
4396 int32 aExp;
bb98fe42 4397 uint64_t aSig;
158142c2
FB
4398
4399 aSig = extractFloatx80Frac( a );
4400 aExp = extractFloatx80Exp( a );
4401 aSign = extractFloatx80Sign( a );
4402 if ( aExp == 0x7FFF ) {
bb98fe42 4403 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4404 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4405 }
4406 return packFloat32( aSign, 0xFF, 0 );
4407 }
4408 shift64RightJamming( aSig, 33, &aSig );
4409 if ( aExp || aSig ) aExp -= 0x3F81;
4410 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4411
4412}
4413
4414/*----------------------------------------------------------------------------
4415| Returns the result of converting the extended double-precision floating-
4416| point value `a' to the double-precision floating-point format. The
4417| conversion is performed according to the IEC/IEEE Standard for Binary
4418| Floating-Point Arithmetic.
4419*----------------------------------------------------------------------------*/
4420
4421float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4422{
4423 flag aSign;
4424 int32 aExp;
bb98fe42 4425 uint64_t aSig, zSig;
158142c2
FB
4426
4427 aSig = extractFloatx80Frac( a );
4428 aExp = extractFloatx80Exp( a );
4429 aSign = extractFloatx80Sign( a );
4430 if ( aExp == 0x7FFF ) {
bb98fe42 4431 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4432 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4433 }
4434 return packFloat64( aSign, 0x7FF, 0 );
4435 }
4436 shift64RightJamming( aSig, 1, &zSig );
4437 if ( aExp || aSig ) aExp -= 0x3C01;
4438 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4439
4440}
4441
158142c2
FB
4442/*----------------------------------------------------------------------------
4443| Returns the result of converting the extended double-precision floating-
4444| point value `a' to the quadruple-precision floating-point format. The
4445| conversion is performed according to the IEC/IEEE Standard for Binary
4446| Floating-Point Arithmetic.
4447*----------------------------------------------------------------------------*/
4448
4449float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4450{
4451 flag aSign;
94a49d86 4452 int_fast16_t aExp;
bb98fe42 4453 uint64_t aSig, zSig0, zSig1;
158142c2
FB
4454
4455 aSig = extractFloatx80Frac( a );
4456 aExp = extractFloatx80Exp( a );
4457 aSign = extractFloatx80Sign( a );
bb98fe42 4458 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4459 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4460 }
4461 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4462 return packFloat128( aSign, aExp, zSig0, zSig1 );
4463
4464}
4465
158142c2
FB
4466/*----------------------------------------------------------------------------
4467| Rounds the extended double-precision floating-point value `a' to an integer,
4468| and returns the result as an extended quadruple-precision floating-point
4469| value. The operation is performed according to the IEC/IEEE Standard for
4470| Binary Floating-Point Arithmetic.
4471*----------------------------------------------------------------------------*/
4472
4473floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4474{
4475 flag aSign;
4476 int32 aExp;
bb98fe42 4477 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4478 int8 roundingMode;
4479 floatx80 z;
4480
4481 aExp = extractFloatx80Exp( a );
4482 if ( 0x403E <= aExp ) {
bb98fe42 4483 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
158142c2
FB
4484 return propagateFloatx80NaN( a, a STATUS_VAR );
4485 }
4486 return a;
4487 }
4488 if ( aExp < 0x3FFF ) {
4489 if ( ( aExp == 0 )
bb98fe42 4490 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4491 return a;
4492 }
4493 STATUS(float_exception_flags) |= float_flag_inexact;
4494 aSign = extractFloatx80Sign( a );
4495 switch ( STATUS(float_rounding_mode) ) {
4496 case float_round_nearest_even:
bb98fe42 4497 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4498 ) {
4499 return
4500 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4501 }
4502 break;
4503 case float_round_down:
4504 return
4505 aSign ?
4506 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4507 : packFloatx80( 0, 0, 0 );
4508 case float_round_up:
4509 return
4510 aSign ? packFloatx80( 1, 0, 0 )
4511 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4512 }
4513 return packFloatx80( aSign, 0, 0 );
4514 }
4515 lastBitMask = 1;
4516 lastBitMask <<= 0x403E - aExp;
4517 roundBitsMask = lastBitMask - 1;
4518 z = a;
4519 roundingMode = STATUS(float_rounding_mode);
4520 if ( roundingMode == float_round_nearest_even ) {
4521 z.low += lastBitMask>>1;
4522 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4523 }
4524 else if ( roundingMode != float_round_to_zero ) {
4525 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4526 z.low += roundBitsMask;
4527 }
4528 }
4529 z.low &= ~ roundBitsMask;
4530 if ( z.low == 0 ) {
4531 ++z.high;
4532 z.low = LIT64( 0x8000000000000000 );
4533 }
4534 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4535 return z;
4536
4537}
4538
4539/*----------------------------------------------------------------------------
4540| Returns the result of adding the absolute values of the extended double-
4541| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4542| negated before being returned. `zSign' is ignored if the result is a NaN.
4543| The addition is performed according to the IEC/IEEE Standard for Binary
4544| Floating-Point Arithmetic.
4545*----------------------------------------------------------------------------*/
4546
4547static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4548{
4549 int32 aExp, bExp, zExp;
bb98fe42 4550 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4551 int32 expDiff;
4552
4553 aSig = extractFloatx80Frac( a );
4554 aExp = extractFloatx80Exp( a );
4555 bSig = extractFloatx80Frac( b );
4556 bExp = extractFloatx80Exp( b );
4557 expDiff = aExp - bExp;
4558 if ( 0 < expDiff ) {
4559 if ( aExp == 0x7FFF ) {
bb98fe42 4560 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4561 return a;
4562 }
4563 if ( bExp == 0 ) --expDiff;
4564 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4565 zExp = aExp;
4566 }
4567 else if ( expDiff < 0 ) {
4568 if ( bExp == 0x7FFF ) {
bb98fe42 4569 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4570 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4571 }
4572 if ( aExp == 0 ) ++expDiff;
4573 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4574 zExp = bExp;
4575 }
4576 else {
4577 if ( aExp == 0x7FFF ) {
bb98fe42 4578 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4579 return propagateFloatx80NaN( a, b STATUS_VAR );
4580 }
4581 return a;
4582 }
4583 zSig1 = 0;
4584 zSig0 = aSig + bSig;
4585 if ( aExp == 0 ) {
4586 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4587 goto roundAndPack;
4588 }
4589 zExp = aExp;
4590 goto shiftRight1;
4591 }
4592 zSig0 = aSig + bSig;
bb98fe42 4593 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
4594 shiftRight1:
4595 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4596 zSig0 |= LIT64( 0x8000000000000000 );
4597 ++zExp;
4598 roundAndPack:
4599 return
4600 roundAndPackFloatx80(
4601 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4602
4603}
4604
4605/*----------------------------------------------------------------------------
4606| Returns the result of subtracting the absolute values of the extended
4607| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4608| difference is negated before being returned. `zSign' is ignored if the
4609| result is a NaN. The subtraction is performed according to the IEC/IEEE
4610| Standard for Binary Floating-Point Arithmetic.
4611*----------------------------------------------------------------------------*/
4612
4613static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4614{
4615 int32 aExp, bExp, zExp;
bb98fe42 4616 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4617 int32 expDiff;
4618 floatx80 z;
4619
4620 aSig = extractFloatx80Frac( a );
4621 aExp = extractFloatx80Exp( a );
4622 bSig = extractFloatx80Frac( b );
4623 bExp = extractFloatx80Exp( b );
4624 expDiff = aExp - bExp;
4625 if ( 0 < expDiff ) goto aExpBigger;
4626 if ( expDiff < 0 ) goto bExpBigger;
4627 if ( aExp == 0x7FFF ) {
bb98fe42 4628 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4629 return propagateFloatx80NaN( a, b STATUS_VAR );
4630 }
4631 float_raise( float_flag_invalid STATUS_VAR);
4632 z.low = floatx80_default_nan_low;
4633 z.high = floatx80_default_nan_high;
4634 return z;
4635 }
4636 if ( aExp == 0 ) {
4637 aExp = 1;
4638 bExp = 1;
4639 }
4640 zSig1 = 0;
4641 if ( bSig < aSig ) goto aBigger;
4642 if ( aSig < bSig ) goto bBigger;
4643 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4644 bExpBigger:
4645 if ( bExp == 0x7FFF ) {
bb98fe42 4646 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4647 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4648 }
4649 if ( aExp == 0 ) ++expDiff;
4650 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4651 bBigger:
4652 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4653 zExp = bExp;
4654 zSign ^= 1;
4655 goto normalizeRoundAndPack;
4656 aExpBigger:
4657 if ( aExp == 0x7FFF ) {
bb98fe42 4658 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4659 return a;
4660 }
4661 if ( bExp == 0 ) --expDiff;
4662 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4663 aBigger:
4664 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4665 zExp = aExp;
4666 normalizeRoundAndPack:
4667 return
4668 normalizeRoundAndPackFloatx80(
4669 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4670
4671}
4672
4673/*----------------------------------------------------------------------------
4674| Returns the result of adding the extended double-precision floating-point
4675| values `a' and `b'. The operation is performed according to the IEC/IEEE
4676| Standard for Binary Floating-Point Arithmetic.
4677*----------------------------------------------------------------------------*/
4678
4679floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4680{
4681 flag aSign, bSign;
4682
4683 aSign = extractFloatx80Sign( a );
4684 bSign = extractFloatx80Sign( b );
4685 if ( aSign == bSign ) {
4686 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4687 }
4688 else {
4689 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4690 }
4691
4692}
4693
4694/*----------------------------------------------------------------------------
4695| Returns the result of subtracting the extended double-precision floating-
4696| point values `a' and `b'. The operation is performed according to the
4697| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4698*----------------------------------------------------------------------------*/
4699
4700floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4701{
4702 flag aSign, bSign;
4703
4704 aSign = extractFloatx80Sign( a );
4705 bSign = extractFloatx80Sign( b );
4706 if ( aSign == bSign ) {
4707 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4708 }
4709 else {
4710 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4711 }
4712
4713}
4714
4715/*----------------------------------------------------------------------------
4716| Returns the result of multiplying the extended double-precision floating-
4717| point values `a' and `b'. The operation is performed according to the
4718| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4719*----------------------------------------------------------------------------*/
4720
4721floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4722{
4723 flag aSign, bSign, zSign;
4724 int32 aExp, bExp, zExp;
bb98fe42 4725 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4726 floatx80 z;
4727
4728 aSig = extractFloatx80Frac( a );
4729 aExp = extractFloatx80Exp( a );
4730 aSign = extractFloatx80Sign( a );
4731 bSig = extractFloatx80Frac( b );
4732 bExp = extractFloatx80Exp( b );
4733 bSign = extractFloatx80Sign( b );
4734 zSign = aSign ^ bSign;
4735 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4736 if ( (uint64_t) ( aSig<<1 )
4737 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4738 return propagateFloatx80NaN( a, b STATUS_VAR );
4739 }
4740 if ( ( bExp | bSig ) == 0 ) goto invalid;
4741 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4742 }
4743 if ( bExp == 0x7FFF ) {
bb98fe42 4744 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4745 if ( ( aExp | aSig ) == 0 ) {
4746 invalid:
4747 float_raise( float_flag_invalid STATUS_VAR);
4748 z.low = floatx80_default_nan_low;
4749 z.high = floatx80_default_nan_high;
4750 return z;
4751 }
4752 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4753 }
4754 if ( aExp == 0 ) {
4755 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4756 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4757 }
4758 if ( bExp == 0 ) {
4759 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4760 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4761 }
4762 zExp = aExp + bExp - 0x3FFE;
4763 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 4764 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
4765 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4766 --zExp;
4767 }
4768 return
4769 roundAndPackFloatx80(
4770 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4771
4772}
4773
4774/*----------------------------------------------------------------------------
4775| Returns the result of dividing the extended double-precision floating-point
4776| value `a' by the corresponding value `b'. The operation is performed
4777| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4778*----------------------------------------------------------------------------*/
4779
4780floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4781{
4782 flag aSign, bSign, zSign;
4783 int32 aExp, bExp, zExp;
bb98fe42
AF
4784 uint64_t aSig, bSig, zSig0, zSig1;
4785 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
4786 floatx80 z;
4787
4788 aSig = extractFloatx80Frac( a );
4789 aExp = extractFloatx80Exp( a );
4790 aSign = extractFloatx80Sign( a );
4791 bSig = extractFloatx80Frac( b );
4792 bExp = extractFloatx80Exp( b );
4793 bSign = extractFloatx80Sign( b );
4794 zSign = aSign ^ bSign;
4795 if ( aExp == 0x7FFF ) {
bb98fe42 4796 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2 4797 if ( bExp == 0x7FFF ) {
bb98fe42 4798 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4799 goto invalid;
4800 }
4801 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4802 }
4803 if ( bExp == 0x7FFF ) {
bb98fe42 4804 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4805 return packFloatx80( zSign, 0, 0 );
4806 }
4807 if ( bExp == 0 ) {
4808 if ( bSig == 0 ) {
4809 if ( ( aExp | aSig ) == 0 ) {
4810 invalid:
4811 float_raise( float_flag_invalid STATUS_VAR);
4812 z.low = floatx80_default_nan_low;
4813 z.high = floatx80_default_nan_high;
4814 return z;
4815 }
4816 float_raise( float_flag_divbyzero STATUS_VAR);
4817 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4818 }
4819 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4820 }
4821 if ( aExp == 0 ) {
4822 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4823 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4824 }
4825 zExp = aExp - bExp + 0x3FFE;
4826 rem1 = 0;
4827 if ( bSig <= aSig ) {
4828 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4829 ++zExp;
4830 }
4831 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4832 mul64To128( bSig, zSig0, &term0, &term1 );
4833 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 4834 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4835 --zSig0;
4836 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4837 }
4838 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 4839 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
4840 mul64To128( bSig, zSig1, &term1, &term2 );
4841 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 4842 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
4843 --zSig1;
4844 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4845 }
4846 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4847 }
4848 return
4849 roundAndPackFloatx80(
4850 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4851
4852}
4853
4854/*----------------------------------------------------------------------------
4855| Returns the remainder of the extended double-precision floating-point value
4856| `a' with respect to the corresponding value `b'. The operation is performed
4857| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4858*----------------------------------------------------------------------------*/
4859
4860floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4861{
ed086f3d 4862 flag aSign, zSign;
158142c2 4863 int32 aExp, bExp, expDiff;
bb98fe42
AF
4864 uint64_t aSig0, aSig1, bSig;
4865 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
4866 floatx80 z;
4867
4868 aSig0 = extractFloatx80Frac( a );
4869 aExp = extractFloatx80Exp( a );
4870 aSign = extractFloatx80Sign( a );
4871 bSig = extractFloatx80Frac( b );
4872 bExp = extractFloatx80Exp( b );
158142c2 4873 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4874 if ( (uint64_t) ( aSig0<<1 )
4875 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4876 return propagateFloatx80NaN( a, b STATUS_VAR );
4877 }
4878 goto invalid;
4879 }
4880 if ( bExp == 0x7FFF ) {
bb98fe42 4881 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4882 return a;
4883 }
4884 if ( bExp == 0 ) {
4885 if ( bSig == 0 ) {
4886 invalid:
4887 float_raise( float_flag_invalid STATUS_VAR);
4888 z.low = floatx80_default_nan_low;
4889 z.high = floatx80_default_nan_high;
4890 return z;
4891 }
4892 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4893 }
4894 if ( aExp == 0 ) {
bb98fe42 4895 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
4896 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4897 }
4898 bSig |= LIT64( 0x8000000000000000 );
4899 zSign = aSign;
4900 expDiff = aExp - bExp;
4901 aSig1 = 0;
4902 if ( expDiff < 0 ) {
4903 if ( expDiff < -1 ) return a;
4904 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4905 expDiff = 0;
4906 }
4907 q = ( bSig <= aSig0 );
4908 if ( q ) aSig0 -= bSig;
4909 expDiff -= 64;
4910 while ( 0 < expDiff ) {
4911 q = estimateDiv128To64( aSig0, aSig1, bSig );
4912 q = ( 2 < q ) ? q - 2 : 0;
4913 mul64To128( bSig, q, &term0, &term1 );
4914 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4915 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4916 expDiff -= 62;
4917 }
4918 expDiff += 64;
4919 if ( 0 < expDiff ) {
4920 q = estimateDiv128To64( aSig0, aSig1, bSig );
4921 q = ( 2 < q ) ? q - 2 : 0;
4922 q >>= 64 - expDiff;
4923 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4924 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4925 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4926 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4927 ++q;
4928 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4929 }
4930 }
4931 else {
4932 term1 = 0;
4933 term0 = bSig;
4934 }
4935 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4936 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4937 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4938 && ( q & 1 ) )
4939 ) {
4940 aSig0 = alternateASig0;
4941 aSig1 = alternateASig1;
4942 zSign = ! zSign;
4943 }
4944 return
4945 normalizeRoundAndPackFloatx80(
4946 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4947
4948}
4949
4950/*----------------------------------------------------------------------------
4951| Returns the square root of the extended double-precision floating-point
4952| value `a'. The operation is performed according to the IEC/IEEE Standard
4953| for Binary Floating-Point Arithmetic.
4954*----------------------------------------------------------------------------*/
4955
4956floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4957{
4958 flag aSign;
4959 int32 aExp, zExp;
bb98fe42
AF
4960 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4961 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
4962 floatx80 z;
4963
4964 aSig0 = extractFloatx80Frac( a );
4965 aExp = extractFloatx80Exp( a );
4966 aSign = extractFloatx80Sign( a );
4967 if ( aExp == 0x7FFF ) {
bb98fe42 4968 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
158142c2
FB
4969 if ( ! aSign ) return a;
4970 goto invalid;
4971 }
4972 if ( aSign ) {
4973 if ( ( aExp | aSig0 ) == 0 ) return a;
4974 invalid:
4975 float_raise( float_flag_invalid STATUS_VAR);
4976 z.low = floatx80_default_nan_low;
4977 z.high = floatx80_default_nan_high;
4978 return z;
4979 }
4980 if ( aExp == 0 ) {
4981 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4982 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4983 }
4984 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4985 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4986 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4987 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4988 doubleZSig0 = zSig0<<1;
4989 mul64To128( zSig0, zSig0, &term0, &term1 );
4990 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 4991 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4992 --zSig0;
4993 doubleZSig0 -= 2;
4994 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4995 }
4996 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4997 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4998 if ( zSig1 == 0 ) zSig1 = 1;
4999 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5000 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5001 mul64To128( zSig1, zSig1, &term2, &term3 );
5002 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5003 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5004 --zSig1;
5005 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5006 term3 |= 1;
5007 term2 |= doubleZSig0;
5008 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5009 }
5010 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5011 }
5012 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5013 zSig0 |= doubleZSig0;
5014 return
5015 roundAndPackFloatx80(
5016 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5017
5018}
5019
5020/*----------------------------------------------------------------------------
b689362d
AJ
5021| Returns 1 if the extended double-precision floating-point value `a' is equal
5022| to the corresponding value `b', and 0 otherwise. The invalid exception is
5023| raised if either operand is a NaN. Otherwise, the comparison is performed
5024| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5025*----------------------------------------------------------------------------*/
5026
b689362d 5027int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5028{
5029
5030 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5031 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5032 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5033 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5034 ) {
b689362d 5035 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
5036 return 0;
5037 }
5038 return
5039 ( a.low == b.low )
5040 && ( ( a.high == b.high )
5041 || ( ( a.low == 0 )
bb98fe42 5042 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5043 );
5044
5045}
5046
5047/*----------------------------------------------------------------------------
5048| Returns 1 if the extended double-precision floating-point value `a' is
5049| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5050| invalid exception is raised if either operand is a NaN. The comparison is
5051| performed according to the IEC/IEEE Standard for Binary Floating-Point
5052| Arithmetic.
158142c2
FB
5053*----------------------------------------------------------------------------*/
5054
750afe93 5055int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5056{
5057 flag aSign, bSign;
5058
5059 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5060 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5061 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5062 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5063 ) {
5064 float_raise( float_flag_invalid STATUS_VAR);
5065 return 0;
5066 }
5067 aSign = extractFloatx80Sign( a );
5068 bSign = extractFloatx80Sign( b );
5069 if ( aSign != bSign ) {
5070 return
5071 aSign
bb98fe42 5072 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5073 == 0 );
5074 }
5075 return
5076 aSign ? le128( b.high, b.low, a.high, a.low )
5077 : le128( a.high, a.low, b.high, b.low );
5078
5079}
5080
5081/*----------------------------------------------------------------------------
5082| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5083| less than the corresponding value `b', and 0 otherwise. The invalid
5084| exception is raised if either operand is a NaN. The comparison is performed
5085| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5086*----------------------------------------------------------------------------*/
5087
750afe93 5088int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5089{
5090 flag aSign, bSign;
5091
5092 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5093 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5094 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5095 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5096 ) {
5097 float_raise( float_flag_invalid STATUS_VAR);
5098 return 0;
5099 }
5100 aSign = extractFloatx80Sign( a );
5101 bSign = extractFloatx80Sign( b );
5102 if ( aSign != bSign ) {
5103 return
5104 aSign
bb98fe42 5105 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5106 != 0 );
5107 }
5108 return
5109 aSign ? lt128( b.high, b.low, a.high, a.low )
5110 : lt128( a.high, a.low, b.high, b.low );
5111
5112}
5113
67b7861d
AJ
5114/*----------------------------------------------------------------------------
5115| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5116| cannot be compared, and 0 otherwise. The invalid exception is raised if
5117| either operand is a NaN. The comparison is performed according to the
5118| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
5119*----------------------------------------------------------------------------*/
5120int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5121{
5122 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5123 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5124 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5125 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5126 ) {
5127 float_raise( float_flag_invalid STATUS_VAR);
5128 return 1;
5129 }
5130 return 0;
5131}
5132
158142c2 5133/*----------------------------------------------------------------------------
b689362d 5134| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5135| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5136| cause an exception. The comparison is performed according to the IEC/IEEE
5137| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5138*----------------------------------------------------------------------------*/
5139
b689362d 5140int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5141{
5142
5143 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5144 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5145 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5146 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5147 ) {
b689362d
AJ
5148 if ( floatx80_is_signaling_nan( a )
5149 || floatx80_is_signaling_nan( b ) ) {
5150 float_raise( float_flag_invalid STATUS_VAR);
5151 }
158142c2
FB
5152 return 0;
5153 }
5154 return
5155 ( a.low == b.low )
5156 && ( ( a.high == b.high )
5157 || ( ( a.low == 0 )
bb98fe42 5158 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5159 );
5160
5161}
5162
5163/*----------------------------------------------------------------------------
5164| Returns 1 if the extended double-precision floating-point value `a' is less
5165| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5166| do not cause an exception. Otherwise, the comparison is performed according
5167| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5168*----------------------------------------------------------------------------*/
5169
750afe93 5170int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5171{
5172 flag aSign, bSign;
5173
5174 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5175 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5176 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5177 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5178 ) {
5179 if ( floatx80_is_signaling_nan( a )
5180 || floatx80_is_signaling_nan( b ) ) {
5181 float_raise( float_flag_invalid STATUS_VAR);
5182 }
5183 return 0;
5184 }
5185 aSign = extractFloatx80Sign( a );
5186 bSign = extractFloatx80Sign( b );
5187 if ( aSign != bSign ) {
5188 return
5189 aSign
bb98fe42 5190 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5191 == 0 );
5192 }
5193 return
5194 aSign ? le128( b.high, b.low, a.high, a.low )
5195 : le128( a.high, a.low, b.high, b.low );
5196
5197}
5198
5199/*----------------------------------------------------------------------------
5200| Returns 1 if the extended double-precision floating-point value `a' is less
5201| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5202| an exception. Otherwise, the comparison is performed according to the
5203| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5204*----------------------------------------------------------------------------*/
5205
750afe93 5206int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5207{
5208 flag aSign, bSign;
5209
5210 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5211 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5212 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5213 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5214 ) {
5215 if ( floatx80_is_signaling_nan( a )
5216 || floatx80_is_signaling_nan( b ) ) {
5217 float_raise( float_flag_invalid STATUS_VAR);
5218 }
5219 return 0;
5220 }
5221 aSign = extractFloatx80Sign( a );
5222 bSign = extractFloatx80Sign( b );
5223 if ( aSign != bSign ) {
5224 return
5225 aSign
bb98fe42 5226 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5227 != 0 );
5228 }
5229 return
5230 aSign ? lt128( b.high, b.low, a.high, a.low )
5231 : lt128( a.high, a.low, b.high, b.low );
5232
5233}
5234
67b7861d
AJ
5235/*----------------------------------------------------------------------------
5236| Returns 1 if the extended double-precision floating-point values `a' and `b'
5237| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5238| The comparison is performed according to the IEC/IEEE Standard for Binary
5239| Floating-Point Arithmetic.
5240*----------------------------------------------------------------------------*/
5241int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5242{
5243 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5244 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5245 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5246 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5247 ) {
5248 if ( floatx80_is_signaling_nan( a )
5249 || floatx80_is_signaling_nan( b ) ) {
5250 float_raise( float_flag_invalid STATUS_VAR);
5251 }
5252 return 1;
5253 }
5254 return 0;
5255}
5256
158142c2
FB
5257/*----------------------------------------------------------------------------
5258| Returns the result of converting the quadruple-precision floating-point
5259| value `a' to the 32-bit two's complement integer format. The conversion
5260| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5261| Arithmetic---which means in particular that the conversion is rounded
5262| according to the current rounding mode. If `a' is a NaN, the largest
5263| positive integer is returned. Otherwise, if the conversion overflows, the
5264| largest integer with the same sign as `a' is returned.
5265*----------------------------------------------------------------------------*/
5266
5267int32 float128_to_int32( float128 a STATUS_PARAM )
5268{
5269 flag aSign;
5270 int32 aExp, shiftCount;
bb98fe42 5271 uint64_t aSig0, aSig1;
158142c2
FB
5272
5273 aSig1 = extractFloat128Frac1( a );
5274 aSig0 = extractFloat128Frac0( a );
5275 aExp = extractFloat128Exp( a );
5276 aSign = extractFloat128Sign( a );
5277 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5278 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5279 aSig0 |= ( aSig1 != 0 );
5280 shiftCount = 0x4028 - aExp;
5281 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5282 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5283
5284}
5285
5286/*----------------------------------------------------------------------------
5287| Returns the result of converting the quadruple-precision floating-point
5288| value `a' to the 32-bit two's complement integer format. The conversion
5289| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5290| Arithmetic, except that the conversion is always rounded toward zero. If
5291| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5292| conversion overflows, the largest integer with the same sign as `a' is
5293| returned.
5294*----------------------------------------------------------------------------*/
5295
5296int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5297{
5298 flag aSign;
5299 int32 aExp, shiftCount;
bb98fe42 5300 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5301 int32_t z;
158142c2
FB
5302
5303 aSig1 = extractFloat128Frac1( a );
5304 aSig0 = extractFloat128Frac0( a );
5305 aExp = extractFloat128Exp( a );
5306 aSign = extractFloat128Sign( a );
5307 aSig0 |= ( aSig1 != 0 );
5308 if ( 0x401E < aExp ) {
5309 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5310 goto invalid;
5311 }
5312 else if ( aExp < 0x3FFF ) {
5313 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5314 return 0;
5315 }
5316 aSig0 |= LIT64( 0x0001000000000000 );
5317 shiftCount = 0x402F - aExp;
5318 savedASig = aSig0;
5319 aSig0 >>= shiftCount;
5320 z = aSig0;
5321 if ( aSign ) z = - z;
5322 if ( ( z < 0 ) ^ aSign ) {
5323 invalid:
5324 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 5325 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5326 }
5327 if ( ( aSig0<<shiftCount ) != savedASig ) {
5328 STATUS(float_exception_flags) |= float_flag_inexact;
5329 }
5330 return z;
5331
5332}
5333
5334/*----------------------------------------------------------------------------
5335| Returns the result of converting the quadruple-precision floating-point
5336| value `a' to the 64-bit two's complement integer format. The conversion
5337| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5338| Arithmetic---which means in particular that the conversion is rounded
5339| according to the current rounding mode. If `a' is a NaN, the largest
5340| positive integer is returned. Otherwise, if the conversion overflows, the
5341| largest integer with the same sign as `a' is returned.
5342*----------------------------------------------------------------------------*/
5343
5344int64 float128_to_int64( float128 a STATUS_PARAM )
5345{
5346 flag aSign;
5347 int32 aExp, shiftCount;
bb98fe42 5348 uint64_t aSig0, aSig1;
158142c2
FB
5349
5350 aSig1 = extractFloat128Frac1( a );
5351 aSig0 = extractFloat128Frac0( a );
5352 aExp = extractFloat128Exp( a );
5353 aSign = extractFloat128Sign( a );
5354 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5355 shiftCount = 0x402F - aExp;
5356 if ( shiftCount <= 0 ) {
5357 if ( 0x403E < aExp ) {
5358 float_raise( float_flag_invalid STATUS_VAR);
5359 if ( ! aSign
5360 || ( ( aExp == 0x7FFF )
5361 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5362 )
5363 ) {
5364 return LIT64( 0x7FFFFFFFFFFFFFFF );
5365 }
bb98fe42 5366 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5367 }
5368 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5369 }
5370 else {
5371 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5372 }
5373 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5374
5375}
5376
5377/*----------------------------------------------------------------------------
5378| Returns the result of converting the quadruple-precision floating-point
5379| value `a' to the 64-bit two's complement integer format. The conversion
5380| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5381| Arithmetic, except that the conversion is always rounded toward zero.
5382| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5383| the conversion overflows, the largest integer with the same sign as `a' is
5384| returned.
5385*----------------------------------------------------------------------------*/
5386
5387int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5388{
5389 flag aSign;
5390 int32 aExp, shiftCount;
bb98fe42 5391 uint64_t aSig0, aSig1;
158142c2
FB
5392 int64 z;
5393
5394 aSig1 = extractFloat128Frac1( a );
5395 aSig0 = extractFloat128Frac0( a );
5396 aExp = extractFloat128Exp( a );
5397 aSign = extractFloat128Sign( a );
5398 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5399 shiftCount = aExp - 0x402F;
5400 if ( 0 < shiftCount ) {
5401 if ( 0x403E <= aExp ) {
5402 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5403 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5404 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5405 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5406 }
5407 else {
5408 float_raise( float_flag_invalid STATUS_VAR);
5409 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5410 return LIT64( 0x7FFFFFFFFFFFFFFF );
5411 }
5412 }
bb98fe42 5413 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5414 }
5415 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 5416 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
158142c2
FB
5417 STATUS(float_exception_flags) |= float_flag_inexact;
5418 }
5419 }
5420 else {
5421 if ( aExp < 0x3FFF ) {
5422 if ( aExp | aSig0 | aSig1 ) {
5423 STATUS(float_exception_flags) |= float_flag_inexact;
5424 }
5425 return 0;
5426 }
5427 z = aSig0>>( - shiftCount );
5428 if ( aSig1
bb98fe42 5429 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
158142c2
FB
5430 STATUS(float_exception_flags) |= float_flag_inexact;
5431 }
5432 }
5433 if ( aSign ) z = - z;
5434 return z;
5435
5436}
5437
5438/*----------------------------------------------------------------------------
5439| Returns the result of converting the quadruple-precision floating-point
5440| value `a' to the single-precision floating-point format. The conversion
5441| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5442| Arithmetic.
5443*----------------------------------------------------------------------------*/
5444
5445float32 float128_to_float32( float128 a STATUS_PARAM )
5446{
5447 flag aSign;
5448 int32 aExp;
bb98fe42
AF
5449 uint64_t aSig0, aSig1;
5450 uint32_t zSig;
158142c2
FB
5451
5452 aSig1 = extractFloat128Frac1( a );
5453 aSig0 = extractFloat128Frac0( a );
5454 aExp = extractFloat128Exp( a );
5455 aSign = extractFloat128Sign( a );
5456 if ( aExp == 0x7FFF ) {
5457 if ( aSig0 | aSig1 ) {
bcd4d9af 5458 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5459 }
5460 return packFloat32( aSign, 0xFF, 0 );
5461 }
5462 aSig0 |= ( aSig1 != 0 );
5463 shift64RightJamming( aSig0, 18, &aSig0 );
5464 zSig = aSig0;
5465 if ( aExp || zSig ) {
5466 zSig |= 0x40000000;
5467 aExp -= 0x3F81;
5468 }
5469 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5470
5471}
5472
5473/*----------------------------------------------------------------------------
5474| Returns the result of converting the quadruple-precision floating-point
5475| value `a' to the double-precision floating-point format. The conversion
5476| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5477| Arithmetic.
5478*----------------------------------------------------------------------------*/
5479
5480float64 float128_to_float64( float128 a STATUS_PARAM )
5481{
5482 flag aSign;
5483 int32 aExp;
bb98fe42 5484 uint64_t aSig0, aSig1;
158142c2
FB
5485
5486 aSig1 = extractFloat128Frac1( a );
5487 aSig0 = extractFloat128Frac0( a );
5488 aExp = extractFloat128Exp( a );
5489 aSign = extractFloat128Sign( a );
5490 if ( aExp == 0x7FFF ) {
5491 if ( aSig0 | aSig1 ) {
bcd4d9af 5492 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5493 }
5494 return packFloat64( aSign, 0x7FF, 0 );
5495 }
5496 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5497 aSig0 |= ( aSig1 != 0 );
5498 if ( aExp || aSig0 ) {
5499 aSig0 |= LIT64( 0x4000000000000000 );
5500 aExp -= 0x3C01;
5501 }
5502 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5503
5504}
5505
158142c2
FB
5506/*----------------------------------------------------------------------------
5507| Returns the result of converting the quadruple-precision floating-point
5508| value `a' to the extended double-precision floating-point format. The
5509| conversion is performed according to the IEC/IEEE Standard for Binary
5510| Floating-Point Arithmetic.
5511*----------------------------------------------------------------------------*/
5512
5513floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5514{
5515 flag aSign;
5516 int32 aExp;
bb98fe42 5517 uint64_t aSig0, aSig1;
158142c2
FB
5518
5519 aSig1 = extractFloat128Frac1( a );
5520 aSig0 = extractFloat128Frac0( a );
5521 aExp = extractFloat128Exp( a );
5522 aSign = extractFloat128Sign( a );
5523 if ( aExp == 0x7FFF ) {
5524 if ( aSig0 | aSig1 ) {
bcd4d9af 5525 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5526 }
5527 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5528 }
5529 if ( aExp == 0 ) {
5530 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5531 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5532 }
5533 else {
5534 aSig0 |= LIT64( 0x0001000000000000 );
5535 }
5536 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5537 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5538
5539}
5540
158142c2
FB
5541/*----------------------------------------------------------------------------
5542| Rounds the quadruple-precision floating-point value `a' to an integer, and
5543| returns the result as a quadruple-precision floating-point value. The
5544| operation is performed according to the IEC/IEEE Standard for Binary
5545| Floating-Point Arithmetic.
5546*----------------------------------------------------------------------------*/
5547
5548float128 float128_round_to_int( float128 a STATUS_PARAM )
5549{
5550 flag aSign;
5551 int32 aExp;
bb98fe42 5552 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5553 int8 roundingMode;
5554 float128 z;
5555
5556 aExp = extractFloat128Exp( a );
5557 if ( 0x402F <= aExp ) {
5558 if ( 0x406F <= aExp ) {
5559 if ( ( aExp == 0x7FFF )
5560 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5561 ) {
5562 return propagateFloat128NaN( a, a STATUS_VAR );
5563 }
5564 return a;
5565 }
5566 lastBitMask = 1;
5567 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5568 roundBitsMask = lastBitMask - 1;
5569 z = a;
5570 roundingMode = STATUS(float_rounding_mode);
5571 if ( roundingMode == float_round_nearest_even ) {
5572 if ( lastBitMask ) {
5573 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5574 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5575 }
5576 else {
bb98fe42 5577 if ( (int64_t) z.low < 0 ) {
158142c2 5578 ++z.high;
bb98fe42 5579 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
5580 }
5581 }
5582 }
5583 else if ( roundingMode != float_round_to_zero ) {
5584 if ( extractFloat128Sign( z )
5585 ^ ( roundingMode == float_round_up ) ) {
5586 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5587 }
5588 }
5589 z.low &= ~ roundBitsMask;
5590 }
5591 else {
5592 if ( aExp < 0x3FFF ) {
bb98fe42 5593 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
158142c2
FB
5594 STATUS(float_exception_flags) |= float_flag_inexact;
5595 aSign = extractFloat128Sign( a );
5596 switch ( STATUS(float_rounding_mode) ) {
5597 case float_round_nearest_even:
5598 if ( ( aExp == 0x3FFE )
5599 && ( extractFloat128Frac0( a )
5600 | extractFloat128Frac1( a ) )
5601 ) {
5602 return packFloat128( aSign, 0x3FFF, 0, 0 );
5603 }
5604 break;
5605 case float_round_down:
5606 return
5607 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5608 : packFloat128( 0, 0, 0, 0 );
5609 case float_round_up:
5610 return
5611 aSign ? packFloat128( 1, 0, 0, 0 )
5612 : packFloat128( 0, 0x3FFF, 0, 0 );
5613 }
5614 return packFloat128( aSign, 0, 0, 0 );
5615 }
5616 lastBitMask = 1;
5617 lastBitMask <<= 0x402F - aExp;
5618 roundBitsMask = lastBitMask - 1;
5619 z.low = 0;
5620 z.high = a.high;
5621 roundingMode = STATUS(float_rounding_mode);
5622 if ( roundingMode == float_round_nearest_even ) {
5623 z.high += lastBitMask>>1;
5624 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5625 z.high &= ~ lastBitMask;
5626 }
5627 }
5628 else if ( roundingMode != float_round_to_zero ) {
5629 if ( extractFloat128Sign( z )
5630 ^ ( roundingMode == float_round_up ) ) {
5631 z.high |= ( a.low != 0 );
5632 z.high += roundBitsMask;
5633 }
5634 }
5635 z.high &= ~ roundBitsMask;
5636 }
5637 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5638 STATUS(float_exception_flags) |= float_flag_inexact;
5639 }
5640 return z;
5641
5642}
5643
5644/*----------------------------------------------------------------------------
5645| Returns the result of adding the absolute values of the quadruple-precision
5646| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5647| before being returned. `zSign' is ignored if the result is a NaN.
5648| The addition is performed according to the IEC/IEEE Standard for Binary
5649| Floating-Point Arithmetic.
5650*----------------------------------------------------------------------------*/
5651
5652static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5653{
5654 int32 aExp, bExp, zExp;
bb98fe42 5655 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
158142c2
FB
5656 int32 expDiff;
5657
5658 aSig1 = extractFloat128Frac1( a );
5659 aSig0 = extractFloat128Frac0( a );
5660 aExp = extractFloat128Exp( a );
5661 bSig1 = extractFloat128Frac1( b );
5662 bSig0 = extractFloat128Frac0( b );
5663 bExp = extractFloat128Exp( b );
5664 expDiff = aExp - bExp;
5665 if ( 0 < expDiff ) {
5666 if ( aExp == 0x7FFF ) {
5667 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5668 return a;
5669 }
5670 if ( bExp == 0 ) {
5671 --expDiff;
5672 }
5673 else {
5674 bSig0 |= LIT64( 0x0001000000000000 );
5675 }
5676 shift128ExtraRightJamming(
5677 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5678 zExp = aExp;
5679 }
5680 else if ( expDiff < 0 ) {
5681 if ( bExp == 0x7FFF ) {
5682 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5683 return packFloat128( zSign, 0x7FFF, 0, 0 );
5684 }
5685 if ( aExp == 0 ) {
5686 ++expDiff;
5687 }
5688 else {
5689 aSig0 |= LIT64( 0x0001000000000000 );
5690 }
5691 shift128ExtraRightJamming(
5692 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5693 zExp = bExp;
5694 }
5695 else {
5696 if ( aExp == 0x7FFF ) {
5697 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5698 return propagateFloat128NaN( a, b STATUS_VAR );
5699 }
5700 return a;
5701 }
5702 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 5703 if ( aExp == 0 ) {
e6afc87f
PM
5704 if (STATUS(flush_to_zero)) {
5705 if (zSig0 | zSig1) {
5706 float_raise(float_flag_output_denormal STATUS_VAR);
5707 }
5708 return packFloat128(zSign, 0, 0, 0);
5709 }
fe76d976
PB
5710 return packFloat128( zSign, 0, zSig0, zSig1 );
5711 }
158142c2
FB
5712 zSig2 = 0;
5713 zSig0 |= LIT64( 0x0002000000000000 );
5714 zExp = aExp;
5715 goto shiftRight1;
5716 }
5717 aSig0 |= LIT64( 0x0001000000000000 );
5718 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5719 --zExp;
5720 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5721 ++zExp;
5722 shiftRight1:
5723 shift128ExtraRightJamming(
5724 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5725 roundAndPack:
5726 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5727
5728}
5729
5730/*----------------------------------------------------------------------------
5731| Returns the result of subtracting the absolute values of the quadruple-
5732| precision floating-point values `a' and `b'. If `zSign' is 1, the
5733| difference is negated before being returned. `zSign' is ignored if the
5734| result is a NaN. The subtraction is performed according to the IEC/IEEE
5735| Standard for Binary Floating-Point Arithmetic.
5736*----------------------------------------------------------------------------*/
5737
5738static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5739{
5740 int32 aExp, bExp, zExp;
bb98fe42 5741 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
158142c2
FB
5742 int32 expDiff;
5743 float128 z;
5744
5745 aSig1 = extractFloat128Frac1( a );
5746 aSig0 = extractFloat128Frac0( a );
5747 aExp = extractFloat128Exp( a );
5748 bSig1 = extractFloat128Frac1( b );
5749 bSig0 = extractFloat128Frac0( b );
5750 bExp = extractFloat128Exp( b );
5751 expDiff = aExp - bExp;
5752 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5753 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5754 if ( 0 < expDiff ) goto aExpBigger;
5755 if ( expDiff < 0 ) goto bExpBigger;
5756 if ( aExp == 0x7FFF ) {
5757 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5758 return propagateFloat128NaN( a, b STATUS_VAR );
5759 }
5760 float_raise( float_flag_invalid STATUS_VAR);
5761 z.low = float128_default_nan_low;
5762 z.high = float128_default_nan_high;
5763 return z;
5764 }
5765 if ( aExp == 0 ) {
5766 aExp = 1;
5767 bExp = 1;
5768 }
5769 if ( bSig0 < aSig0 ) goto aBigger;
5770 if ( aSig0 < bSig0 ) goto bBigger;
5771 if ( bSig1 < aSig1 ) goto aBigger;
5772 if ( aSig1 < bSig1 ) goto bBigger;
5773 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5774 bExpBigger:
5775 if ( bExp == 0x7FFF ) {
5776 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5777 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5778 }
5779 if ( aExp == 0 ) {
5780 ++expDiff;
5781 }
5782 else {
5783 aSig0 |= LIT64( 0x4000000000000000 );
5784 }
5785 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5786 bSig0 |= LIT64( 0x4000000000000000 );
5787 bBigger:
5788 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5789 zExp = bExp;
5790 zSign ^= 1;
5791 goto normalizeRoundAndPack;
5792 aExpBigger:
5793 if ( aExp == 0x7FFF ) {
5794 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5795 return a;
5796 }
5797 if ( bExp == 0 ) {
5798 --expDiff;
5799 }
5800 else {
5801 bSig0 |= LIT64( 0x4000000000000000 );
5802 }
5803 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5804 aSig0 |= LIT64( 0x4000000000000000 );
5805 aBigger:
5806 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5807 zExp = aExp;
5808 normalizeRoundAndPack:
5809 --zExp;
5810 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5811
5812}
5813
5814/*----------------------------------------------------------------------------
5815| Returns the result of adding the quadruple-precision floating-point values
5816| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5817| for Binary Floating-Point Arithmetic.
5818*----------------------------------------------------------------------------*/
5819
5820float128 float128_add( float128 a, float128 b STATUS_PARAM )
5821{
5822 flag aSign, bSign;
5823
5824 aSign = extractFloat128Sign( a );
5825 bSign = extractFloat128Sign( b );
5826 if ( aSign == bSign ) {
5827 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5828 }
5829 else {
5830 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5831 }
5832
5833}
5834
5835/*----------------------------------------------------------------------------
5836| Returns the result of subtracting the quadruple-precision floating-point
5837| values `a' and `b'. The operation is performed according to the IEC/IEEE
5838| Standard for Binary Floating-Point Arithmetic.
5839*----------------------------------------------------------------------------*/
5840
5841float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5842{
5843 flag aSign, bSign;
5844
5845 aSign = extractFloat128Sign( a );
5846 bSign = extractFloat128Sign( b );
5847 if ( aSign == bSign ) {
5848 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5849 }
5850 else {
5851 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5852 }
5853
5854}
5855
5856/*----------------------------------------------------------------------------
5857| Returns the result of multiplying the quadruple-precision floating-point
5858| values `a' and `b'. The operation is performed according to the IEC/IEEE
5859| Standard for Binary Floating-Point Arithmetic.
5860*----------------------------------------------------------------------------*/
5861
5862float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5863{
5864 flag aSign, bSign, zSign;
5865 int32 aExp, bExp, zExp;
bb98fe42 5866 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
5867 float128 z;
5868
5869 aSig1 = extractFloat128Frac1( a );
5870 aSig0 = extractFloat128Frac0( a );
5871 aExp = extractFloat128Exp( a );
5872 aSign = extractFloat128Sign( a );
5873 bSig1 = extractFloat128Frac1( b );
5874 bSig0 = extractFloat128Frac0( b );
5875 bExp = extractFloat128Exp( b );
5876 bSign = extractFloat128Sign( b );
5877 zSign = aSign ^ bSign;
5878 if ( aExp == 0x7FFF ) {
5879 if ( ( aSig0 | aSig1 )
5880 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5881 return propagateFloat128NaN( a, b STATUS_VAR );
5882 }
5883 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5884 return packFloat128( zSign, 0x7FFF, 0, 0 );
5885 }
5886 if ( bExp == 0x7FFF ) {
5887 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5888 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5889 invalid:
5890 float_raise( float_flag_invalid STATUS_VAR);
5891 z.low = float128_default_nan_low;
5892 z.high = float128_default_nan_high;
5893 return z;
5894 }
5895 return packFloat128( zSign, 0x7FFF, 0, 0 );
5896 }
5897 if ( aExp == 0 ) {
5898 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5899 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5900 }
5901 if ( bExp == 0 ) {
5902 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5903 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5904 }
5905 zExp = aExp + bExp - 0x4000;
5906 aSig0 |= LIT64( 0x0001000000000000 );
5907 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5908 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5909 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5910 zSig2 |= ( zSig3 != 0 );
5911 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5912 shift128ExtraRightJamming(
5913 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5914 ++zExp;
5915 }
5916 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5917
5918}
5919
5920/*----------------------------------------------------------------------------
5921| Returns the result of dividing the quadruple-precision floating-point value
5922| `a' by the corresponding value `b'. The operation is performed according to
5923| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5924*----------------------------------------------------------------------------*/
5925
5926float128 float128_div( float128 a, float128 b STATUS_PARAM )
5927{
5928 flag aSign, bSign, zSign;
5929 int32 aExp, bExp, zExp;
bb98fe42
AF
5930 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5931 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5932 float128 z;
5933
5934 aSig1 = extractFloat128Frac1( a );
5935 aSig0 = extractFloat128Frac0( a );
5936 aExp = extractFloat128Exp( a );
5937 aSign = extractFloat128Sign( a );
5938 bSig1 = extractFloat128Frac1( b );
5939 bSig0 = extractFloat128Frac0( b );
5940 bExp = extractFloat128Exp( b );
5941 bSign = extractFloat128Sign( b );
5942 zSign = aSign ^ bSign;
5943 if ( aExp == 0x7FFF ) {
5944 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5945 if ( bExp == 0x7FFF ) {
5946 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5947 goto invalid;
5948 }
5949 return packFloat128( zSign, 0x7FFF, 0, 0 );
5950 }
5951 if ( bExp == 0x7FFF ) {
5952 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5953 return packFloat128( zSign, 0, 0, 0 );
5954 }
5955 if ( bExp == 0 ) {
5956 if ( ( bSig0 | bSig1 ) == 0 ) {
5957 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5958 invalid:
5959 float_raise( float_flag_invalid STATUS_VAR);
5960 z.low = float128_default_nan_low;
5961 z.high = float128_default_nan_high;
5962 return z;
5963 }
5964 float_raise( float_flag_divbyzero STATUS_VAR);
5965 return packFloat128( zSign, 0x7FFF, 0, 0 );
5966 }
5967 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5968 }
5969 if ( aExp == 0 ) {
5970 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5971 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5972 }
5973 zExp = aExp - bExp + 0x3FFD;
5974 shortShift128Left(
5975 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5976 shortShift128Left(
5977 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5978 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5979 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5980 ++zExp;
5981 }
5982 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5983 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5984 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 5985 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5986 --zSig0;
5987 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5988 }
5989 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5990 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5991 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5992 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5993 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5994 --zSig1;
5995 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5996 }
5997 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5998 }
5999 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6000 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6001
6002}
6003
6004/*----------------------------------------------------------------------------
6005| Returns the remainder of the quadruple-precision floating-point value `a'
6006| with respect to the corresponding value `b'. The operation is performed
6007| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6008*----------------------------------------------------------------------------*/
6009
6010float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6011{
ed086f3d 6012 flag aSign, zSign;
158142c2 6013 int32 aExp, bExp, expDiff;
bb98fe42
AF
6014 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6015 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6016 int64_t sigMean0;
158142c2
FB
6017 float128 z;
6018
6019 aSig1 = extractFloat128Frac1( a );
6020 aSig0 = extractFloat128Frac0( a );
6021 aExp = extractFloat128Exp( a );
6022 aSign = extractFloat128Sign( a );
6023 bSig1 = extractFloat128Frac1( b );
6024 bSig0 = extractFloat128Frac0( b );
6025 bExp = extractFloat128Exp( b );
158142c2
FB
6026 if ( aExp == 0x7FFF ) {
6027 if ( ( aSig0 | aSig1 )
6028 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6029 return propagateFloat128NaN( a, b STATUS_VAR );
6030 }
6031 goto invalid;
6032 }
6033 if ( bExp == 0x7FFF ) {
6034 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6035 return a;
6036 }
6037 if ( bExp == 0 ) {
6038 if ( ( bSig0 | bSig1 ) == 0 ) {
6039 invalid:
6040 float_raise( float_flag_invalid STATUS_VAR);
6041 z.low = float128_default_nan_low;
6042 z.high = float128_default_nan_high;
6043 return z;
6044 }
6045 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6046 }
6047 if ( aExp == 0 ) {
6048 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6049 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6050 }
6051 expDiff = aExp - bExp;
6052 if ( expDiff < -1 ) return a;
6053 shortShift128Left(
6054 aSig0 | LIT64( 0x0001000000000000 ),
6055 aSig1,
6056 15 - ( expDiff < 0 ),
6057 &aSig0,
6058 &aSig1
6059 );
6060 shortShift128Left(
6061 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6062 q = le128( bSig0, bSig1, aSig0, aSig1 );
6063 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6064 expDiff -= 64;
6065 while ( 0 < expDiff ) {
6066 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6067 q = ( 4 < q ) ? q - 4 : 0;
6068 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6069 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6070 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6071 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6072 expDiff -= 61;
6073 }
6074 if ( -64 < expDiff ) {
6075 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6076 q = ( 4 < q ) ? q - 4 : 0;
6077 q >>= - expDiff;
6078 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6079 expDiff += 52;
6080 if ( expDiff < 0 ) {
6081 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6082 }
6083 else {
6084 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6085 }
6086 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6087 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6088 }
6089 else {
6090 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6091 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6092 }
6093 do {
6094 alternateASig0 = aSig0;
6095 alternateASig1 = aSig1;
6096 ++q;
6097 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6098 } while ( 0 <= (int64_t) aSig0 );
158142c2 6099 add128(
bb98fe42 6100 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6101 if ( ( sigMean0 < 0 )
6102 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6103 aSig0 = alternateASig0;
6104 aSig1 = alternateASig1;
6105 }
bb98fe42 6106 zSign = ( (int64_t) aSig0 < 0 );
158142c2
FB
6107 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6108 return
6109 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6110
6111}
6112
6113/*----------------------------------------------------------------------------
6114| Returns the square root of the quadruple-precision floating-point value `a'.
6115| The operation is performed according to the IEC/IEEE Standard for Binary
6116| Floating-Point Arithmetic.
6117*----------------------------------------------------------------------------*/
6118
6119float128 float128_sqrt( float128 a STATUS_PARAM )
6120{
6121 flag aSign;
6122 int32 aExp, zExp;
bb98fe42
AF
6123 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6124 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6125 float128 z;
6126
6127 aSig1 = extractFloat128Frac1( a );
6128 aSig0 = extractFloat128Frac0( a );
6129 aExp = extractFloat128Exp( a );
6130 aSign = extractFloat128Sign( a );
6131 if ( aExp == 0x7FFF ) {
6132 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6133 if ( ! aSign ) return a;
6134 goto invalid;
6135 }
6136 if ( aSign ) {
6137 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6138 invalid:
6139 float_raise( float_flag_invalid STATUS_VAR);
6140 z.low = float128_default_nan_low;
6141 z.high = float128_default_nan_high;
6142 return z;
6143 }
6144 if ( aExp == 0 ) {
6145 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6146 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6147 }
6148 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6149 aSig0 |= LIT64( 0x0001000000000000 );
6150 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6151 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6152 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6153 doubleZSig0 = zSig0<<1;
6154 mul64To128( zSig0, zSig0, &term0, &term1 );
6155 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6156 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6157 --zSig0;
6158 doubleZSig0 -= 2;
6159 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6160 }
6161 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6162 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6163 if ( zSig1 == 0 ) zSig1 = 1;
6164 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6165 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6166 mul64To128( zSig1, zSig1, &term2, &term3 );
6167 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6168 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6169 --zSig1;
6170 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6171 term3 |= 1;
6172 term2 |= doubleZSig0;
6173 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6174 }
6175 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6176 }
6177 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6178 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6179
6180}
6181
6182/*----------------------------------------------------------------------------
6183| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6184| the corresponding value `b', and 0 otherwise. The invalid exception is
6185| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6186| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6187*----------------------------------------------------------------------------*/
6188
b689362d 6189int float128_eq( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6190{
6191
6192 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6193 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6194 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6195 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6196 ) {
b689362d 6197 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
6198 return 0;
6199 }
6200 return
6201 ( a.low == b.low )
6202 && ( ( a.high == b.high )
6203 || ( ( a.low == 0 )
bb98fe42 6204 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6205 );
6206
6207}
6208
6209/*----------------------------------------------------------------------------
6210| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6211| or equal to the corresponding value `b', and 0 otherwise. The invalid
6212| exception is raised if either operand is a NaN. The comparison is performed
6213| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6214*----------------------------------------------------------------------------*/
6215
750afe93 6216int float128_le( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6217{
6218 flag aSign, bSign;
6219
6220 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6221 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6222 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6223 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6224 ) {
6225 float_raise( float_flag_invalid STATUS_VAR);
6226 return 0;
6227 }
6228 aSign = extractFloat128Sign( a );
6229 bSign = extractFloat128Sign( b );
6230 if ( aSign != bSign ) {
6231 return
6232 aSign
bb98fe42 6233 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6234 == 0 );
6235 }
6236 return
6237 aSign ? le128( b.high, b.low, a.high, a.low )
6238 : le128( a.high, a.low, b.high, b.low );
6239
6240}
6241
6242/*----------------------------------------------------------------------------
6243| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6244| the corresponding value `b', and 0 otherwise. The invalid exception is
6245| raised if either operand is a NaN. The comparison is performed according
6246| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6247*----------------------------------------------------------------------------*/
6248
750afe93 6249int float128_lt( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6250{
6251 flag aSign, bSign;
6252
6253 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6254 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6255 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6256 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6257 ) {
6258 float_raise( float_flag_invalid STATUS_VAR);
6259 return 0;
6260 }
6261 aSign = extractFloat128Sign( a );
6262 bSign = extractFloat128Sign( b );
6263 if ( aSign != bSign ) {
6264 return
6265 aSign
bb98fe42 6266 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6267 != 0 );
6268 }
6269 return
6270 aSign ? lt128( b.high, b.low, a.high, a.low )
6271 : lt128( a.high, a.low, b.high, b.low );
6272
6273}
6274
67b7861d
AJ
6275/*----------------------------------------------------------------------------
6276| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6277| be compared, and 0 otherwise. The invalid exception is raised if either
6278| operand is a NaN. The comparison is performed according to the IEC/IEEE
6279| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6280*----------------------------------------------------------------------------*/
6281
6282int float128_unordered( float128 a, float128 b STATUS_PARAM )
6283{
6284 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6285 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6286 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6287 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6288 ) {
6289 float_raise( float_flag_invalid STATUS_VAR);
6290 return 1;
6291 }
6292 return 0;
6293}
6294
158142c2
FB
6295/*----------------------------------------------------------------------------
6296| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6297| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6298| exception. The comparison is performed according to the IEC/IEEE Standard
6299| for Binary Floating-Point Arithmetic.
158142c2
FB
6300*----------------------------------------------------------------------------*/
6301
b689362d 6302int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6303{
6304
6305 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6306 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6307 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6308 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6309 ) {
b689362d
AJ
6310 if ( float128_is_signaling_nan( a )
6311 || float128_is_signaling_nan( b ) ) {
6312 float_raise( float_flag_invalid STATUS_VAR);
6313 }
158142c2
FB
6314 return 0;
6315 }
6316 return
6317 ( a.low == b.low )
6318 && ( ( a.high == b.high )
6319 || ( ( a.low == 0 )
bb98fe42 6320 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6321 );
6322
6323}
6324
6325/*----------------------------------------------------------------------------
6326| Returns 1 if the quadruple-precision floating-point value `a' is less than
6327| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6328| cause an exception. Otherwise, the comparison is performed according to the
6329| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6330*----------------------------------------------------------------------------*/
6331
750afe93 6332int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6333{
6334 flag aSign, bSign;
6335
6336 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6337 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6338 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6339 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6340 ) {
6341 if ( float128_is_signaling_nan( a )
6342 || float128_is_signaling_nan( b ) ) {
6343 float_raise( float_flag_invalid STATUS_VAR);
6344 }
6345 return 0;
6346 }
6347 aSign = extractFloat128Sign( a );
6348 bSign = extractFloat128Sign( b );
6349 if ( aSign != bSign ) {
6350 return
6351 aSign
bb98fe42 6352 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6353 == 0 );
6354 }
6355 return
6356 aSign ? le128( b.high, b.low, a.high, a.low )
6357 : le128( a.high, a.low, b.high, b.low );
6358
6359}
6360
6361/*----------------------------------------------------------------------------
6362| Returns 1 if the quadruple-precision floating-point value `a' is less than
6363| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6364| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6365| Standard for Binary Floating-Point Arithmetic.
6366*----------------------------------------------------------------------------*/
6367
750afe93 6368int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6369{
6370 flag aSign, bSign;
6371
6372 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6373 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6374 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6375 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6376 ) {
6377 if ( float128_is_signaling_nan( a )
6378 || float128_is_signaling_nan( b ) ) {
6379 float_raise( float_flag_invalid STATUS_VAR);
6380 }
6381 return 0;
6382 }
6383 aSign = extractFloat128Sign( a );
6384 bSign = extractFloat128Sign( b );
6385 if ( aSign != bSign ) {
6386 return
6387 aSign
bb98fe42 6388 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6389 != 0 );
6390 }
6391 return
6392 aSign ? lt128( b.high, b.low, a.high, a.low )
6393 : lt128( a.high, a.low, b.high, b.low );
6394
6395}
6396
67b7861d
AJ
6397/*----------------------------------------------------------------------------
6398| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6399| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6400| comparison is performed according to the IEC/IEEE Standard for Binary
6401| Floating-Point Arithmetic.
6402*----------------------------------------------------------------------------*/
6403
6404int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6405{
6406 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6407 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6408 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6409 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6410 ) {
6411 if ( float128_is_signaling_nan( a )
6412 || float128_is_signaling_nan( b ) ) {
6413 float_raise( float_flag_invalid STATUS_VAR);
6414 }
6415 return 1;
6416 }
6417 return 0;
6418}
6419
1d6bda35 6420/* misc functions */
9f8d2a09 6421float32 uint32_to_float32( uint32 a STATUS_PARAM )
1d6bda35
FB
6422{
6423 return int64_to_float32(a STATUS_VAR);
6424}
6425
9f8d2a09 6426float64 uint32_to_float64( uint32 a STATUS_PARAM )
1d6bda35
FB
6427{
6428 return int64_to_float64(a STATUS_VAR);
6429}
6430
9f8d2a09 6431uint32 float32_to_uint32( float32 a STATUS_PARAM )
1d6bda35
FB
6432{
6433 int64_t v;
9f8d2a09 6434 uint32 res;
1d6bda35
FB
6435
6436 v = float32_to_int64(a STATUS_VAR);
6437 if (v < 0) {
6438 res = 0;
6439 float_raise( float_flag_invalid STATUS_VAR);
6440 } else if (v > 0xffffffff) {
6441 res = 0xffffffff;
6442 float_raise( float_flag_invalid STATUS_VAR);
6443 } else {
6444 res = v;
6445 }
6446 return res;
6447}
6448
9f8d2a09 6449uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
1d6bda35
FB
6450{
6451 int64_t v;
9f8d2a09 6452 uint32 res;
1d6bda35
FB
6453
6454 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6455 if (v < 0) {
6456 res = 0;
6457 float_raise( float_flag_invalid STATUS_VAR);
6458 } else if (v > 0xffffffff) {
6459 res = 0xffffffff;
6460 float_raise( float_flag_invalid STATUS_VAR);
6461 } else {
6462 res = v;
6463 }
6464 return res;
6465}
6466
5aea4c58 6467uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
6468{
6469 int64_t v;
5aea4c58 6470 uint_fast16_t res;
cbcef455
PM
6471
6472 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6473 if (v < 0) {
6474 res = 0;
6475 float_raise( float_flag_invalid STATUS_VAR);
6476 } else if (v > 0xffff) {
6477 res = 0xffff;
6478 float_raise( float_flag_invalid STATUS_VAR);
6479 } else {
6480 res = v;
6481 }
6482 return res;
6483}
6484
9f8d2a09 6485uint32 float64_to_uint32( float64 a STATUS_PARAM )
1d6bda35
FB
6486{
6487 int64_t v;
9f8d2a09 6488 uint32 res;
1d6bda35
FB
6489
6490 v = float64_to_int64(a STATUS_VAR);
6491 if (v < 0) {
6492 res = 0;
6493 float_raise( float_flag_invalid STATUS_VAR);
6494 } else if (v > 0xffffffff) {
6495 res = 0xffffffff;
6496 float_raise( float_flag_invalid STATUS_VAR);
6497 } else {
6498 res = v;
6499 }
6500 return res;
6501}
6502
9f8d2a09 6503uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
1d6bda35
FB
6504{
6505 int64_t v;
9f8d2a09 6506 uint32 res;
1d6bda35
FB
6507
6508 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6509 if (v < 0) {
6510 res = 0;
6511 float_raise( float_flag_invalid STATUS_VAR);
6512 } else if (v > 0xffffffff) {
6513 res = 0xffffffff;
6514 float_raise( float_flag_invalid STATUS_VAR);
6515 } else {
6516 res = v;
6517 }
6518 return res;
6519}
6520
5aea4c58 6521uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
6522{
6523 int64_t v;
5aea4c58 6524 uint_fast16_t res;
cbcef455
PM
6525
6526 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6527 if (v < 0) {
6528 res = 0;
6529 float_raise( float_flag_invalid STATUS_VAR);
6530 } else if (v > 0xffff) {
6531 res = 0xffff;
6532 float_raise( float_flag_invalid STATUS_VAR);
6533 } else {
6534 res = v;
6535 }
6536 return res;
6537}
6538
f090c9d4 6539/* FIXME: This looks broken. */
75d62a58
JM
6540uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6541{
6542 int64_t v;
6543
f090c9d4
PB
6544 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6545 v += float64_val(a);
6546 v = float64_to_int64(make_float64(v) STATUS_VAR);
75d62a58
JM
6547
6548 return v - INT64_MIN;
6549}
6550
6551uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6552{
6553 int64_t v;
6554
f090c9d4
PB
6555 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6556 v += float64_val(a);
6557 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
75d62a58
JM
6558
6559 return v - INT64_MIN;
6560}
6561
1d6bda35 6562#define COMPARE(s, nan_exp) \
750afe93 6563INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
1d6bda35
FB
6564 int is_quiet STATUS_PARAM ) \
6565{ \
6566 flag aSign, bSign; \
bb98fe42 6567 uint ## s ## _t av, bv; \
37d18660
PM
6568 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6569 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
1d6bda35
FB
6570 \
6571 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6572 extractFloat ## s ## Frac( a ) ) || \
6573 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6574 extractFloat ## s ## Frac( b ) )) { \
6575 if (!is_quiet || \
6576 float ## s ## _is_signaling_nan( a ) || \
6577 float ## s ## _is_signaling_nan( b ) ) { \
6578 float_raise( float_flag_invalid STATUS_VAR); \
6579 } \
6580 return float_relation_unordered; \
6581 } \
6582 aSign = extractFloat ## s ## Sign( a ); \
6583 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 6584 av = float ## s ## _val(a); \
cd8a2533 6585 bv = float ## s ## _val(b); \
1d6bda35 6586 if ( aSign != bSign ) { \
bb98fe42 6587 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
6588 /* zero case */ \
6589 return float_relation_equal; \
6590 } else { \
6591 return 1 - (2 * aSign); \
6592 } \
6593 } else { \
f090c9d4 6594 if (av == bv) { \
1d6bda35
FB
6595 return float_relation_equal; \
6596 } else { \
f090c9d4 6597 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
6598 } \
6599 } \
6600} \
6601 \
750afe93 6602int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6603{ \
6604 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6605} \
6606 \
750afe93 6607int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6608{ \
6609 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6610}
6611
6612COMPARE(32, 0xff)
6613COMPARE(64, 0x7ff)
9ee6e8bb 6614
f6714d36
AJ
6615INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6616 int is_quiet STATUS_PARAM )
6617{
6618 flag aSign, bSign;
6619
6620 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6621 ( extractFloatx80Frac( a )<<1 ) ) ||
6622 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6623 ( extractFloatx80Frac( b )<<1 ) )) {
6624 if (!is_quiet ||
6625 floatx80_is_signaling_nan( a ) ||
6626 floatx80_is_signaling_nan( b ) ) {
6627 float_raise( float_flag_invalid STATUS_VAR);
6628 }
6629 return float_relation_unordered;
6630 }
6631 aSign = extractFloatx80Sign( a );
6632 bSign = extractFloatx80Sign( b );
6633 if ( aSign != bSign ) {
6634
6635 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6636 ( ( a.low | b.low ) == 0 ) ) {
6637 /* zero case */
6638 return float_relation_equal;
6639 } else {
6640 return 1 - (2 * aSign);
6641 }
6642 } else {
6643 if (a.low == b.low && a.high == b.high) {
6644 return float_relation_equal;
6645 } else {
6646 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6647 }
6648 }
6649}
6650
6651int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6652{
6653 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6654}
6655
6656int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6657{
6658 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6659}
6660
1f587329
BS
6661INLINE int float128_compare_internal( float128 a, float128 b,
6662 int is_quiet STATUS_PARAM )
6663{
6664 flag aSign, bSign;
6665
6666 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6667 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6668 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6669 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6670 if (!is_quiet ||
6671 float128_is_signaling_nan( a ) ||
6672 float128_is_signaling_nan( b ) ) {
6673 float_raise( float_flag_invalid STATUS_VAR);
6674 }
6675 return float_relation_unordered;
6676 }
6677 aSign = extractFloat128Sign( a );
6678 bSign = extractFloat128Sign( b );
6679 if ( aSign != bSign ) {
6680 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6681 /* zero case */
6682 return float_relation_equal;
6683 } else {
6684 return 1 - (2 * aSign);
6685 }
6686 } else {
6687 if (a.low == b.low && a.high == b.high) {
6688 return float_relation_equal;
6689 } else {
6690 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6691 }
6692 }
6693}
6694
6695int float128_compare( float128 a, float128 b STATUS_PARAM )
6696{
6697 return float128_compare_internal(a, b, 0 STATUS_VAR);
6698}
6699
6700int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6701{
6702 return float128_compare_internal(a, b, 1 STATUS_VAR);
6703}
6704
274f1b04
PM
6705/* min() and max() functions. These can't be implemented as
6706 * 'compare and pick one input' because that would mishandle
6707 * NaNs and +0 vs -0.
6708 */
6709#define MINMAX(s, nan_exp) \
6710INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6711 int ismin STATUS_PARAM ) \
6712{ \
6713 flag aSign, bSign; \
6714 uint ## s ## _t av, bv; \
6715 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6716 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6717 if (float ## s ## _is_any_nan(a) || \
6718 float ## s ## _is_any_nan(b)) { \
6719 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6720 } \
6721 aSign = extractFloat ## s ## Sign(a); \
6722 bSign = extractFloat ## s ## Sign(b); \
6723 av = float ## s ## _val(a); \
6724 bv = float ## s ## _val(b); \
6725 if (aSign != bSign) { \
6726 if (ismin) { \
6727 return aSign ? a : b; \
6728 } else { \
6729 return aSign ? b : a; \
6730 } \
6731 } else { \
6732 if (ismin) { \
6733 return (aSign ^ (av < bv)) ? a : b; \
6734 } else { \
6735 return (aSign ^ (av < bv)) ? b : a; \
6736 } \
6737 } \
6738} \
6739 \
6740float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6741{ \
6742 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6743} \
6744 \
6745float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6746{ \
6747 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6748}
6749
6750MINMAX(32, 0xff)
6751MINMAX(64, 0x7ff)
6752
6753
9ee6e8bb
PB
6754/* Multiply A by 2 raised to the power N. */
6755float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6756{
6757 flag aSign;
326b9e98 6758 int16_t aExp;
bb98fe42 6759 uint32_t aSig;
9ee6e8bb 6760
37d18660 6761 a = float32_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
6762 aSig = extractFloat32Frac( a );
6763 aExp = extractFloat32Exp( a );
6764 aSign = extractFloat32Sign( a );
6765
6766 if ( aExp == 0xFF ) {
326b9e98
AJ
6767 if ( aSig ) {
6768 return propagateFloat32NaN( a, a STATUS_VAR );
6769 }
9ee6e8bb
PB
6770 return a;
6771 }
69397542
PB
6772 if ( aExp != 0 )
6773 aSig |= 0x00800000;
6774 else if ( aSig == 0 )
6775 return a;
6776
326b9e98
AJ
6777 if (n > 0x200) {
6778 n = 0x200;
6779 } else if (n < -0x200) {
6780 n = -0x200;
6781 }
6782
69397542
PB
6783 aExp += n - 1;
6784 aSig <<= 7;
6785 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
6786}
6787
6788float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6789{
6790 flag aSign;
326b9e98 6791 int16_t aExp;
bb98fe42 6792 uint64_t aSig;
9ee6e8bb 6793
37d18660 6794 a = float64_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
6795 aSig = extractFloat64Frac( a );
6796 aExp = extractFloat64Exp( a );
6797 aSign = extractFloat64Sign( a );
6798
6799 if ( aExp == 0x7FF ) {
326b9e98
AJ
6800 if ( aSig ) {
6801 return propagateFloat64NaN( a, a STATUS_VAR );
6802 }
9ee6e8bb
PB
6803 return a;
6804 }
69397542
PB
6805 if ( aExp != 0 )
6806 aSig |= LIT64( 0x0010000000000000 );
6807 else if ( aSig == 0 )
6808 return a;
6809
326b9e98
AJ
6810 if (n > 0x1000) {
6811 n = 0x1000;
6812 } else if (n < -0x1000) {
6813 n = -0x1000;
6814 }
6815
69397542
PB
6816 aExp += n - 1;
6817 aSig <<= 10;
6818 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
6819}
6820
9ee6e8bb
PB
6821floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6822{
6823 flag aSign;
326b9e98 6824 int32_t aExp;
bb98fe42 6825 uint64_t aSig;
9ee6e8bb
PB
6826
6827 aSig = extractFloatx80Frac( a );
6828 aExp = extractFloatx80Exp( a );
6829 aSign = extractFloatx80Sign( a );
6830
326b9e98
AJ
6831 if ( aExp == 0x7FFF ) {
6832 if ( aSig<<1 ) {
6833 return propagateFloatx80NaN( a, a STATUS_VAR );
6834 }
9ee6e8bb
PB
6835 return a;
6836 }
326b9e98 6837
69397542
PB
6838 if (aExp == 0 && aSig == 0)
6839 return a;
6840
326b9e98
AJ
6841 if (n > 0x10000) {
6842 n = 0x10000;
6843 } else if (n < -0x10000) {
6844 n = -0x10000;
6845 }
6846
9ee6e8bb 6847 aExp += n;
69397542
PB
6848 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6849 aSign, aExp, aSig, 0 STATUS_VAR );
9ee6e8bb 6850}
9ee6e8bb 6851
9ee6e8bb
PB
6852float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6853{
6854 flag aSign;
326b9e98 6855 int32_t aExp;
bb98fe42 6856 uint64_t aSig0, aSig1;
9ee6e8bb
PB
6857
6858 aSig1 = extractFloat128Frac1( a );
6859 aSig0 = extractFloat128Frac0( a );
6860 aExp = extractFloat128Exp( a );
6861 aSign = extractFloat128Sign( a );
6862 if ( aExp == 0x7FFF ) {
326b9e98
AJ
6863 if ( aSig0 | aSig1 ) {
6864 return propagateFloat128NaN( a, a STATUS_VAR );
6865 }
9ee6e8bb
PB
6866 return a;
6867 }
69397542
PB
6868 if ( aExp != 0 )
6869 aSig0 |= LIT64( 0x0001000000000000 );
6870 else if ( aSig0 == 0 && aSig1 == 0 )
6871 return a;
6872
326b9e98
AJ
6873 if (n > 0x10000) {
6874 n = 0x10000;
6875 } else if (n < -0x10000) {
6876 n = -0x10000;
6877 }
6878
69397542
PB
6879 aExp += n - 1;
6880 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6881 STATUS_VAR );
9ee6e8bb
PB
6882
6883}