]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
softfloat: Fix exception flag handling for float32_to_float16()
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
158142c2
FB
6
7/*============================================================================
8
9This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10Package, Release 2b.
11
12Written by John R. Hauser. This work was made possible in part by the
13International Computer Science Institute, located at Suite 600, 1947 Center
14Street, Berkeley, California 94704. Funding was partially provided by the
15National Science Foundation under grant MIP-9311980. The original version
16of this code was written as part of a project to build a fixed-point vector
17processor in collaboration with the University of California at Berkeley,
18overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20arithmetic/SoftFloat.html'.
21
22THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31Derivative works are acceptable, even for commercial purposes, so long as
32(1) the source code for the derivative work includes prominent notice that
33the work is derivative, and (2) the source code includes prominent notice with
34these four paragraphs for those parts of this code that are retained.
35
36=============================================================================*/
37
2ac8bd03
PM
38/* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
40 */
41#include "config.h"
42
6b4c305c 43#include "fpu/softfloat.h"
158142c2
FB
44
45/*----------------------------------------------------------------------------
46| Primitive arithmetic functions, including multi-word arithmetic, and
47| division and square root approximations. (Can be specialized to target if
48| desired.)
49*----------------------------------------------------------------------------*/
50#include "softfloat-macros.h"
51
52/*----------------------------------------------------------------------------
53| Functions and definitions to determine: (1) whether tininess for underflow
54| is detected before or after rounding by default, (2) what (if anything)
55| happens when exceptions are raised, (3) how signaling NaNs are distinguished
56| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57| are propagated from function inputs to output. These details are target-
58| specific.
59*----------------------------------------------------------------------------*/
60#include "softfloat-specialize.h"
61
62void set_float_rounding_mode(int val STATUS_PARAM)
63{
64 STATUS(float_rounding_mode) = val;
65}
66
1d6bda35
FB
67void set_float_exception_flags(int val STATUS_PARAM)
68{
69 STATUS(float_exception_flags) = val;
70}
71
158142c2
FB
72void set_floatx80_rounding_precision(int val STATUS_PARAM)
73{
74 STATUS(floatx80_rounding_precision) = val;
75}
158142c2 76
bb4d4bb3
PM
77/*----------------------------------------------------------------------------
78| Returns the fraction bits of the half-precision floating-point value `a'.
79*----------------------------------------------------------------------------*/
80
81INLINE uint32_t extractFloat16Frac(float16 a)
82{
83 return float16_val(a) & 0x3ff;
84}
85
86/*----------------------------------------------------------------------------
87| Returns the exponent bits of the half-precision floating-point value `a'.
88*----------------------------------------------------------------------------*/
89
94a49d86 90INLINE int_fast16_t extractFloat16Exp(float16 a)
bb4d4bb3
PM
91{
92 return (float16_val(a) >> 10) & 0x1f;
93}
94
95/*----------------------------------------------------------------------------
96| Returns the sign bit of the single-precision floating-point value `a'.
97*----------------------------------------------------------------------------*/
98
99INLINE flag extractFloat16Sign(float16 a)
100{
101 return float16_val(a)>>15;
102}
103
158142c2
FB
104/*----------------------------------------------------------------------------
105| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106| and 7, and returns the properly rounded 32-bit integer corresponding to the
107| input. If `zSign' is 1, the input is negated before being converted to an
108| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
109| is simply rounded to an integer, with the inexact exception raised if the
110| input cannot be represented exactly as an integer. However, if the fixed-
111| point input is too large, the invalid exception is raised and the largest
112| positive or negative integer is returned.
113*----------------------------------------------------------------------------*/
114
bb98fe42 115static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
158142c2
FB
116{
117 int8 roundingMode;
118 flag roundNearestEven;
119 int8 roundIncrement, roundBits;
760e1416 120 int32_t z;
158142c2
FB
121
122 roundingMode = STATUS(float_rounding_mode);
123 roundNearestEven = ( roundingMode == float_round_nearest_even );
124 roundIncrement = 0x40;
125 if ( ! roundNearestEven ) {
126 if ( roundingMode == float_round_to_zero ) {
127 roundIncrement = 0;
128 }
129 else {
130 roundIncrement = 0x7F;
131 if ( zSign ) {
132 if ( roundingMode == float_round_up ) roundIncrement = 0;
133 }
134 else {
135 if ( roundingMode == float_round_down ) roundIncrement = 0;
136 }
137 }
138 }
139 roundBits = absZ & 0x7F;
140 absZ = ( absZ + roundIncrement )>>7;
141 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
142 z = absZ;
143 if ( zSign ) z = - z;
144 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 146 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
147 }
148 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
149 return z;
150
151}
152
153/*----------------------------------------------------------------------------
154| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155| `absZ1', with binary point between bits 63 and 64 (between the input words),
156| and returns the properly rounded 64-bit integer corresponding to the input.
157| If `zSign' is 1, the input is negated before being converted to an integer.
158| Ordinarily, the fixed-point input is simply rounded to an integer, with
159| the inexact exception raised if the input cannot be represented exactly as
160| an integer. However, if the fixed-point input is too large, the invalid
161| exception is raised and the largest positive or negative integer is
162| returned.
163*----------------------------------------------------------------------------*/
164
bb98fe42 165static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
158142c2
FB
166{
167 int8 roundingMode;
168 flag roundNearestEven, increment;
760e1416 169 int64_t z;
158142c2
FB
170
171 roundingMode = STATUS(float_rounding_mode);
172 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 173 increment = ( (int64_t) absZ1 < 0 );
158142c2
FB
174 if ( ! roundNearestEven ) {
175 if ( roundingMode == float_round_to_zero ) {
176 increment = 0;
177 }
178 else {
179 if ( zSign ) {
180 increment = ( roundingMode == float_round_down ) && absZ1;
181 }
182 else {
183 increment = ( roundingMode == float_round_up ) && absZ1;
184 }
185 }
186 }
187 if ( increment ) {
188 ++absZ0;
189 if ( absZ0 == 0 ) goto overflow;
bb98fe42 190 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
191 }
192 z = absZ0;
193 if ( zSign ) z = - z;
194 if ( z && ( ( z < 0 ) ^ zSign ) ) {
195 overflow:
196 float_raise( float_flag_invalid STATUS_VAR);
197 return
bb98fe42 198 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
199 : LIT64( 0x7FFFFFFFFFFFFFFF );
200 }
201 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
202 return z;
203
204}
205
206/*----------------------------------------------------------------------------
207| Returns the fraction bits of the single-precision floating-point value `a'.
208*----------------------------------------------------------------------------*/
209
bb98fe42 210INLINE uint32_t extractFloat32Frac( float32 a )
158142c2
FB
211{
212
f090c9d4 213 return float32_val(a) & 0x007FFFFF;
158142c2
FB
214
215}
216
217/*----------------------------------------------------------------------------
218| Returns the exponent bits of the single-precision floating-point value `a'.
219*----------------------------------------------------------------------------*/
220
94a49d86 221INLINE int_fast16_t extractFloat32Exp(float32 a)
158142c2
FB
222{
223
f090c9d4 224 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
225
226}
227
228/*----------------------------------------------------------------------------
229| Returns the sign bit of the single-precision floating-point value `a'.
230*----------------------------------------------------------------------------*/
231
232INLINE flag extractFloat32Sign( float32 a )
233{
234
f090c9d4 235 return float32_val(a)>>31;
158142c2
FB
236
237}
238
37d18660
PM
239/*----------------------------------------------------------------------------
240| If `a' is denormal and we are in flush-to-zero mode then set the
241| input-denormal exception and return zero. Otherwise just return the value.
242*----------------------------------------------------------------------------*/
243static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
244{
245 if (STATUS(flush_inputs_to_zero)) {
246 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
247 float_raise(float_flag_input_denormal STATUS_VAR);
248 return make_float32(float32_val(a) & 0x80000000);
249 }
250 }
251 return a;
252}
253
158142c2
FB
254/*----------------------------------------------------------------------------
255| Normalizes the subnormal single-precision floating-point value represented
256| by the denormalized significand `aSig'. The normalized exponent and
257| significand are stored at the locations pointed to by `zExpPtr' and
258| `zSigPtr', respectively.
259*----------------------------------------------------------------------------*/
260
261static void
94a49d86 262 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
158142c2
FB
263{
264 int8 shiftCount;
265
266 shiftCount = countLeadingZeros32( aSig ) - 8;
267 *zSigPtr = aSig<<shiftCount;
268 *zExpPtr = 1 - shiftCount;
269
270}
271
272/*----------------------------------------------------------------------------
273| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274| single-precision floating-point value, returning the result. After being
275| shifted into the proper positions, the three fields are simply added
276| together to form the result. This means that any integer portion of `zSig'
277| will be added into the exponent. Since a properly normalized significand
278| will have an integer portion equal to 1, the `zExp' input should be 1 less
279| than the desired result exponent whenever `zSig' is a complete, normalized
280| significand.
281*----------------------------------------------------------------------------*/
282
94a49d86 283INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
158142c2
FB
284{
285
f090c9d4 286 return make_float32(
bb98fe42 287 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
288
289}
290
291/*----------------------------------------------------------------------------
292| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293| and significand `zSig', and returns the proper single-precision floating-
294| point value corresponding to the abstract input. Ordinarily, the abstract
295| value is simply rounded and packed into the single-precision format, with
296| the inexact exception raised if the abstract input cannot be represented
297| exactly. However, if the abstract value is too large, the overflow and
298| inexact exceptions are raised and an infinity or maximal finite value is
299| returned. If the abstract value is too small, the input value is rounded to
300| a subnormal number, and the underflow and inexact exceptions are raised if
301| the abstract input cannot be represented exactly as a subnormal single-
302| precision floating-point number.
303| The input significand `zSig' has its binary point between bits 30
304| and 29, which is 7 bits to the left of the usual location. This shifted
305| significand must be normalized or smaller. If `zSig' is not normalized,
306| `zExp' must be 0; in that case, the result returned is a subnormal number,
307| and it must not require rounding. In the usual case that `zSig' is
308| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309| The handling of underflow and overflow follows the IEC/IEEE Standard for
310| Binary Floating-Point Arithmetic.
311*----------------------------------------------------------------------------*/
312
94a49d86 313static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
314{
315 int8 roundingMode;
316 flag roundNearestEven;
317 int8 roundIncrement, roundBits;
318 flag isTiny;
319
320 roundingMode = STATUS(float_rounding_mode);
321 roundNearestEven = ( roundingMode == float_round_nearest_even );
322 roundIncrement = 0x40;
323 if ( ! roundNearestEven ) {
324 if ( roundingMode == float_round_to_zero ) {
325 roundIncrement = 0;
326 }
327 else {
328 roundIncrement = 0x7F;
329 if ( zSign ) {
330 if ( roundingMode == float_round_up ) roundIncrement = 0;
331 }
332 else {
333 if ( roundingMode == float_round_down ) roundIncrement = 0;
334 }
335 }
336 }
337 roundBits = zSig & 0x7F;
bb98fe42 338 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
339 if ( ( 0xFD < zExp )
340 || ( ( zExp == 0xFD )
bb98fe42 341 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
342 ) {
343 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 344 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
345 }
346 if ( zExp < 0 ) {
e6afc87f
PM
347 if (STATUS(flush_to_zero)) {
348 float_raise(float_flag_output_denormal STATUS_VAR);
349 return packFloat32(zSign, 0, 0);
350 }
158142c2
FB
351 isTiny =
352 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
353 || ( zExp < -1 )
354 || ( zSig + roundIncrement < 0x80000000 );
355 shift32RightJamming( zSig, - zExp, &zSig );
356 zExp = 0;
357 roundBits = zSig & 0x7F;
358 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
359 }
360 }
361 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
362 zSig = ( zSig + roundIncrement )>>7;
363 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
364 if ( zSig == 0 ) zExp = 0;
365 return packFloat32( zSign, zExp, zSig );
366
367}
368
369/*----------------------------------------------------------------------------
370| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371| and significand `zSig', and returns the proper single-precision floating-
372| point value corresponding to the abstract input. This routine is just like
373| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
374| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
375| floating-point exponent.
376*----------------------------------------------------------------------------*/
377
378static float32
94a49d86 379 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
380{
381 int8 shiftCount;
382
383 shiftCount = countLeadingZeros32( zSig ) - 1;
384 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
385
386}
387
388/*----------------------------------------------------------------------------
389| Returns the fraction bits of the double-precision floating-point value `a'.
390*----------------------------------------------------------------------------*/
391
bb98fe42 392INLINE uint64_t extractFloat64Frac( float64 a )
158142c2
FB
393{
394
f090c9d4 395 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
396
397}
398
399/*----------------------------------------------------------------------------
400| Returns the exponent bits of the double-precision floating-point value `a'.
401*----------------------------------------------------------------------------*/
402
94a49d86 403INLINE int_fast16_t extractFloat64Exp(float64 a)
158142c2
FB
404{
405
f090c9d4 406 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
407
408}
409
410/*----------------------------------------------------------------------------
411| Returns the sign bit of the double-precision floating-point value `a'.
412*----------------------------------------------------------------------------*/
413
414INLINE flag extractFloat64Sign( float64 a )
415{
416
f090c9d4 417 return float64_val(a)>>63;
158142c2
FB
418
419}
420
37d18660
PM
421/*----------------------------------------------------------------------------
422| If `a' is denormal and we are in flush-to-zero mode then set the
423| input-denormal exception and return zero. Otherwise just return the value.
424*----------------------------------------------------------------------------*/
425static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
426{
427 if (STATUS(flush_inputs_to_zero)) {
428 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
429 float_raise(float_flag_input_denormal STATUS_VAR);
430 return make_float64(float64_val(a) & (1ULL << 63));
431 }
432 }
433 return a;
434}
435
158142c2
FB
436/*----------------------------------------------------------------------------
437| Normalizes the subnormal double-precision floating-point value represented
438| by the denormalized significand `aSig'. The normalized exponent and
439| significand are stored at the locations pointed to by `zExpPtr' and
440| `zSigPtr', respectively.
441*----------------------------------------------------------------------------*/
442
443static void
94a49d86 444 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
158142c2
FB
445{
446 int8 shiftCount;
447
448 shiftCount = countLeadingZeros64( aSig ) - 11;
449 *zSigPtr = aSig<<shiftCount;
450 *zExpPtr = 1 - shiftCount;
451
452}
453
454/*----------------------------------------------------------------------------
455| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
456| double-precision floating-point value, returning the result. After being
457| shifted into the proper positions, the three fields are simply added
458| together to form the result. This means that any integer portion of `zSig'
459| will be added into the exponent. Since a properly normalized significand
460| will have an integer portion equal to 1, the `zExp' input should be 1 less
461| than the desired result exponent whenever `zSig' is a complete, normalized
462| significand.
463*----------------------------------------------------------------------------*/
464
94a49d86 465INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
158142c2
FB
466{
467
f090c9d4 468 return make_float64(
bb98fe42 469 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
470
471}
472
473/*----------------------------------------------------------------------------
474| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
475| and significand `zSig', and returns the proper double-precision floating-
476| point value corresponding to the abstract input. Ordinarily, the abstract
477| value is simply rounded and packed into the double-precision format, with
478| the inexact exception raised if the abstract input cannot be represented
479| exactly. However, if the abstract value is too large, the overflow and
480| inexact exceptions are raised and an infinity or maximal finite value is
481| returned. If the abstract value is too small, the input value is rounded
482| to a subnormal number, and the underflow and inexact exceptions are raised
483| if the abstract input cannot be represented exactly as a subnormal double-
484| precision floating-point number.
485| The input significand `zSig' has its binary point between bits 62
486| and 61, which is 10 bits to the left of the usual location. This shifted
487| significand must be normalized or smaller. If `zSig' is not normalized,
488| `zExp' must be 0; in that case, the result returned is a subnormal number,
489| and it must not require rounding. In the usual case that `zSig' is
490| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
491| The handling of underflow and overflow follows the IEC/IEEE Standard for
492| Binary Floating-Point Arithmetic.
493*----------------------------------------------------------------------------*/
494
94a49d86 495static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
496{
497 int8 roundingMode;
498 flag roundNearestEven;
94a49d86 499 int_fast16_t roundIncrement, roundBits;
158142c2
FB
500 flag isTiny;
501
502 roundingMode = STATUS(float_rounding_mode);
503 roundNearestEven = ( roundingMode == float_round_nearest_even );
504 roundIncrement = 0x200;
505 if ( ! roundNearestEven ) {
506 if ( roundingMode == float_round_to_zero ) {
507 roundIncrement = 0;
508 }
509 else {
510 roundIncrement = 0x3FF;
511 if ( zSign ) {
512 if ( roundingMode == float_round_up ) roundIncrement = 0;
513 }
514 else {
515 if ( roundingMode == float_round_down ) roundIncrement = 0;
516 }
517 }
518 }
519 roundBits = zSig & 0x3FF;
bb98fe42 520 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
521 if ( ( 0x7FD < zExp )
522 || ( ( zExp == 0x7FD )
bb98fe42 523 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
524 ) {
525 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 526 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
527 }
528 if ( zExp < 0 ) {
e6afc87f
PM
529 if (STATUS(flush_to_zero)) {
530 float_raise(float_flag_output_denormal STATUS_VAR);
531 return packFloat64(zSign, 0, 0);
532 }
158142c2
FB
533 isTiny =
534 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
535 || ( zExp < -1 )
536 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
537 shift64RightJamming( zSig, - zExp, &zSig );
538 zExp = 0;
539 roundBits = zSig & 0x3FF;
540 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
541 }
542 }
543 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
544 zSig = ( zSig + roundIncrement )>>10;
545 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
546 if ( zSig == 0 ) zExp = 0;
547 return packFloat64( zSign, zExp, zSig );
548
549}
550
551/*----------------------------------------------------------------------------
552| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
553| and significand `zSig', and returns the proper double-precision floating-
554| point value corresponding to the abstract input. This routine is just like
555| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
556| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
557| floating-point exponent.
558*----------------------------------------------------------------------------*/
559
560static float64
94a49d86 561 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
562{
563 int8 shiftCount;
564
565 shiftCount = countLeadingZeros64( zSig ) - 1;
566 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
567
568}
569
158142c2
FB
570/*----------------------------------------------------------------------------
571| Returns the fraction bits of the extended double-precision floating-point
572| value `a'.
573*----------------------------------------------------------------------------*/
574
bb98fe42 575INLINE uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
576{
577
578 return a.low;
579
580}
581
582/*----------------------------------------------------------------------------
583| Returns the exponent bits of the extended double-precision floating-point
584| value `a'.
585*----------------------------------------------------------------------------*/
586
587INLINE int32 extractFloatx80Exp( floatx80 a )
588{
589
590 return a.high & 0x7FFF;
591
592}
593
594/*----------------------------------------------------------------------------
595| Returns the sign bit of the extended double-precision floating-point value
596| `a'.
597*----------------------------------------------------------------------------*/
598
599INLINE flag extractFloatx80Sign( floatx80 a )
600{
601
602 return a.high>>15;
603
604}
605
606/*----------------------------------------------------------------------------
607| Normalizes the subnormal extended double-precision floating-point value
608| represented by the denormalized significand `aSig'. The normalized exponent
609| and significand are stored at the locations pointed to by `zExpPtr' and
610| `zSigPtr', respectively.
611*----------------------------------------------------------------------------*/
612
613static void
bb98fe42 614 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
615{
616 int8 shiftCount;
617
618 shiftCount = countLeadingZeros64( aSig );
619 *zSigPtr = aSig<<shiftCount;
620 *zExpPtr = 1 - shiftCount;
621
622}
623
624/*----------------------------------------------------------------------------
625| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
626| extended double-precision floating-point value, returning the result.
627*----------------------------------------------------------------------------*/
628
bb98fe42 629INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
158142c2
FB
630{
631 floatx80 z;
632
633 z.low = zSig;
bb98fe42 634 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
635 return z;
636
637}
638
639/*----------------------------------------------------------------------------
640| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
641| and extended significand formed by the concatenation of `zSig0' and `zSig1',
642| and returns the proper extended double-precision floating-point value
643| corresponding to the abstract input. Ordinarily, the abstract value is
644| rounded and packed into the extended double-precision format, with the
645| inexact exception raised if the abstract input cannot be represented
646| exactly. However, if the abstract value is too large, the overflow and
647| inexact exceptions are raised and an infinity or maximal finite value is
648| returned. If the abstract value is too small, the input value is rounded to
649| a subnormal number, and the underflow and inexact exceptions are raised if
650| the abstract input cannot be represented exactly as a subnormal extended
651| double-precision floating-point number.
652| If `roundingPrecision' is 32 or 64, the result is rounded to the same
653| number of bits as single or double precision, respectively. Otherwise, the
654| result is rounded to the full precision of the extended double-precision
655| format.
656| The input significand must be normalized or smaller. If the input
657| significand is not normalized, `zExp' must be 0; in that case, the result
658| returned is a subnormal number, and it must not require rounding. The
659| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
660| Floating-Point Arithmetic.
661*----------------------------------------------------------------------------*/
662
663static floatx80
664 roundAndPackFloatx80(
bb98fe42 665 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
666 STATUS_PARAM)
667{
668 int8 roundingMode;
669 flag roundNearestEven, increment, isTiny;
670 int64 roundIncrement, roundMask, roundBits;
671
672 roundingMode = STATUS(float_rounding_mode);
673 roundNearestEven = ( roundingMode == float_round_nearest_even );
/* Full 80-bit precision is handled by the second half of the function. */
674 if ( roundingPrecision == 80 ) goto precision80;
/* Reduced precision: roundMask selects the low bits of zSig0 that are
   rounded away; roundIncrement starts as half of one unit in the last
   kept place (the round-to-nearest increment). */
675 if ( roundingPrecision == 64 ) {
676 roundIncrement = LIT64( 0x0000000000000400 );
677 roundMask = LIT64( 0x00000000000007FF );
678 }
679 else if ( roundingPrecision == 32 ) {
680 roundIncrement = LIT64( 0x0000008000000000 );
681 roundMask = LIT64( 0x000000FFFFFFFFFF );
682 }
683 else {
/* Any other precision value is treated as full 80-bit precision. */
684 goto precision80;
685 }
/* Fold any nonzero bits of zSig1 into bit 0 of zSig0 as a sticky bit. */
686 zSig0 |= ( zSig1 != 0 );
/* For modes other than nearest-even the increment is either 0 (truncate)
   or the whole roundMask (round the magnitude up on any discarded bit). */
687 if ( ! roundNearestEven ) {
688 if ( roundingMode == float_round_to_zero ) {
689 roundIncrement = 0;
690 }
691 else {
692 roundIncrement = roundMask;
693 if ( zSign ) {
694 if ( roundingMode == float_round_up ) roundIncrement = 0;
695 }
696 else {
697 if ( roundingMode == float_round_down ) roundIncrement = 0;
698 }
699 }
700 }
701 roundBits = zSig0 & roundMask;
/* The unsigned comparison is true when zExp <= 0 or zExp >= 0x7FFE, i.e.
   for both the underflow and the overflow candidates. */
bb98fe42 702 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
/* Overflow when the exponent is too large, or is the maximum and adding
   the increment wraps zSig0 around. */
703 if ( ( 0x7FFE < zExp )
704 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
705 ) {
706 goto overflow;
707 }
708 if ( zExp <= 0 ) {
e6afc87f
PM
/* Flush-to-zero: return a signed zero and raise output-denormal. */
709 if (STATUS(flush_to_zero)) {
710 float_raise(float_flag_output_denormal STATUS_VAR);
711 return packFloatx80(zSign, 0, 0);
712 }
158142c2
FB
713 isTiny =
714 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
715 || ( zExp < 0 )
716 || ( zSig0 <= zSig0 + roundIncrement );
/* Shift the significand right by 1 - zExp using shift64RightJamming
   before rounding it with exponent field 0. */
717 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
718 zExp = 0;
719 roundBits = zSig0 & roundMask;
720 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
721 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
722 zSig0 += roundIncrement;
/* Bit 63 became set after rounding, so the exponent field becomes 1. */
bb98fe42 723 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
/* roundMask + 1 is the lowest kept bit; on an exact tie under
   nearest-even that bit is cleared together with the discarded bits. */
724 roundIncrement = roundMask + 1;
725 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
726 roundMask |= roundIncrement;
727 }
728 zSig0 &= ~ roundMask;
729 return packFloatx80( zSign, zExp, zSig0 );
730 }
731 }
732 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
733 zSig0 += roundIncrement;
/* The addition wrapped around: bump the exponent and set the significand
   to 0x8000000000000000. */
734 if ( zSig0 < roundIncrement ) {
735 ++zExp;
736 zSig0 = LIT64( 0x8000000000000000 );
737 }
/* Same tie handling as in the subnormal branch above. */
738 roundIncrement = roundMask + 1;
739 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
740 roundMask |= roundIncrement;
741 }
742 zSig0 &= ~ roundMask;
743 if ( zSig0 == 0 ) zExp = 0;
744 return packFloatx80( zSign, zExp, zSig0 );
/* Full precision: zSig0 is kept whole and zSig1 holds the bits to be
   rounded away.  Under nearest-even, round up when the top bit of zSig1
   is set; under a directed mode, round up only for a nonzero zSig1 in
   the mode matching the sign. */
745 precision80:
bb98fe42 746 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
747 if ( ! roundNearestEven ) {
748 if ( roundingMode == float_round_to_zero ) {
749 increment = 0;
750 }
751 else {
752 if ( zSign ) {
753 increment = ( roundingMode == float_round_down ) && zSig1;
754 }
755 else {
756 increment = ( roundingMode == float_round_up ) && zSig1;
757 }
758 }
759 }
bb98fe42 760 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
761 if ( ( 0x7FFE < zExp )
762 || ( ( zExp == 0x7FFE )
763 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
764 && increment
765 )
766 ) {
/* With roundMask == 0, ~roundMask below is an all-ones significand. */
767 roundMask = 0;
/* Also reached by goto from the reduced-precision path, in which case
   roundMask still holds that path's mask. */
768 overflow:
769 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
/* Modes that round toward zero for this sign return the value with
   exponent 0x7FFE and significand ~roundMask; the other modes return
   exponent 0x7FFF with significand 0x8000000000000000. */
770 if ( ( roundingMode == float_round_to_zero )
771 || ( zSign && ( roundingMode == float_round_up ) )
772 || ( ! zSign && ( roundingMode == float_round_down ) )
773 ) {
774 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
775 }
776 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
777 }
/* NOTE(review): unlike the reduced-precision path, this branch does not
   test STATUS(flush_to_zero) -- confirm that this is intended. */
778 if ( zExp <= 0 ) {
779 isTiny =
780 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
781 || ( zExp < 0 )
782 || ! increment
783 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
784 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
785 zExp = 0;
786 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
787 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
/* Recompute the rounding decision from the shifted zSig1. */
788 if ( roundNearestEven ) {
bb98fe42 789 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
790 }
791 else {
792 if ( zSign ) {
793 increment = ( roundingMode == float_round_down ) && zSig1;
794 }
795 else {
796 increment = ( roundingMode == float_round_up ) && zSig1;
797 }
798 }
799 if ( increment ) {
800 ++zSig0;
/* Under nearest-even, zSig1 with only its top bit set is an exact tie:
   clear bit 0 so the result is even. */
801 zSig0 &=
bb98fe42
AF
802 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
/* Bit 63 became set after rounding, so the exponent field becomes 1. */
803 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
804 }
805 return packFloatx80( zSign, zExp, zSig0 );
806 }
807 }
808 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
809 if ( increment ) {
810 ++zSig0;
/* The increment wrapped zSig0 to zero: bump the exponent and set the
   significand to 0x8000000000000000. */
811 if ( zSig0 == 0 ) {
812 ++zExp;
813 zSig0 = LIT64( 0x8000000000000000 );
814 }
815 else {
/* Exact tie under nearest-even: clear bit 0 so the result is even. */
bb98fe42 816 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
817 }
818 }
819 else {
820 if ( zSig0 == 0 ) zExp = 0;
821 }
822 return packFloatx80( zSign, zExp, zSig0 );
823
824}
825
826/*----------------------------------------------------------------------------
827| Takes an abstract floating-point value having sign `zSign', exponent
828| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
829| and returns the proper extended double-precision floating-point value
830| corresponding to the abstract input. This routine is just like
831| `roundAndPackFloatx80' except that the input significand does not have to be
832| normalized.
833*----------------------------------------------------------------------------*/
834
835static floatx80
836 normalizeRoundAndPackFloatx80(
bb98fe42 837 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
838 STATUS_PARAM)
839{
840 int8 shiftCount;
841
842 if ( zSig0 == 0 ) {
843 zSig0 = zSig1;
844 zSig1 = 0;
845 zExp -= 64;
846 }
847 shiftCount = countLeadingZeros64( zSig0 );
848 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
849 zExp -= shiftCount;
850 return
851 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
852
853}
854
158142c2
FB
855/*----------------------------------------------------------------------------
856| Returns the least-significant 64 fraction bits of the quadruple-precision
857| floating-point value `a'.
858*----------------------------------------------------------------------------*/
859
bb98fe42 860INLINE uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
861{
862
863 return a.low;
864
865}
866
867/*----------------------------------------------------------------------------
868| Returns the most-significant 48 fraction bits of the quadruple-precision
869| floating-point value `a'.
870*----------------------------------------------------------------------------*/
871
bb98fe42 872INLINE uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
873{
874
875 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
876
877}
878
879/*----------------------------------------------------------------------------
880| Returns the exponent bits of the quadruple-precision floating-point value
881| `a'.
882*----------------------------------------------------------------------------*/
883
884INLINE int32 extractFloat128Exp( float128 a )
885{
886
887 return ( a.high>>48 ) & 0x7FFF;
888
889}
890
891/*----------------------------------------------------------------------------
892| Returns the sign bit of the quadruple-precision floating-point value `a'.
893*----------------------------------------------------------------------------*/
894
895INLINE flag extractFloat128Sign( float128 a )
896{
897
898 return a.high>>63;
899
900}
901
902/*----------------------------------------------------------------------------
903| Normalizes the subnormal quadruple-precision floating-point value
904| represented by the denormalized significand formed by the concatenation of
905| `aSig0' and `aSig1'. The normalized exponent is stored at the location
906| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
907| significand are stored at the location pointed to by `zSig0Ptr', and the
908| least significant 64 bits of the normalized significand are stored at the
909| location pointed to by `zSig1Ptr'.
910*----------------------------------------------------------------------------*/
911
912static void
913 normalizeFloat128Subnormal(
bb98fe42
AF
914 uint64_t aSig0,
915 uint64_t aSig1,
158142c2 916 int32 *zExpPtr,
bb98fe42
AF
917 uint64_t *zSig0Ptr,
918 uint64_t *zSig1Ptr
158142c2
FB
919 )
920{
921 int8 shiftCount;
922
923 if ( aSig0 == 0 ) {
924 shiftCount = countLeadingZeros64( aSig1 ) - 15;
925 if ( shiftCount < 0 ) {
926 *zSig0Ptr = aSig1>>( - shiftCount );
927 *zSig1Ptr = aSig1<<( shiftCount & 63 );
928 }
929 else {
930 *zSig0Ptr = aSig1<<shiftCount;
931 *zSig1Ptr = 0;
932 }
933 *zExpPtr = - shiftCount - 63;
934 }
935 else {
936 shiftCount = countLeadingZeros64( aSig0 ) - 15;
937 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
938 *zExpPtr = 1 - shiftCount;
939 }
940
941}
942
943/*----------------------------------------------------------------------------
944| Packs the sign `zSign', the exponent `zExp', and the significand formed
945| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
946| floating-point value, returning the result. After being shifted into the
947| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
948| added together to form the most significant 64 bits of the result. This
949| means that any integer portion of `zSig0' will be added into the exponent.
950| Since a properly normalized significand will have an integer portion equal
951| to 1, the `zExp' input should be 1 less than the desired result exponent
952| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
953| significand.
954*----------------------------------------------------------------------------*/
955
956INLINE float128
bb98fe42 957 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
958{
959 float128 z;
960
961 z.low = zSig1;
bb98fe42 962 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
963 return z;
964
965}
966
967/*----------------------------------------------------------------------------
968| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
969| and extended significand formed by the concatenation of `zSig0', `zSig1',
970| and `zSig2', and returns the proper quadruple-precision floating-point value
971| corresponding to the abstract input. Ordinarily, the abstract value is
972| simply rounded and packed into the quadruple-precision format, with the
973| inexact exception raised if the abstract input cannot be represented
974| exactly. However, if the abstract value is too large, the overflow and
975| inexact exceptions are raised and an infinity or maximal finite value is
976| returned. If the abstract value is too small, the input value is rounded to
977| a subnormal number, and the underflow and inexact exceptions are raised if
978| the abstract input cannot be represented exactly as a subnormal quadruple-
979| precision floating-point number.
980| The input significand must be normalized or smaller. If the input
981| significand is not normalized, `zExp' must be 0; in that case, the result
982| returned is a subnormal number, and it must not require rounding. In the
983| usual case that the input significand is normalized, `zExp' must be 1 less
984| than the ``true'' floating-point exponent. The handling of underflow and
985| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
986*----------------------------------------------------------------------------*/
987
988static float128
989 roundAndPackFloat128(
bb98fe42 990 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
158142c2
FB
991{
992 int8 roundingMode;
993 flag roundNearestEven, increment, isTiny;
994
995 roundingMode = STATUS(float_rounding_mode);
996 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 997 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
998 if ( ! roundNearestEven ) {
999 if ( roundingMode == float_round_to_zero ) {
1000 increment = 0;
1001 }
1002 else {
1003 if ( zSign ) {
1004 increment = ( roundingMode == float_round_down ) && zSig2;
1005 }
1006 else {
1007 increment = ( roundingMode == float_round_up ) && zSig2;
1008 }
1009 }
1010 }
bb98fe42 1011 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1012 if ( ( 0x7FFD < zExp )
1013 || ( ( zExp == 0x7FFD )
1014 && eq128(
1015 LIT64( 0x0001FFFFFFFFFFFF ),
1016 LIT64( 0xFFFFFFFFFFFFFFFF ),
1017 zSig0,
1018 zSig1
1019 )
1020 && increment
1021 )
1022 ) {
1023 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024 if ( ( roundingMode == float_round_to_zero )
1025 || ( zSign && ( roundingMode == float_round_up ) )
1026 || ( ! zSign && ( roundingMode == float_round_down ) )
1027 ) {
1028 return
1029 packFloat128(
1030 zSign,
1031 0x7FFE,
1032 LIT64( 0x0000FFFFFFFFFFFF ),
1033 LIT64( 0xFFFFFFFFFFFFFFFF )
1034 );
1035 }
1036 return packFloat128( zSign, 0x7FFF, 0, 0 );
1037 }
1038 if ( zExp < 0 ) {
e6afc87f
PM
1039 if (STATUS(flush_to_zero)) {
1040 float_raise(float_flag_output_denormal STATUS_VAR);
1041 return packFloat128(zSign, 0, 0, 0);
1042 }
158142c2
FB
1043 isTiny =
1044 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045 || ( zExp < -1 )
1046 || ! increment
1047 || lt128(
1048 zSig0,
1049 zSig1,
1050 LIT64( 0x0001FFFFFFFFFFFF ),
1051 LIT64( 0xFFFFFFFFFFFFFFFF )
1052 );
1053 shift128ExtraRightJamming(
1054 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055 zExp = 0;
1056 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057 if ( roundNearestEven ) {
bb98fe42 1058 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
1059 }
1060 else {
1061 if ( zSign ) {
1062 increment = ( roundingMode == float_round_down ) && zSig2;
1063 }
1064 else {
1065 increment = ( roundingMode == float_round_up ) && zSig2;
1066 }
1067 }
1068 }
1069 }
1070 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071 if ( increment ) {
1072 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074 }
1075 else {
1076 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077 }
1078 return packFloat128( zSign, zExp, zSig0, zSig1 );
1079
1080}
1081
1082/*----------------------------------------------------------------------------
1083| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084| and significand formed by the concatenation of `zSig0' and `zSig1', and
1085| returns the proper quadruple-precision floating-point value corresponding
1086| to the abstract input. This routine is just like `roundAndPackFloat128'
1087| except that the input significand has fewer bits and does not have to be
1088| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1089| point exponent.
1090*----------------------------------------------------------------------------*/
1091
1092static float128
1093 normalizeRoundAndPackFloat128(
bb98fe42 1094 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
158142c2
FB
1095{
1096 int8 shiftCount;
bb98fe42 1097 uint64_t zSig2;
158142c2
FB
1098
1099 if ( zSig0 == 0 ) {
1100 zSig0 = zSig1;
1101 zSig1 = 0;
1102 zExp -= 64;
1103 }
1104 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105 if ( 0 <= shiftCount ) {
1106 zSig2 = 0;
1107 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108 }
1109 else {
1110 shift128ExtraRightJamming(
1111 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112 }
1113 zExp -= shiftCount;
1114 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1115
1116}
1117
158142c2
FB
1118/*----------------------------------------------------------------------------
1119| Returns the result of converting the 32-bit two's complement integer `a'
1120| to the single-precision floating-point format. The conversion is performed
1121| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122*----------------------------------------------------------------------------*/
1123
1124float32 int32_to_float32( int32 a STATUS_PARAM )
1125{
1126 flag zSign;
1127
f090c9d4 1128 if ( a == 0 ) return float32_zero;
bb98fe42 1129 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2
FB
1130 zSign = ( a < 0 );
1131 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1132
1133}
1134
1135/*----------------------------------------------------------------------------
1136| Returns the result of converting the 32-bit two's complement integer `a'
1137| to the double-precision floating-point format. The conversion is performed
1138| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139*----------------------------------------------------------------------------*/
1140
1141float64 int32_to_float64( int32 a STATUS_PARAM )
1142{
1143 flag zSign;
1144 uint32 absA;
1145 int8 shiftCount;
bb98fe42 1146 uint64_t zSig;
158142c2 1147
f090c9d4 1148 if ( a == 0 ) return float64_zero;
158142c2
FB
1149 zSign = ( a < 0 );
1150 absA = zSign ? - a : a;
1151 shiftCount = countLeadingZeros32( absA ) + 21;
1152 zSig = absA;
1153 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1154
1155}
1156
158142c2
FB
1157/*----------------------------------------------------------------------------
1158| Returns the result of converting the 32-bit two's complement integer `a'
1159| to the extended double-precision floating-point format. The conversion
1160| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161| Arithmetic.
1162*----------------------------------------------------------------------------*/
1163
1164floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1165{
1166 flag zSign;
1167 uint32 absA;
1168 int8 shiftCount;
bb98fe42 1169 uint64_t zSig;
158142c2
FB
1170
1171 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172 zSign = ( a < 0 );
1173 absA = zSign ? - a : a;
1174 shiftCount = countLeadingZeros32( absA ) + 32;
1175 zSig = absA;
1176 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177
1178}
1179
158142c2
FB
1180/*----------------------------------------------------------------------------
1181| Returns the result of converting the 32-bit two's complement integer `a' to
1182| the quadruple-precision floating-point format. The conversion is performed
1183| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184*----------------------------------------------------------------------------*/
1185
1186float128 int32_to_float128( int32 a STATUS_PARAM )
1187{
1188 flag zSign;
1189 uint32 absA;
1190 int8 shiftCount;
bb98fe42 1191 uint64_t zSig0;
158142c2
FB
1192
1193 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194 zSign = ( a < 0 );
1195 absA = zSign ? - a : a;
1196 shiftCount = countLeadingZeros32( absA ) + 17;
1197 zSig0 = absA;
1198 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199
1200}
1201
158142c2
FB
1202/*----------------------------------------------------------------------------
1203| Returns the result of converting the 64-bit two's complement integer `a'
1204| to the single-precision floating-point format. The conversion is performed
1205| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206*----------------------------------------------------------------------------*/
1207
1208float32 int64_to_float32( int64 a STATUS_PARAM )
1209{
1210 flag zSign;
1211 uint64 absA;
1212 int8 shiftCount;
1213
f090c9d4 1214 if ( a == 0 ) return float32_zero;
158142c2
FB
1215 zSign = ( a < 0 );
1216 absA = zSign ? - a : a;
1217 shiftCount = countLeadingZeros64( absA ) - 40;
1218 if ( 0 <= shiftCount ) {
1219 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1220 }
1221 else {
1222 shiftCount += 7;
1223 if ( shiftCount < 0 ) {
1224 shift64RightJamming( absA, - shiftCount, &absA );
1225 }
1226 else {
1227 absA <<= shiftCount;
1228 }
1229 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230 }
1231
1232}
1233
3430b0be 1234float32 uint64_to_float32( uint64 a STATUS_PARAM )
75d62a58
JM
1235{
1236 int8 shiftCount;
1237
f090c9d4 1238 if ( a == 0 ) return float32_zero;
75d62a58
JM
1239 shiftCount = countLeadingZeros64( a ) - 40;
1240 if ( 0 <= shiftCount ) {
e744c06f 1241 return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
75d62a58
JM
1242 }
1243 else {
1244 shiftCount += 7;
1245 if ( shiftCount < 0 ) {
1246 shift64RightJamming( a, - shiftCount, &a );
1247 }
1248 else {
1249 a <<= shiftCount;
1250 }
e744c06f 1251 return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
75d62a58
JM
1252 }
1253}
1254
158142c2
FB
1255/*----------------------------------------------------------------------------
1256| Returns the result of converting the 64-bit two's complement integer `a'
1257| to the double-precision floating-point format. The conversion is performed
1258| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259*----------------------------------------------------------------------------*/
1260
1261float64 int64_to_float64( int64 a STATUS_PARAM )
1262{
1263 flag zSign;
1264
f090c9d4 1265 if ( a == 0 ) return float64_zero;
bb98fe42 1266 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1267 return packFloat64( 1, 0x43E, 0 );
1268 }
1269 zSign = ( a < 0 );
1270 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1271
1272}
1273
17ed2293 1274float64 uint64_to_float64(uint64 a STATUS_PARAM)
75d62a58 1275{
17ed2293 1276 int exp = 0x43C;
75d62a58 1277
17ed2293
RH
1278 if (a == 0) {
1279 return float64_zero;
1280 }
1281 if ((int64_t)a < 0) {
1282 shift64RightJamming(a, 1, &a);
1283 exp += 1;
1284 }
1285 return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
75d62a58
JM
1286}
1287
158142c2
FB
1288/*----------------------------------------------------------------------------
1289| Returns the result of converting the 64-bit two's complement integer `a'
1290| to the extended double-precision floating-point format. The conversion
1291| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1292| Arithmetic.
1293*----------------------------------------------------------------------------*/
1294
1295floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1296{
1297 flag zSign;
1298 uint64 absA;
1299 int8 shiftCount;
1300
1301 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1302 zSign = ( a < 0 );
1303 absA = zSign ? - a : a;
1304 shiftCount = countLeadingZeros64( absA );
1305 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1306
1307}
1308
158142c2
FB
1309/*----------------------------------------------------------------------------
1310| Returns the result of converting the 64-bit two's complement integer `a' to
1311| the quadruple-precision floating-point format. The conversion is performed
1312| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313*----------------------------------------------------------------------------*/
1314
1315float128 int64_to_float128( int64 a STATUS_PARAM )
1316{
1317 flag zSign;
1318 uint64 absA;
1319 int8 shiftCount;
1320 int32 zExp;
bb98fe42 1321 uint64_t zSig0, zSig1;
158142c2
FB
1322
1323 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324 zSign = ( a < 0 );
1325 absA = zSign ? - a : a;
1326 shiftCount = countLeadingZeros64( absA ) + 49;
1327 zExp = 0x406E - shiftCount;
1328 if ( 64 <= shiftCount ) {
1329 zSig1 = 0;
1330 zSig0 = absA;
1331 shiftCount -= 64;
1332 }
1333 else {
1334 zSig1 = absA;
1335 zSig0 = 0;
1336 }
1337 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338 return packFloat128( zSign, zExp, zSig0, zSig1 );
1339
1340}
1341
1e397ead
RH
1342float128 uint64_to_float128(uint64 a STATUS_PARAM)
1343{
1344 if (a == 0) {
1345 return float128_zero;
1346 }
1347 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1348}
1349
158142c2
FB
1350/*----------------------------------------------------------------------------
1351| Returns the result of converting the single-precision floating-point value
1352| `a' to the 32-bit two's complement integer format. The conversion is
1353| performed according to the IEC/IEEE Standard for Binary Floating-Point
1354| Arithmetic---which means in particular that the conversion is rounded
1355| according to the current rounding mode. If `a' is a NaN, the largest
1356| positive integer is returned. Otherwise, if the conversion overflows, the
1357| largest integer with the same sign as `a' is returned.
1358*----------------------------------------------------------------------------*/
1359
1360int32 float32_to_int32( float32 a STATUS_PARAM )
1361{
1362 flag aSign;
94a49d86 1363 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1364 uint32_t aSig;
1365 uint64_t aSig64;
158142c2 1366
37d18660 1367 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1368 aSig = extractFloat32Frac( a );
1369 aExp = extractFloat32Exp( a );
1370 aSign = extractFloat32Sign( a );
1371 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1372 if ( aExp ) aSig |= 0x00800000;
1373 shiftCount = 0xAF - aExp;
1374 aSig64 = aSig;
1375 aSig64 <<= 32;
1376 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1377 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1378
1379}
1380
1381/*----------------------------------------------------------------------------
1382| Returns the result of converting the single-precision floating-point value
1383| `a' to the 32-bit two's complement integer format. The conversion is
1384| performed according to the IEC/IEEE Standard for Binary Floating-Point
1385| Arithmetic, except that the conversion is always rounded toward zero.
1386| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1387| the conversion overflows, the largest integer with the same sign as `a' is
1388| returned.
1389*----------------------------------------------------------------------------*/
1390
1391int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1392{
1393 flag aSign;
94a49d86 1394 int_fast16_t aExp, shiftCount;
bb98fe42 1395 uint32_t aSig;
b3a6a2e0 1396 int32_t z;
37d18660 1397 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1398
1399 aSig = extractFloat32Frac( a );
1400 aExp = extractFloat32Exp( a );
1401 aSign = extractFloat32Sign( a );
1402 shiftCount = aExp - 0x9E;
1403 if ( 0 <= shiftCount ) {
f090c9d4 1404 if ( float32_val(a) != 0xCF000000 ) {
158142c2
FB
1405 float_raise( float_flag_invalid STATUS_VAR);
1406 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1407 }
bb98fe42 1408 return (int32_t) 0x80000000;
158142c2
FB
1409 }
1410 else if ( aExp <= 0x7E ) {
1411 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1412 return 0;
1413 }
1414 aSig = ( aSig | 0x00800000 )<<8;
1415 z = aSig>>( - shiftCount );
bb98fe42 1416 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
158142c2
FB
1417 STATUS(float_exception_flags) |= float_flag_inexact;
1418 }
1419 if ( aSign ) z = - z;
1420 return z;
1421
1422}
1423
cbcef455
PM
1424/*----------------------------------------------------------------------------
1425| Returns the result of converting the single-precision floating-point value
1426| `a' to the 16-bit two's complement integer format. The conversion is
1427| performed according to the IEC/IEEE Standard for Binary Floating-Point
1428| Arithmetic, except that the conversion is always rounded toward zero.
1429| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1430| the conversion overflows, the largest integer with the same sign as `a' is
1431| returned.
1432*----------------------------------------------------------------------------*/
1433
94a49d86 1434int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
1435{
1436 flag aSign;
94a49d86 1437 int_fast16_t aExp, shiftCount;
bb98fe42 1438 uint32_t aSig;
cbcef455
PM
1439 int32 z;
1440
1441 aSig = extractFloat32Frac( a );
1442 aExp = extractFloat32Exp( a );
1443 aSign = extractFloat32Sign( a );
1444 shiftCount = aExp - 0x8E;
1445 if ( 0 <= shiftCount ) {
1446 if ( float32_val(a) != 0xC7000000 ) {
1447 float_raise( float_flag_invalid STATUS_VAR);
1448 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1449 return 0x7FFF;
1450 }
1451 }
bb98fe42 1452 return (int32_t) 0xffff8000;
cbcef455
PM
1453 }
1454 else if ( aExp <= 0x7E ) {
1455 if ( aExp | aSig ) {
1456 STATUS(float_exception_flags) |= float_flag_inexact;
1457 }
1458 return 0;
1459 }
1460 shiftCount -= 0x10;
1461 aSig = ( aSig | 0x00800000 )<<8;
1462 z = aSig>>( - shiftCount );
bb98fe42 1463 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
cbcef455
PM
1464 STATUS(float_exception_flags) |= float_flag_inexact;
1465 }
1466 if ( aSign ) {
1467 z = - z;
1468 }
1469 return z;
1470
1471}
1472
158142c2
FB
1473/*----------------------------------------------------------------------------
1474| Returns the result of converting the single-precision floating-point value
1475| `a' to the 64-bit two's complement integer format. The conversion is
1476| performed according to the IEC/IEEE Standard for Binary Floating-Point
1477| Arithmetic---which means in particular that the conversion is rounded
1478| according to the current rounding mode. If `a' is a NaN, the largest
1479| positive integer is returned. Otherwise, if the conversion overflows, the
1480| largest integer with the same sign as `a' is returned.
1481*----------------------------------------------------------------------------*/
1482
1483int64 float32_to_int64( float32 a STATUS_PARAM )
1484{
1485 flag aSign;
94a49d86 1486 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1487 uint32_t aSig;
1488 uint64_t aSig64, aSigExtra;
37d18660 1489 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1490
1491 aSig = extractFloat32Frac( a );
1492 aExp = extractFloat32Exp( a );
1493 aSign = extractFloat32Sign( a );
1494 shiftCount = 0xBE - aExp;
1495 if ( shiftCount < 0 ) {
1496 float_raise( float_flag_invalid STATUS_VAR);
1497 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1498 return LIT64( 0x7FFFFFFFFFFFFFFF );
1499 }
bb98fe42 1500 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1501 }
1502 if ( aExp ) aSig |= 0x00800000;
1503 aSig64 = aSig;
1504 aSig64 <<= 40;
1505 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1506 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1507
1508}
1509
1510/*----------------------------------------------------------------------------
1511| Returns the result of converting the single-precision floating-point value
1512| `a' to the 64-bit two's complement integer format. The conversion is
1513| performed according to the IEC/IEEE Standard for Binary Floating-Point
1514| Arithmetic, except that the conversion is always rounded toward zero. If
1515| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1516| conversion overflows, the largest integer with the same sign as `a' is
1517| returned.
1518*----------------------------------------------------------------------------*/
1519
1520int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1521{
1522 flag aSign;
94a49d86 1523 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1524 uint32_t aSig;
1525 uint64_t aSig64;
158142c2 1526 int64 z;
37d18660 1527 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1528
1529 aSig = extractFloat32Frac( a );
1530 aExp = extractFloat32Exp( a );
1531 aSign = extractFloat32Sign( a );
1532 shiftCount = aExp - 0xBE;
1533 if ( 0 <= shiftCount ) {
f090c9d4 1534 if ( float32_val(a) != 0xDF000000 ) {
158142c2
FB
1535 float_raise( float_flag_invalid STATUS_VAR);
1536 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1537 return LIT64( 0x7FFFFFFFFFFFFFFF );
1538 }
1539 }
bb98fe42 1540 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1541 }
1542 else if ( aExp <= 0x7E ) {
1543 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1544 return 0;
1545 }
1546 aSig64 = aSig | 0x00800000;
1547 aSig64 <<= 40;
1548 z = aSig64>>( - shiftCount );
bb98fe42 1549 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
158142c2
FB
1550 STATUS(float_exception_flags) |= float_flag_inexact;
1551 }
1552 if ( aSign ) z = - z;
1553 return z;
1554
1555}
1556
1557/*----------------------------------------------------------------------------
1558| Returns the result of converting the single-precision floating-point value
1559| `a' to the double-precision floating-point format. The conversion is
1560| performed according to the IEC/IEEE Standard for Binary Floating-Point
1561| Arithmetic.
1562*----------------------------------------------------------------------------*/
1563
1564float64 float32_to_float64( float32 a STATUS_PARAM )
1565{
1566 flag aSign;
94a49d86 1567 int_fast16_t aExp;
bb98fe42 1568 uint32_t aSig;
37d18660 1569 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1570
1571 aSig = extractFloat32Frac( a );
1572 aExp = extractFloat32Exp( a );
1573 aSign = extractFloat32Sign( a );
1574 if ( aExp == 0xFF ) {
bcd4d9af 1575 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1576 return packFloat64( aSign, 0x7FF, 0 );
1577 }
1578 if ( aExp == 0 ) {
1579 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1580 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1581 --aExp;
1582 }
bb98fe42 1583 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1584
1585}
1586
158142c2
FB
1587/*----------------------------------------------------------------------------
1588| Returns the result of converting the single-precision floating-point value
1589| `a' to the extended double-precision floating-point format. The conversion
1590| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1591| Arithmetic.
1592*----------------------------------------------------------------------------*/
1593
1594floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1595{
1596 flag aSign;
94a49d86 1597 int_fast16_t aExp;
bb98fe42 1598 uint32_t aSig;
158142c2 1599
37d18660 1600 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1601 aSig = extractFloat32Frac( a );
1602 aExp = extractFloat32Exp( a );
1603 aSign = extractFloat32Sign( a );
1604 if ( aExp == 0xFF ) {
bcd4d9af 1605 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1606 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1607 }
1608 if ( aExp == 0 ) {
1609 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1610 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1611 }
1612 aSig |= 0x00800000;
bb98fe42 1613 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1614
1615}
1616
158142c2
FB
1617/*----------------------------------------------------------------------------
1618| Returns the result of converting the single-precision floating-point value
1619| `a' to the quadruple-precision floating-point format. The conversion is
1620| performed according to the IEC/IEEE Standard for Binary Floating-Point
1621| Arithmetic.
1622*----------------------------------------------------------------------------*/
1623
1624float128 float32_to_float128( float32 a STATUS_PARAM )
1625{
1626 flag aSign;
94a49d86 1627 int_fast16_t aExp;
bb98fe42 1628 uint32_t aSig;
158142c2 1629
37d18660 1630 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1631 aSig = extractFloat32Frac( a );
1632 aExp = extractFloat32Exp( a );
1633 aSign = extractFloat32Sign( a );
1634 if ( aExp == 0xFF ) {
bcd4d9af 1635 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1636 return packFloat128( aSign, 0x7FFF, 0, 0 );
1637 }
1638 if ( aExp == 0 ) {
1639 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641 --aExp;
1642 }
bb98fe42 1643 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1644
1645}
1646
158142c2
FB
1647/*----------------------------------------------------------------------------
1648| Rounds the single-precision floating-point value `a' to an integer, and
1649| returns the result as a single-precision floating-point value. The
1650| operation is performed according to the IEC/IEEE Standard for Binary
1651| Floating-Point Arithmetic.
1652*----------------------------------------------------------------------------*/
1653
1654float32 float32_round_to_int( float32 a STATUS_PARAM)
1655{
1656 flag aSign;
94a49d86 1657 int_fast16_t aExp;
bb98fe42 1658 uint32_t lastBitMask, roundBitsMask;
158142c2 1659 int8 roundingMode;
bb98fe42 1660 uint32_t z;
37d18660 1661 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1662
1663 aExp = extractFloat32Exp( a );
1664 if ( 0x96 <= aExp ) {
1665 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1666 return propagateFloat32NaN( a, a STATUS_VAR );
1667 }
1668 return a;
1669 }
1670 if ( aExp <= 0x7E ) {
bb98fe42 1671 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
158142c2
FB
1672 STATUS(float_exception_flags) |= float_flag_inexact;
1673 aSign = extractFloat32Sign( a );
1674 switch ( STATUS(float_rounding_mode) ) {
1675 case float_round_nearest_even:
1676 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1677 return packFloat32( aSign, 0x7F, 0 );
1678 }
1679 break;
1680 case float_round_down:
f090c9d4 1681 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1682 case float_round_up:
f090c9d4 1683 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1684 }
1685 return packFloat32( aSign, 0, 0 );
1686 }
1687 lastBitMask = 1;
1688 lastBitMask <<= 0x96 - aExp;
1689 roundBitsMask = lastBitMask - 1;
f090c9d4 1690 z = float32_val(a);
158142c2
FB
1691 roundingMode = STATUS(float_rounding_mode);
1692 if ( roundingMode == float_round_nearest_even ) {
1693 z += lastBitMask>>1;
1694 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1695 }
1696 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 1697 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
1698 z += roundBitsMask;
1699 }
1700 }
1701 z &= ~ roundBitsMask;
f090c9d4
PB
1702 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1703 return make_float32(z);
158142c2
FB
1704
1705}
1706
1707/*----------------------------------------------------------------------------
1708| Returns the result of adding the absolute values of the single-precision
1709| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1710| before being returned. `zSign' is ignored if the result is a NaN.
1711| The addition is performed according to the IEC/IEEE Standard for Binary
1712| Floating-Point Arithmetic.
1713*----------------------------------------------------------------------------*/
1714
1715static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1716{
94a49d86 1717 int_fast16_t aExp, bExp, zExp;
bb98fe42 1718 uint32_t aSig, bSig, zSig;
94a49d86 1719 int_fast16_t expDiff;
158142c2
FB
1720
1721 aSig = extractFloat32Frac( a );
1722 aExp = extractFloat32Exp( a );
1723 bSig = extractFloat32Frac( b );
1724 bExp = extractFloat32Exp( b );
1725 expDiff = aExp - bExp;
1726 aSig <<= 6;
1727 bSig <<= 6;
1728 if ( 0 < expDiff ) {
1729 if ( aExp == 0xFF ) {
1730 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1731 return a;
1732 }
1733 if ( bExp == 0 ) {
1734 --expDiff;
1735 }
1736 else {
1737 bSig |= 0x20000000;
1738 }
1739 shift32RightJamming( bSig, expDiff, &bSig );
1740 zExp = aExp;
1741 }
1742 else if ( expDiff < 0 ) {
1743 if ( bExp == 0xFF ) {
1744 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1745 return packFloat32( zSign, 0xFF, 0 );
1746 }
1747 if ( aExp == 0 ) {
1748 ++expDiff;
1749 }
1750 else {
1751 aSig |= 0x20000000;
1752 }
1753 shift32RightJamming( aSig, - expDiff, &aSig );
1754 zExp = bExp;
1755 }
1756 else {
1757 if ( aExp == 0xFF ) {
1758 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1759 return a;
1760 }
fe76d976 1761 if ( aExp == 0 ) {
e6afc87f
PM
1762 if (STATUS(flush_to_zero)) {
1763 if (aSig | bSig) {
1764 float_raise(float_flag_output_denormal STATUS_VAR);
1765 }
1766 return packFloat32(zSign, 0, 0);
1767 }
fe76d976
PB
1768 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1769 }
158142c2
FB
1770 zSig = 0x40000000 + aSig + bSig;
1771 zExp = aExp;
1772 goto roundAndPack;
1773 }
1774 aSig |= 0x20000000;
1775 zSig = ( aSig + bSig )<<1;
1776 --zExp;
bb98fe42 1777 if ( (int32_t) zSig < 0 ) {
158142c2
FB
1778 zSig = aSig + bSig;
1779 ++zExp;
1780 }
1781 roundAndPack:
1782 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1783
1784}
1785
1786/*----------------------------------------------------------------------------
1787| Returns the result of subtracting the absolute values of the single-
1788| precision floating-point values `a' and `b'. If `zSign' is 1, the
1789| difference is negated before being returned. `zSign' is ignored if the
1790| result is a NaN. The subtraction is performed according to the IEC/IEEE
1791| Standard for Binary Floating-Point Arithmetic.
1792*----------------------------------------------------------------------------*/
1793
1794static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1795{
94a49d86 1796 int_fast16_t aExp, bExp, zExp;
bb98fe42 1797 uint32_t aSig, bSig, zSig;
94a49d86 1798 int_fast16_t expDiff;
158142c2
FB
1799
1800 aSig = extractFloat32Frac( a );
1801 aExp = extractFloat32Exp( a );
1802 bSig = extractFloat32Frac( b );
1803 bExp = extractFloat32Exp( b );
1804 expDiff = aExp - bExp;
1805 aSig <<= 7;
1806 bSig <<= 7;
1807 if ( 0 < expDiff ) goto aExpBigger;
1808 if ( expDiff < 0 ) goto bExpBigger;
1809 if ( aExp == 0xFF ) {
1810 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1811 float_raise( float_flag_invalid STATUS_VAR);
1812 return float32_default_nan;
1813 }
1814 if ( aExp == 0 ) {
1815 aExp = 1;
1816 bExp = 1;
1817 }
1818 if ( bSig < aSig ) goto aBigger;
1819 if ( aSig < bSig ) goto bBigger;
1820 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1821 bExpBigger:
1822 if ( bExp == 0xFF ) {
1823 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1824 return packFloat32( zSign ^ 1, 0xFF, 0 );
1825 }
1826 if ( aExp == 0 ) {
1827 ++expDiff;
1828 }
1829 else {
1830 aSig |= 0x40000000;
1831 }
1832 shift32RightJamming( aSig, - expDiff, &aSig );
1833 bSig |= 0x40000000;
1834 bBigger:
1835 zSig = bSig - aSig;
1836 zExp = bExp;
1837 zSign ^= 1;
1838 goto normalizeRoundAndPack;
1839 aExpBigger:
1840 if ( aExp == 0xFF ) {
1841 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1842 return a;
1843 }
1844 if ( bExp == 0 ) {
1845 --expDiff;
1846 }
1847 else {
1848 bSig |= 0x40000000;
1849 }
1850 shift32RightJamming( bSig, expDiff, &bSig );
1851 aSig |= 0x40000000;
1852 aBigger:
1853 zSig = aSig - bSig;
1854 zExp = aExp;
1855 normalizeRoundAndPack:
1856 --zExp;
1857 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1858
1859}
1860
1861/*----------------------------------------------------------------------------
1862| Returns the result of adding the single-precision floating-point values `a'
1863| and `b'. The operation is performed according to the IEC/IEEE Standard for
1864| Binary Floating-Point Arithmetic.
1865*----------------------------------------------------------------------------*/
1866
1867float32 float32_add( float32 a, float32 b STATUS_PARAM )
1868{
1869 flag aSign, bSign;
37d18660
PM
1870 a = float32_squash_input_denormal(a STATUS_VAR);
1871 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1872
1873 aSign = extractFloat32Sign( a );
1874 bSign = extractFloat32Sign( b );
1875 if ( aSign == bSign ) {
1876 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1877 }
1878 else {
1879 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1880 }
1881
1882}
1883
1884/*----------------------------------------------------------------------------
1885| Returns the result of subtracting the single-precision floating-point values
1886| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1887| for Binary Floating-Point Arithmetic.
1888*----------------------------------------------------------------------------*/
1889
1890float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1891{
1892 flag aSign, bSign;
37d18660
PM
1893 a = float32_squash_input_denormal(a STATUS_VAR);
1894 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1895
1896 aSign = extractFloat32Sign( a );
1897 bSign = extractFloat32Sign( b );
1898 if ( aSign == bSign ) {
1899 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1900 }
1901 else {
1902 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1903 }
1904
1905}
1906
1907/*----------------------------------------------------------------------------
1908| Returns the result of multiplying the single-precision floating-point values
1909| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1910| for Binary Floating-Point Arithmetic.
1911*----------------------------------------------------------------------------*/
1912
1913float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1914{
1915 flag aSign, bSign, zSign;
94a49d86 1916 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
1917 uint32_t aSig, bSig;
1918 uint64_t zSig64;
1919 uint32_t zSig;
158142c2 1920
37d18660
PM
1921 a = float32_squash_input_denormal(a STATUS_VAR);
1922 b = float32_squash_input_denormal(b STATUS_VAR);
1923
158142c2
FB
1924 aSig = extractFloat32Frac( a );
1925 aExp = extractFloat32Exp( a );
1926 aSign = extractFloat32Sign( a );
1927 bSig = extractFloat32Frac( b );
1928 bExp = extractFloat32Exp( b );
1929 bSign = extractFloat32Sign( b );
1930 zSign = aSign ^ bSign;
1931 if ( aExp == 0xFF ) {
1932 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1933 return propagateFloat32NaN( a, b STATUS_VAR );
1934 }
1935 if ( ( bExp | bSig ) == 0 ) {
1936 float_raise( float_flag_invalid STATUS_VAR);
1937 return float32_default_nan;
1938 }
1939 return packFloat32( zSign, 0xFF, 0 );
1940 }
1941 if ( bExp == 0xFF ) {
1942 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1943 if ( ( aExp | aSig ) == 0 ) {
1944 float_raise( float_flag_invalid STATUS_VAR);
1945 return float32_default_nan;
1946 }
1947 return packFloat32( zSign, 0xFF, 0 );
1948 }
1949 if ( aExp == 0 ) {
1950 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1951 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1952 }
1953 if ( bExp == 0 ) {
1954 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1955 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1956 }
1957 zExp = aExp + bExp - 0x7F;
1958 aSig = ( aSig | 0x00800000 )<<7;
1959 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 1960 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 1961 zSig = zSig64;
bb98fe42 1962 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
1963 zSig <<= 1;
1964 --zExp;
1965 }
1966 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1967
1968}
1969
1970/*----------------------------------------------------------------------------
1971| Returns the result of dividing the single-precision floating-point value `a'
1972| by the corresponding value `b'. The operation is performed according to the
1973| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1974*----------------------------------------------------------------------------*/
1975
1976float32 float32_div( float32 a, float32 b STATUS_PARAM )
1977{
1978 flag aSign, bSign, zSign;
94a49d86 1979 int_fast16_t aExp, bExp, zExp;
bb98fe42 1980 uint32_t aSig, bSig, zSig;
37d18660
PM
1981 a = float32_squash_input_denormal(a STATUS_VAR);
1982 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1983
1984 aSig = extractFloat32Frac( a );
1985 aExp = extractFloat32Exp( a );
1986 aSign = extractFloat32Sign( a );
1987 bSig = extractFloat32Frac( b );
1988 bExp = extractFloat32Exp( b );
1989 bSign = extractFloat32Sign( b );
1990 zSign = aSign ^ bSign;
1991 if ( aExp == 0xFF ) {
1992 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1993 if ( bExp == 0xFF ) {
1994 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1995 float_raise( float_flag_invalid STATUS_VAR);
1996 return float32_default_nan;
1997 }
1998 return packFloat32( zSign, 0xFF, 0 );
1999 }
2000 if ( bExp == 0xFF ) {
2001 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2002 return packFloat32( zSign, 0, 0 );
2003 }
2004 if ( bExp == 0 ) {
2005 if ( bSig == 0 ) {
2006 if ( ( aExp | aSig ) == 0 ) {
2007 float_raise( float_flag_invalid STATUS_VAR);
2008 return float32_default_nan;
2009 }
2010 float_raise( float_flag_divbyzero STATUS_VAR);
2011 return packFloat32( zSign, 0xFF, 0 );
2012 }
2013 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2014 }
2015 if ( aExp == 0 ) {
2016 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2017 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2018 }
2019 zExp = aExp - bExp + 0x7D;
2020 aSig = ( aSig | 0x00800000 )<<7;
2021 bSig = ( bSig | 0x00800000 )<<8;
2022 if ( bSig <= ( aSig + aSig ) ) {
2023 aSig >>= 1;
2024 ++zExp;
2025 }
bb98fe42 2026 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2027 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2028 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2
FB
2029 }
2030 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2031
2032}
2033
2034/*----------------------------------------------------------------------------
2035| Returns the remainder of the single-precision floating-point value `a'
2036| with respect to the corresponding value `b'. The operation is performed
2037| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2038*----------------------------------------------------------------------------*/
2039
2040float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2041{
ed086f3d 2042 flag aSign, zSign;
94a49d86 2043 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
2044 uint32_t aSig, bSig;
2045 uint32_t q;
2046 uint64_t aSig64, bSig64, q64;
2047 uint32_t alternateASig;
2048 int32_t sigMean;
37d18660
PM
2049 a = float32_squash_input_denormal(a STATUS_VAR);
2050 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2051
2052 aSig = extractFloat32Frac( a );
2053 aExp = extractFloat32Exp( a );
2054 aSign = extractFloat32Sign( a );
2055 bSig = extractFloat32Frac( b );
2056 bExp = extractFloat32Exp( b );
158142c2
FB
2057 if ( aExp == 0xFF ) {
2058 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2059 return propagateFloat32NaN( a, b STATUS_VAR );
2060 }
2061 float_raise( float_flag_invalid STATUS_VAR);
2062 return float32_default_nan;
2063 }
2064 if ( bExp == 0xFF ) {
2065 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2066 return a;
2067 }
2068 if ( bExp == 0 ) {
2069 if ( bSig == 0 ) {
2070 float_raise( float_flag_invalid STATUS_VAR);
2071 return float32_default_nan;
2072 }
2073 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2074 }
2075 if ( aExp == 0 ) {
2076 if ( aSig == 0 ) return a;
2077 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2078 }
2079 expDiff = aExp - bExp;
2080 aSig |= 0x00800000;
2081 bSig |= 0x00800000;
2082 if ( expDiff < 32 ) {
2083 aSig <<= 8;
2084 bSig <<= 8;
2085 if ( expDiff < 0 ) {
2086 if ( expDiff < -1 ) return a;
2087 aSig >>= 1;
2088 }
2089 q = ( bSig <= aSig );
2090 if ( q ) aSig -= bSig;
2091 if ( 0 < expDiff ) {
bb98fe42 2092 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2093 q >>= 32 - expDiff;
2094 bSig >>= 2;
2095 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2096 }
2097 else {
2098 aSig >>= 2;
2099 bSig >>= 2;
2100 }
2101 }
2102 else {
2103 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2104 aSig64 = ( (uint64_t) aSig )<<40;
2105 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2106 expDiff -= 64;
2107 while ( 0 < expDiff ) {
2108 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2109 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2110 aSig64 = - ( ( bSig * q64 )<<38 );
2111 expDiff -= 62;
2112 }
2113 expDiff += 64;
2114 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2115 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2116 q = q64>>( 64 - expDiff );
2117 bSig <<= 6;
2118 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2119 }
2120 do {
2121 alternateASig = aSig;
2122 ++q;
2123 aSig -= bSig;
bb98fe42 2124 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2125 sigMean = aSig + alternateASig;
2126 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2127 aSig = alternateASig;
2128 }
bb98fe42 2129 zSign = ( (int32_t) aSig < 0 );
158142c2
FB
2130 if ( zSign ) aSig = - aSig;
2131 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2132
2133}
2134
369be8f6
PM
2135/*----------------------------------------------------------------------------
2136| Returns the result of multiplying the single-precision floating-point values
2137| `a' and `b' then adding 'c', with no intermediate rounding step after the
2138| multiplication. The operation is performed according to the IEC/IEEE
2139| Standard for Binary Floating-Point Arithmetic 754-2008.
2140| The flags argument allows the caller to select negation of the
2141| addend, the intermediate product, or the final result. (The difference
2142| between this and having the caller do a separate negation is that negating
2143| externally will flip the sign bit on NaNs.)
2144*----------------------------------------------------------------------------*/
2145
2146float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2147{
2148 flag aSign, bSign, cSign, zSign;
94a49d86 2149 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2150 uint32_t aSig, bSig, cSig;
2151 flag pInf, pZero, pSign;
2152 uint64_t pSig64, cSig64, zSig64;
2153 uint32_t pSig;
2154 int shiftcount;
2155 flag signflip, infzero;
2156
2157 a = float32_squash_input_denormal(a STATUS_VAR);
2158 b = float32_squash_input_denormal(b STATUS_VAR);
2159 c = float32_squash_input_denormal(c STATUS_VAR);
2160 aSig = extractFloat32Frac(a);
2161 aExp = extractFloat32Exp(a);
2162 aSign = extractFloat32Sign(a);
2163 bSig = extractFloat32Frac(b);
2164 bExp = extractFloat32Exp(b);
2165 bSign = extractFloat32Sign(b);
2166 cSig = extractFloat32Frac(c);
2167 cExp = extractFloat32Exp(c);
2168 cSign = extractFloat32Sign(c);
2169
2170 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2171 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2172
2173 /* It is implementation-defined whether the cases of (0,inf,qnan)
2174 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2175 * they return if they do), so we have to hand this information
2176 * off to the target-specific pick-a-NaN routine.
2177 */
2178 if (((aExp == 0xff) && aSig) ||
2179 ((bExp == 0xff) && bSig) ||
2180 ((cExp == 0xff) && cSig)) {
2181 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2182 }
2183
2184 if (infzero) {
2185 float_raise(float_flag_invalid STATUS_VAR);
2186 return float32_default_nan;
2187 }
2188
2189 if (flags & float_muladd_negate_c) {
2190 cSign ^= 1;
2191 }
2192
2193 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2194
2195 /* Work out the sign and type of the product */
2196 pSign = aSign ^ bSign;
2197 if (flags & float_muladd_negate_product) {
2198 pSign ^= 1;
2199 }
2200 pInf = (aExp == 0xff) || (bExp == 0xff);
2201 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2202
2203 if (cExp == 0xff) {
2204 if (pInf && (pSign ^ cSign)) {
2205 /* addition of opposite-signed infinities => InvalidOperation */
2206 float_raise(float_flag_invalid STATUS_VAR);
2207 return float32_default_nan;
2208 }
2209 /* Otherwise generate an infinity of the same sign */
2210 return packFloat32(cSign ^ signflip, 0xff, 0);
2211 }
2212
2213 if (pInf) {
2214 return packFloat32(pSign ^ signflip, 0xff, 0);
2215 }
2216
2217 if (pZero) {
2218 if (cExp == 0) {
2219 if (cSig == 0) {
2220 /* Adding two exact zeroes */
2221 if (pSign == cSign) {
2222 zSign = pSign;
2223 } else if (STATUS(float_rounding_mode) == float_round_down) {
2224 zSign = 1;
2225 } else {
2226 zSign = 0;
2227 }
2228 return packFloat32(zSign ^ signflip, 0, 0);
2229 }
2230 /* Exact zero plus a denorm */
2231 if (STATUS(flush_to_zero)) {
2232 float_raise(float_flag_output_denormal STATUS_VAR);
2233 return packFloat32(cSign ^ signflip, 0, 0);
2234 }
2235 }
2236 /* Zero plus something non-zero : just return the something */
a6e7c184 2237 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2238 }
2239
2240 if (aExp == 0) {
2241 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2242 }
2243 if (bExp == 0) {
2244 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2245 }
2246
2247 /* Calculate the actual result a * b + c */
2248
2249 /* Multiply first; this is easy. */
2250 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2251 * because we want the true exponent, not the "one-less-than"
2252 * flavour that roundAndPackFloat32() takes.
2253 */
2254 pExp = aExp + bExp - 0x7e;
2255 aSig = (aSig | 0x00800000) << 7;
2256 bSig = (bSig | 0x00800000) << 8;
2257 pSig64 = (uint64_t)aSig * bSig;
2258 if ((int64_t)(pSig64 << 1) >= 0) {
2259 pSig64 <<= 1;
2260 pExp--;
2261 }
2262
2263 zSign = pSign ^ signflip;
2264
2265 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2266 * position 62.
2267 */
2268 if (cExp == 0) {
2269 if (!cSig) {
2270 /* Throw out the special case of c being an exact zero now */
2271 shift64RightJamming(pSig64, 32, &pSig64);
2272 pSig = pSig64;
2273 return roundAndPackFloat32(zSign, pExp - 1,
2274 pSig STATUS_VAR);
2275 }
2276 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2277 }
2278
2279 cSig64 = (uint64_t)cSig << (62 - 23);
2280 cSig64 |= LIT64(0x4000000000000000);
2281 expDiff = pExp - cExp;
2282
2283 if (pSign == cSign) {
2284 /* Addition */
2285 if (expDiff > 0) {
2286 /* scale c to match p */
2287 shift64RightJamming(cSig64, expDiff, &cSig64);
2288 zExp = pExp;
2289 } else if (expDiff < 0) {
2290 /* scale p to match c */
2291 shift64RightJamming(pSig64, -expDiff, &pSig64);
2292 zExp = cExp;
2293 } else {
2294 /* no scaling needed */
2295 zExp = cExp;
2296 }
2297 /* Add significands and make sure explicit bit ends up in posn 62 */
2298 zSig64 = pSig64 + cSig64;
2299 if ((int64_t)zSig64 < 0) {
2300 shift64RightJamming(zSig64, 1, &zSig64);
2301 } else {
2302 zExp--;
2303 }
2304 } else {
2305 /* Subtraction */
2306 if (expDiff > 0) {
2307 shift64RightJamming(cSig64, expDiff, &cSig64);
2308 zSig64 = pSig64 - cSig64;
2309 zExp = pExp;
2310 } else if (expDiff < 0) {
2311 shift64RightJamming(pSig64, -expDiff, &pSig64);
2312 zSig64 = cSig64 - pSig64;
2313 zExp = cExp;
2314 zSign ^= 1;
2315 } else {
2316 zExp = pExp;
2317 if (cSig64 < pSig64) {
2318 zSig64 = pSig64 - cSig64;
2319 } else if (pSig64 < cSig64) {
2320 zSig64 = cSig64 - pSig64;
2321 zSign ^= 1;
2322 } else {
2323 /* Exact zero */
2324 zSign = signflip;
2325 if (STATUS(float_rounding_mode) == float_round_down) {
2326 zSign ^= 1;
2327 }
2328 return packFloat32(zSign, 0, 0);
2329 }
2330 }
2331 --zExp;
2332 /* Normalize to put the explicit bit back into bit 62. */
2333 shiftcount = countLeadingZeros64(zSig64) - 1;
2334 zSig64 <<= shiftcount;
2335 zExp -= shiftcount;
2336 }
2337 shift64RightJamming(zSig64, 32, &zSig64);
2338 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2339}
2340
2341
158142c2
FB
2342/*----------------------------------------------------------------------------
2343| Returns the square root of the single-precision floating-point value `a'.
2344| The operation is performed according to the IEC/IEEE Standard for Binary
2345| Floating-Point Arithmetic.
2346*----------------------------------------------------------------------------*/
2347
2348float32 float32_sqrt( float32 a STATUS_PARAM )
2349{
2350 flag aSign;
94a49d86 2351 int_fast16_t aExp, zExp;
bb98fe42
AF
2352 uint32_t aSig, zSig;
2353 uint64_t rem, term;
37d18660 2354 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2355
2356 aSig = extractFloat32Frac( a );
2357 aExp = extractFloat32Exp( a );
2358 aSign = extractFloat32Sign( a );
2359 if ( aExp == 0xFF ) {
f090c9d4 2360 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
158142c2
FB
2361 if ( ! aSign ) return a;
2362 float_raise( float_flag_invalid STATUS_VAR);
2363 return float32_default_nan;
2364 }
2365 if ( aSign ) {
2366 if ( ( aExp | aSig ) == 0 ) return a;
2367 float_raise( float_flag_invalid STATUS_VAR);
2368 return float32_default_nan;
2369 }
2370 if ( aExp == 0 ) {
f090c9d4 2371 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2372 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2373 }
2374 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2375 aSig = ( aSig | 0x00800000 )<<8;
2376 zSig = estimateSqrt32( aExp, aSig ) + 2;
2377 if ( ( zSig & 0x7F ) <= 5 ) {
2378 if ( zSig < 2 ) {
2379 zSig = 0x7FFFFFFF;
2380 goto roundAndPack;
2381 }
2382 aSig >>= aExp & 1;
bb98fe42
AF
2383 term = ( (uint64_t) zSig ) * zSig;
2384 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2385 while ( (int64_t) rem < 0 ) {
158142c2 2386 --zSig;
bb98fe42 2387 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2388 }
2389 zSig |= ( rem != 0 );
2390 }
2391 shift32RightJamming( zSig, 1, &zSig );
2392 roundAndPack:
2393 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2394
2395}
2396
8229c991
AJ
2397/*----------------------------------------------------------------------------
2398| Returns the binary exponential of the single-precision floating-point value
2399| `a'. The operation is performed according to the IEC/IEEE Standard for
2400| Binary Floating-Point Arithmetic.
2401|
2402| Uses the following identities:
2403|
2404| 1. -------------------------------------------------------------------------
2405|       x        x*ln(2)
2406|      2     =  e
2407|
2408| 2. -------------------------------------------------------------------------
2409|                      2     3     4     5           n
2410|       x        x     x     x     x     x           x
2411|      e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2412|               1!    2!    3!    4!    5!          n!
2413*----------------------------------------------------------------------------*/
2414
2415static const float64 float32_exp2_coefficients[15] =
2416{
d5138cf4
PM
2417 const_float64( 0x3ff0000000000000ll ), /* 1 */
2418 const_float64( 0x3fe0000000000000ll ), /* 2 */
2419 const_float64( 0x3fc5555555555555ll ), /* 3 */
2420 const_float64( 0x3fa5555555555555ll ), /* 4 */
2421 const_float64( 0x3f81111111111111ll ), /* 5 */
2422 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2423 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2424 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2425 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2426 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2427 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2428 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2429 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2430 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2431 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2432};
2433
2434float32 float32_exp2( float32 a STATUS_PARAM )
2435{
2436 flag aSign;
94a49d86 2437 int_fast16_t aExp;
bb98fe42 2438 uint32_t aSig;
8229c991
AJ
2439 float64 r, x, xn;
2440 int i;
37d18660 2441 a = float32_squash_input_denormal(a STATUS_VAR);
8229c991
AJ
2442
2443 aSig = extractFloat32Frac( a );
2444 aExp = extractFloat32Exp( a );
2445 aSign = extractFloat32Sign( a );
2446
2447 if ( aExp == 0xFF) {
2448 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2449 return (aSign) ? float32_zero : a;
2450 }
2451 if (aExp == 0) {
2452 if (aSig == 0) return float32_one;
2453 }
2454
2455 float_raise( float_flag_inexact STATUS_VAR);
2456
2457 /* ******************************* */
2458 /* using float64 for approximation */
2459 /* ******************************* */
2460 x = float32_to_float64(a STATUS_VAR);
2461 x = float64_mul(x, float64_ln2 STATUS_VAR);
2462
2463 xn = x;
2464 r = float64_one;
2465 for (i = 0 ; i < 15 ; i++) {
2466 float64 f;
2467
2468 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2469 r = float64_add(r, f STATUS_VAR);
2470
2471 xn = float64_mul(xn, x STATUS_VAR);
2472 }
2473
2474 return float64_to_float32(r, status);
2475}
2476
374dfc33
AJ
2477/*----------------------------------------------------------------------------
2478| Returns the binary log of the single-precision floating-point value `a'.
2479| The operation is performed according to the IEC/IEEE Standard for Binary
2480| Floating-Point Arithmetic.
2481*----------------------------------------------------------------------------*/
2482float32 float32_log2( float32 a STATUS_PARAM )
2483{
2484 flag aSign, zSign;
94a49d86 2485 int_fast16_t aExp;
bb98fe42 2486 uint32_t aSig, zSig, i;
374dfc33 2487
37d18660 2488 a = float32_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
2489 aSig = extractFloat32Frac( a );
2490 aExp = extractFloat32Exp( a );
2491 aSign = extractFloat32Sign( a );
2492
2493 if ( aExp == 0 ) {
2494 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2495 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2496 }
2497 if ( aSign ) {
2498 float_raise( float_flag_invalid STATUS_VAR);
2499 return float32_default_nan;
2500 }
2501 if ( aExp == 0xFF ) {
2502 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2503 return a;
2504 }
2505
2506 aExp -= 0x7F;
2507 aSig |= 0x00800000;
2508 zSign = aExp < 0;
2509 zSig = aExp << 23;
2510
2511 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2512 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2513 if ( aSig & 0x01000000 ) {
2514 aSig >>= 1;
2515 zSig |= i;
2516 }
2517 }
2518
2519 if ( zSign )
2520 zSig = -zSig;
2521
2522 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2523}
2524
158142c2
FB
2525/*----------------------------------------------------------------------------
2526| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2527| the corresponding value `b', and 0 otherwise. The invalid exception is
2528| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2529| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2530*----------------------------------------------------------------------------*/
2531
b689362d 2532int float32_eq( float32 a, float32 b STATUS_PARAM )
158142c2 2533{
b689362d 2534 uint32_t av, bv;
37d18660
PM
2535 a = float32_squash_input_denormal(a STATUS_VAR);
2536 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2537
2538 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2539 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2540 ) {
b689362d 2541 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
2542 return 0;
2543 }
b689362d
AJ
2544 av = float32_val(a);
2545 bv = float32_val(b);
2546 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2547}
2548
2549/*----------------------------------------------------------------------------
2550| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2551| or equal to the corresponding value `b', and 0 otherwise. The invalid
2552| exception is raised if either operand is a NaN. The comparison is performed
2553| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2554*----------------------------------------------------------------------------*/
2555
750afe93 2556int float32_le( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2557{
2558 flag aSign, bSign;
bb98fe42 2559 uint32_t av, bv;
37d18660
PM
2560 a = float32_squash_input_denormal(a STATUS_VAR);
2561 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2562
2563 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2564 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2565 ) {
2566 float_raise( float_flag_invalid STATUS_VAR);
2567 return 0;
2568 }
2569 aSign = extractFloat32Sign( a );
2570 bSign = extractFloat32Sign( b );
f090c9d4
PB
2571 av = float32_val(a);
2572 bv = float32_val(b);
bb98fe42 2573 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2574 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2575
2576}
2577
2578/*----------------------------------------------------------------------------
2579| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2580| the corresponding value `b', and 0 otherwise. The invalid exception is
2581| raised if either operand is a NaN. The comparison is performed according
2582| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2583*----------------------------------------------------------------------------*/
2584
750afe93 2585int float32_lt( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2586{
2587 flag aSign, bSign;
bb98fe42 2588 uint32_t av, bv;
37d18660
PM
2589 a = float32_squash_input_denormal(a STATUS_VAR);
2590 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2591
2592 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2593 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2594 ) {
2595 float_raise( float_flag_invalid STATUS_VAR);
2596 return 0;
2597 }
2598 aSign = extractFloat32Sign( a );
2599 bSign = extractFloat32Sign( b );
f090c9d4
PB
2600 av = float32_val(a);
2601 bv = float32_val(b);
bb98fe42 2602 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2603 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2604
2605}
2606
67b7861d
AJ
2607/*----------------------------------------------------------------------------
2608| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2609| be compared, and 0 otherwise. The invalid exception is raised if either
2610| operand is a NaN. The comparison is performed according to the IEC/IEEE
2611| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2612*----------------------------------------------------------------------------*/
2613
2614int float32_unordered( float32 a, float32 b STATUS_PARAM )
2615{
2616 a = float32_squash_input_denormal(a STATUS_VAR);
2617 b = float32_squash_input_denormal(b STATUS_VAR);
2618
2619 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2620 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2621 ) {
2622 float_raise( float_flag_invalid STATUS_VAR);
2623 return 1;
2624 }
2625 return 0;
2626}
b689362d 2627
158142c2
FB
2628/*----------------------------------------------------------------------------
2629| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2630| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2631| exception. The comparison is performed according to the IEC/IEEE Standard
2632| for Binary Floating-Point Arithmetic.
158142c2
FB
2633*----------------------------------------------------------------------------*/
2634
b689362d 2635int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
158142c2 2636{
37d18660
PM
2637 a = float32_squash_input_denormal(a STATUS_VAR);
2638 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2639
2640 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2641 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2642 ) {
b689362d
AJ
2643 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2644 float_raise( float_flag_invalid STATUS_VAR);
2645 }
158142c2
FB
2646 return 0;
2647 }
b689362d
AJ
2648 return ( float32_val(a) == float32_val(b) ) ||
2649 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2650}
2651
2652/*----------------------------------------------------------------------------
2653| Returns 1 if the single-precision floating-point value `a' is less than or
2654| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2655| cause an exception. Otherwise, the comparison is performed according to the
2656| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2657*----------------------------------------------------------------------------*/
2658
750afe93 2659int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2660{
2661 flag aSign, bSign;
bb98fe42 2662 uint32_t av, bv;
37d18660
PM
2663 a = float32_squash_input_denormal(a STATUS_VAR);
2664 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2665
2666 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2667 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2668 ) {
2669 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2670 float_raise( float_flag_invalid STATUS_VAR);
2671 }
2672 return 0;
2673 }
2674 aSign = extractFloat32Sign( a );
2675 bSign = extractFloat32Sign( b );
f090c9d4
PB
2676 av = float32_val(a);
2677 bv = float32_val(b);
bb98fe42 2678 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2679 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2680
2681}
2682
2683/*----------------------------------------------------------------------------
2684| Returns 1 if the single-precision floating-point value `a' is less than
2685| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2686| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2687| Standard for Binary Floating-Point Arithmetic.
2688*----------------------------------------------------------------------------*/
2689
750afe93 2690int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2691{
2692 flag aSign, bSign;
bb98fe42 2693 uint32_t av, bv;
37d18660
PM
2694 a = float32_squash_input_denormal(a STATUS_VAR);
2695 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2696
2697 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2698 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2699 ) {
2700 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2701 float_raise( float_flag_invalid STATUS_VAR);
2702 }
2703 return 0;
2704 }
2705 aSign = extractFloat32Sign( a );
2706 bSign = extractFloat32Sign( b );
f090c9d4
PB
2707 av = float32_val(a);
2708 bv = float32_val(b);
bb98fe42 2709 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2710 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2711
2712}
2713
67b7861d
AJ
2714/*----------------------------------------------------------------------------
2715| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2716| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2717| comparison is performed according to the IEC/IEEE Standard for Binary
2718| Floating-Point Arithmetic.
2719*----------------------------------------------------------------------------*/
2720
2721int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2722{
2723 a = float32_squash_input_denormal(a STATUS_VAR);
2724 b = float32_squash_input_denormal(b STATUS_VAR);
2725
2726 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2727 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2728 ) {
2729 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2730 float_raise( float_flag_invalid STATUS_VAR);
2731 }
2732 return 1;
2733 }
2734 return 0;
2735}
2736
158142c2
FB
2737/*----------------------------------------------------------------------------
2738| Returns the result of converting the double-precision floating-point value
2739| `a' to the 32-bit two's complement integer format. The conversion is
2740| performed according to the IEC/IEEE Standard for Binary Floating-Point
2741| Arithmetic---which means in particular that the conversion is rounded
2742| according to the current rounding mode. If `a' is a NaN, the largest
2743| positive integer is returned. Otherwise, if the conversion overflows, the
2744| largest integer with the same sign as `a' is returned.
2745*----------------------------------------------------------------------------*/
2746
2747int32 float64_to_int32( float64 a STATUS_PARAM )
2748{
2749 flag aSign;
94a49d86 2750 int_fast16_t aExp, shiftCount;
bb98fe42 2751 uint64_t aSig;
37d18660 2752 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2753
2754 aSig = extractFloat64Frac( a );
2755 aExp = extractFloat64Exp( a );
2756 aSign = extractFloat64Sign( a );
2757 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2758 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2759 shiftCount = 0x42C - aExp;
2760 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2761 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2762
2763}
2764
2765/*----------------------------------------------------------------------------
2766| Returns the result of converting the double-precision floating-point value
2767| `a' to the 32-bit two's complement integer format. The conversion is
2768| performed according to the IEC/IEEE Standard for Binary Floating-Point
2769| Arithmetic, except that the conversion is always rounded toward zero.
2770| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2771| the conversion overflows, the largest integer with the same sign as `a' is
2772| returned.
2773*----------------------------------------------------------------------------*/
2774
2775int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2776{
2777 flag aSign;
94a49d86 2778 int_fast16_t aExp, shiftCount;
bb98fe42 2779 uint64_t aSig, savedASig;
b3a6a2e0 2780 int32_t z;
37d18660 2781 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2782
2783 aSig = extractFloat64Frac( a );
2784 aExp = extractFloat64Exp( a );
2785 aSign = extractFloat64Sign( a );
2786 if ( 0x41E < aExp ) {
2787 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2788 goto invalid;
2789 }
2790 else if ( aExp < 0x3FF ) {
2791 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2792 return 0;
2793 }
2794 aSig |= LIT64( 0x0010000000000000 );
2795 shiftCount = 0x433 - aExp;
2796 savedASig = aSig;
2797 aSig >>= shiftCount;
2798 z = aSig;
2799 if ( aSign ) z = - z;
2800 if ( ( z < 0 ) ^ aSign ) {
2801 invalid:
2802 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2803 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
2804 }
2805 if ( ( aSig<<shiftCount ) != savedASig ) {
2806 STATUS(float_exception_flags) |= float_flag_inexact;
2807 }
2808 return z;
2809
2810}
2811
cbcef455
PM
2812/*----------------------------------------------------------------------------
2813| Returns the result of converting the double-precision floating-point value
2814| `a' to the 16-bit two's complement integer format. The conversion is
2815| performed according to the IEC/IEEE Standard for Binary Floating-Point
2816| Arithmetic, except that the conversion is always rounded toward zero.
2817| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2818| the conversion overflows, the largest integer with the same sign as `a' is
2819| returned.
2820*----------------------------------------------------------------------------*/
2821
94a49d86 2822int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
2823{
2824 flag aSign;
94a49d86 2825 int_fast16_t aExp, shiftCount;
bb98fe42 2826 uint64_t aSig, savedASig;
cbcef455
PM
2827 int32 z;
2828
2829 aSig = extractFloat64Frac( a );
2830 aExp = extractFloat64Exp( a );
2831 aSign = extractFloat64Sign( a );
2832 if ( 0x40E < aExp ) {
2833 if ( ( aExp == 0x7FF ) && aSig ) {
2834 aSign = 0;
2835 }
2836 goto invalid;
2837 }
2838 else if ( aExp < 0x3FF ) {
2839 if ( aExp || aSig ) {
2840 STATUS(float_exception_flags) |= float_flag_inexact;
2841 }
2842 return 0;
2843 }
2844 aSig |= LIT64( 0x0010000000000000 );
2845 shiftCount = 0x433 - aExp;
2846 savedASig = aSig;
2847 aSig >>= shiftCount;
2848 z = aSig;
2849 if ( aSign ) {
2850 z = - z;
2851 }
2852 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2853 invalid:
2854 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2855 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
2856 }
2857 if ( ( aSig<<shiftCount ) != savedASig ) {
2858 STATUS(float_exception_flags) |= float_flag_inexact;
2859 }
2860 return z;
2861}
2862
158142c2
FB
2863/*----------------------------------------------------------------------------
2864| Returns the result of converting the double-precision floating-point value
2865| `a' to the 64-bit two's complement integer format. The conversion is
2866| performed according to the IEC/IEEE Standard for Binary Floating-Point
2867| Arithmetic---which means in particular that the conversion is rounded
2868| according to the current rounding mode. If `a' is a NaN, the largest
2869| positive integer is returned. Otherwise, if the conversion overflows, the
2870| largest integer with the same sign as `a' is returned.
2871*----------------------------------------------------------------------------*/
2872
2873int64 float64_to_int64( float64 a STATUS_PARAM )
2874{
2875 flag aSign;
94a49d86 2876 int_fast16_t aExp, shiftCount;
bb98fe42 2877 uint64_t aSig, aSigExtra;
37d18660 2878 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2879
2880 aSig = extractFloat64Frac( a );
2881 aExp = extractFloat64Exp( a );
2882 aSign = extractFloat64Sign( a );
2883 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2884 shiftCount = 0x433 - aExp;
2885 if ( shiftCount <= 0 ) {
2886 if ( 0x43E < aExp ) {
2887 float_raise( float_flag_invalid STATUS_VAR);
2888 if ( ! aSign
2889 || ( ( aExp == 0x7FF )
2890 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2891 ) {
2892 return LIT64( 0x7FFFFFFFFFFFFFFF );
2893 }
bb98fe42 2894 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2895 }
2896 aSigExtra = 0;
2897 aSig <<= - shiftCount;
2898 }
2899 else {
2900 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2901 }
2902 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2903
2904}
2905
2906/*----------------------------------------------------------------------------
2907| Returns the result of converting the double-precision floating-point value
2908| `a' to the 64-bit two's complement integer format. The conversion is
2909| performed according to the IEC/IEEE Standard for Binary Floating-Point
2910| Arithmetic, except that the conversion is always rounded toward zero.
2911| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2912| the conversion overflows, the largest integer with the same sign as `a' is
2913| returned.
2914*----------------------------------------------------------------------------*/
2915
2916int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2917{
2918 flag aSign;
94a49d86 2919 int_fast16_t aExp, shiftCount;
bb98fe42 2920 uint64_t aSig;
158142c2 2921 int64 z;
37d18660 2922 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2923
2924 aSig = extractFloat64Frac( a );
2925 aExp = extractFloat64Exp( a );
2926 aSign = extractFloat64Sign( a );
2927 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2928 shiftCount = aExp - 0x433;
2929 if ( 0 <= shiftCount ) {
2930 if ( 0x43E <= aExp ) {
f090c9d4 2931 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
158142c2
FB
2932 float_raise( float_flag_invalid STATUS_VAR);
2933 if ( ! aSign
2934 || ( ( aExp == 0x7FF )
2935 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2936 ) {
2937 return LIT64( 0x7FFFFFFFFFFFFFFF );
2938 }
2939 }
bb98fe42 2940 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2941 }
2942 z = aSig<<shiftCount;
2943 }
2944 else {
2945 if ( aExp < 0x3FE ) {
2946 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2947 return 0;
2948 }
2949 z = aSig>>( - shiftCount );
bb98fe42 2950 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
2951 STATUS(float_exception_flags) |= float_flag_inexact;
2952 }
2953 }
2954 if ( aSign ) z = - z;
2955 return z;
2956
2957}
2958
2959/*----------------------------------------------------------------------------
2960| Returns the result of converting the double-precision floating-point value
2961| `a' to the single-precision floating-point format. The conversion is
2962| performed according to the IEC/IEEE Standard for Binary Floating-Point
2963| Arithmetic.
2964*----------------------------------------------------------------------------*/
2965
2966float32 float64_to_float32( float64 a STATUS_PARAM )
2967{
2968 flag aSign;
94a49d86 2969 int_fast16_t aExp;
bb98fe42
AF
2970 uint64_t aSig;
2971 uint32_t zSig;
37d18660 2972 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2973
2974 aSig = extractFloat64Frac( a );
2975 aExp = extractFloat64Exp( a );
2976 aSign = extractFloat64Sign( a );
2977 if ( aExp == 0x7FF ) {
bcd4d9af 2978 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
2979 return packFloat32( aSign, 0xFF, 0 );
2980 }
2981 shift64RightJamming( aSig, 22, &aSig );
2982 zSig = aSig;
2983 if ( aExp || zSig ) {
2984 zSig |= 0x40000000;
2985 aExp -= 0x381;
2986 }
2987 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2988
2989}
2990
60011498
PB
2991
2992/*----------------------------------------------------------------------------
2993| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2994| half-precision floating-point value, returning the result. After being
2995| shifted into the proper positions, the three fields are simply added
2996| together to form the result. This means that any integer portion of `zSig'
2997| will be added into the exponent. Since a properly normalized significand
2998| will have an integer portion equal to 1, the `zExp' input should be 1 less
2999| than the desired result exponent whenever `zSig' is a complete, normalized
3000| significand.
3001*----------------------------------------------------------------------------*/
94a49d86 3002static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
60011498 3003{
bb4d4bb3 3004 return make_float16(
bb98fe42 3005 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3006}
3007
3008/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3009 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3
PM
3010
3011float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
60011498
PB
3012{
3013 flag aSign;
94a49d86 3014 int_fast16_t aExp;
bb98fe42 3015 uint32_t aSig;
60011498 3016
bb4d4bb3
PM
3017 aSign = extractFloat16Sign(a);
3018 aExp = extractFloat16Exp(a);
3019 aSig = extractFloat16Frac(a);
60011498
PB
3020
3021 if (aExp == 0x1f && ieee) {
3022 if (aSig) {
f591e1be 3023 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3024 }
4be8eeac 3025 return packFloat32(aSign, 0xff, 0);
60011498
PB
3026 }
3027 if (aExp == 0) {
3028 int8 shiftCount;
3029
3030 if (aSig == 0) {
3031 return packFloat32(aSign, 0, 0);
3032 }
3033
3034 shiftCount = countLeadingZeros32( aSig ) - 21;
3035 aSig = aSig << shiftCount;
3036 aExp = -shiftCount;
3037 }
3038 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3039}
3040
bb4d4bb3 3041float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
60011498
PB
3042{
3043 flag aSign;
94a49d86 3044 int_fast16_t aExp;
bb98fe42
AF
3045 uint32_t aSig;
3046 uint32_t mask;
3047 uint32_t increment;
60011498 3048 int8 roundingMode;
38970efa
PM
3049 int maxexp = ieee ? 15 : 16;
3050 bool rounding_bumps_exp;
3051 bool is_tiny = false;
3052
37d18660 3053 a = float32_squash_input_denormal(a STATUS_VAR);
60011498
PB
3054
3055 aSig = extractFloat32Frac( a );
3056 aExp = extractFloat32Exp( a );
3057 aSign = extractFloat32Sign( a );
3058 if ( aExp == 0xFF ) {
3059 if (aSig) {
600e30d2 3060 /* Input is a NaN */
600e30d2 3061 if (!ieee) {
38970efa 3062 float_raise(float_flag_invalid STATUS_VAR);
600e30d2
PM
3063 return packFloat16(aSign, 0, 0);
3064 }
38970efa
PM
3065 return commonNaNToFloat16(
3066 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3067 }
600e30d2
PM
3068 /* Infinity */
3069 if (!ieee) {
3070 float_raise(float_flag_invalid STATUS_VAR);
3071 return packFloat16(aSign, 0x1f, 0x3ff);
3072 }
3073 return packFloat16(aSign, 0x1f, 0);
60011498 3074 }
600e30d2 3075 if (aExp == 0 && aSig == 0) {
60011498
PB
3076 return packFloat16(aSign, 0, 0);
3077 }
38970efa
PM
3078 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3079 * even if the input is denormal; however this is harmless because
3080 * the largest possible single-precision denormal is still smaller
3081 * than the smallest representable half-precision denormal, and so we
3082 * will end up ignoring aSig and returning via the "always return zero"
3083 * codepath.
3084 */
60011498
PB
3085 aSig |= 0x00800000;
3086 aExp -= 0x7f;
38970efa
PM
3087 /* Calculate the mask of bits of the mantissa which are not
3088 * representable in half-precision and will be lost.
3089 */
60011498 3090 if (aExp < -14) {
38970efa 3091 /* Will be denormal in halfprec */
600e30d2
PM
3092 mask = 0x00ffffff;
3093 if (aExp >= -24) {
3094 mask >>= 25 + aExp;
60011498
PB
3095 }
3096 } else {
38970efa 3097 /* Normal number in halfprec */
60011498
PB
3098 mask = 0x00001fff;
3099 }
60011498 3100
38970efa
PM
3101 roundingMode = STATUS(float_rounding_mode);
3102 switch (roundingMode) {
3103 case float_round_nearest_even:
3104 increment = (mask + 1) >> 1;
3105 if ((aSig & mask) == increment) {
3106 increment = aSig & (increment << 1);
3107 }
3108 break;
3109 case float_round_up:
3110 increment = aSign ? 0 : mask;
3111 break;
3112 case float_round_down:
3113 increment = aSign ? mask : 0;
3114 break;
3115 default: /* round_to_zero */
3116 increment = 0;
3117 break;
3118 }
3119
3120 rounding_bumps_exp = (aSig + increment >= 0x01000000);
3121
3122 if (aExp > maxexp || (aExp == maxexp && rounding_bumps_exp)) {
3123 if (ieee) {
3124 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
60011498 3125 return packFloat16(aSign, 0x1f, 0);
38970efa
PM
3126 } else {
3127 float_raise(float_flag_invalid STATUS_VAR);
60011498
PB
3128 return packFloat16(aSign, 0x1f, 0x3ff);
3129 }
3130 }
38970efa
PM
3131
3132 if (aExp < -14) {
3133 /* Note that flush-to-zero does not affect half-precision results */
3134 is_tiny =
3135 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3136 || (aExp < -15)
3137 || (!rounding_bumps_exp);
3138 }
3139 if (aSig & mask) {
3140 float_raise(float_flag_inexact STATUS_VAR);
3141 if (is_tiny) {
3142 float_raise(float_flag_underflow STATUS_VAR);
3143 }
3144 }
3145
3146 aSig += increment;
3147 if (rounding_bumps_exp) {
3148 aSig >>= 1;
3149 aExp++;
3150 }
3151
60011498
PB
3152 if (aExp < -24) {
3153 return packFloat16(aSign, 0, 0);
3154 }
3155 if (aExp < -14) {
3156 aSig >>= -14 - aExp;
3157 aExp = -14;
3158 }
3159 return packFloat16(aSign, aExp + 14, aSig >> 13);
3160}
3161
158142c2
FB
3162/*----------------------------------------------------------------------------
3163| Returns the result of converting the double-precision floating-point value
3164| `a' to the extended double-precision floating-point format. The conversion
3165| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3166| Arithmetic.
3167*----------------------------------------------------------------------------*/
3168
3169floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3170{
3171 flag aSign;
94a49d86 3172 int_fast16_t aExp;
bb98fe42 3173 uint64_t aSig;
158142c2 3174
37d18660 3175 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3176 aSig = extractFloat64Frac( a );
3177 aExp = extractFloat64Exp( a );
3178 aSign = extractFloat64Sign( a );
3179 if ( aExp == 0x7FF ) {
bcd4d9af 3180 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3181 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3182 }
3183 if ( aExp == 0 ) {
3184 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3185 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3186 }
3187 return
3188 packFloatx80(
3189 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3190
3191}
3192
158142c2
FB
3193/*----------------------------------------------------------------------------
3194| Returns the result of converting the double-precision floating-point value
3195| `a' to the quadruple-precision floating-point format. The conversion is
3196| performed according to the IEC/IEEE Standard for Binary Floating-Point
3197| Arithmetic.
3198*----------------------------------------------------------------------------*/
3199
3200float128 float64_to_float128( float64 a STATUS_PARAM )
3201{
3202 flag aSign;
94a49d86 3203 int_fast16_t aExp;
bb98fe42 3204 uint64_t aSig, zSig0, zSig1;
158142c2 3205
37d18660 3206 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3207 aSig = extractFloat64Frac( a );
3208 aExp = extractFloat64Exp( a );
3209 aSign = extractFloat64Sign( a );
3210 if ( aExp == 0x7FF ) {
bcd4d9af 3211 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3212 return packFloat128( aSign, 0x7FFF, 0, 0 );
3213 }
3214 if ( aExp == 0 ) {
3215 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3216 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3217 --aExp;
3218 }
3219 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3220 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3221
3222}
3223
158142c2
FB
3224/*----------------------------------------------------------------------------
3225| Rounds the double-precision floating-point value `a' to an integer, and
3226| returns the result as a double-precision floating-point value. The
3227| operation is performed according to the IEC/IEEE Standard for Binary
3228| Floating-Point Arithmetic.
3229*----------------------------------------------------------------------------*/
3230
3231float64 float64_round_to_int( float64 a STATUS_PARAM )
3232{
3233 flag aSign;
94a49d86 3234 int_fast16_t aExp;
bb98fe42 3235 uint64_t lastBitMask, roundBitsMask;
158142c2 3236 int8 roundingMode;
bb98fe42 3237 uint64_t z;
37d18660 3238 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3239
3240 aExp = extractFloat64Exp( a );
3241 if ( 0x433 <= aExp ) {
3242 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3243 return propagateFloat64NaN( a, a STATUS_VAR );
3244 }
3245 return a;
3246 }
3247 if ( aExp < 0x3FF ) {
bb98fe42 3248 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
158142c2
FB
3249 STATUS(float_exception_flags) |= float_flag_inexact;
3250 aSign = extractFloat64Sign( a );
3251 switch ( STATUS(float_rounding_mode) ) {
3252 case float_round_nearest_even:
3253 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3254 return packFloat64( aSign, 0x3FF, 0 );
3255 }
3256 break;
3257 case float_round_down:
f090c9d4 3258 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3259 case float_round_up:
f090c9d4
PB
3260 return make_float64(
3261 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3262 }
3263 return packFloat64( aSign, 0, 0 );
3264 }
3265 lastBitMask = 1;
3266 lastBitMask <<= 0x433 - aExp;
3267 roundBitsMask = lastBitMask - 1;
f090c9d4 3268 z = float64_val(a);
158142c2
FB
3269 roundingMode = STATUS(float_rounding_mode);
3270 if ( roundingMode == float_round_nearest_even ) {
3271 z += lastBitMask>>1;
3272 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3273 }
3274 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 3275 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
3276 z += roundBitsMask;
3277 }
3278 }
3279 z &= ~ roundBitsMask;
f090c9d4
PB
3280 if ( z != float64_val(a) )
3281 STATUS(float_exception_flags) |= float_flag_inexact;
3282 return make_float64(z);
158142c2
FB
3283
3284}
3285
e6e5906b
PB
3286float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3287{
3288 int oldmode;
3289 float64 res;
3290 oldmode = STATUS(float_rounding_mode);
3291 STATUS(float_rounding_mode) = float_round_to_zero;
3292 res = float64_round_to_int(a STATUS_VAR);
3293 STATUS(float_rounding_mode) = oldmode;
3294 return res;
3295}
3296
158142c2
FB
3297/*----------------------------------------------------------------------------
3298| Returns the result of adding the absolute values of the double-precision
3299| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3300| before being returned. `zSign' is ignored if the result is a NaN.
3301| The addition is performed according to the IEC/IEEE Standard for Binary
3302| Floating-Point Arithmetic.
3303*----------------------------------------------------------------------------*/
3304
3305static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3306{
94a49d86 3307 int_fast16_t aExp, bExp, zExp;
bb98fe42 3308 uint64_t aSig, bSig, zSig;
94a49d86 3309 int_fast16_t expDiff;
158142c2
FB
3310
3311 aSig = extractFloat64Frac( a );
3312 aExp = extractFloat64Exp( a );
3313 bSig = extractFloat64Frac( b );
3314 bExp = extractFloat64Exp( b );
3315 expDiff = aExp - bExp;
3316 aSig <<= 9;
3317 bSig <<= 9;
3318 if ( 0 < expDiff ) {
3319 if ( aExp == 0x7FF ) {
3320 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3321 return a;
3322 }
3323 if ( bExp == 0 ) {
3324 --expDiff;
3325 }
3326 else {
3327 bSig |= LIT64( 0x2000000000000000 );
3328 }
3329 shift64RightJamming( bSig, expDiff, &bSig );
3330 zExp = aExp;
3331 }
3332 else if ( expDiff < 0 ) {
3333 if ( bExp == 0x7FF ) {
3334 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3335 return packFloat64( zSign, 0x7FF, 0 );
3336 }
3337 if ( aExp == 0 ) {
3338 ++expDiff;
3339 }
3340 else {
3341 aSig |= LIT64( 0x2000000000000000 );
3342 }
3343 shift64RightJamming( aSig, - expDiff, &aSig );
3344 zExp = bExp;
3345 }
3346 else {
3347 if ( aExp == 0x7FF ) {
3348 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3349 return a;
3350 }
fe76d976 3351 if ( aExp == 0 ) {
e6afc87f
PM
3352 if (STATUS(flush_to_zero)) {
3353 if (aSig | bSig) {
3354 float_raise(float_flag_output_denormal STATUS_VAR);
3355 }
3356 return packFloat64(zSign, 0, 0);
3357 }
fe76d976
PB
3358 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3359 }
158142c2
FB
3360 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3361 zExp = aExp;
3362 goto roundAndPack;
3363 }
3364 aSig |= LIT64( 0x2000000000000000 );
3365 zSig = ( aSig + bSig )<<1;
3366 --zExp;
bb98fe42 3367 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3368 zSig = aSig + bSig;
3369 ++zExp;
3370 }
3371 roundAndPack:
3372 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3373
3374}
3375
3376/*----------------------------------------------------------------------------
3377| Returns the result of subtracting the absolute values of the double-
3378| precision floating-point values `a' and `b'. If `zSign' is 1, the
3379| difference is negated before being returned. `zSign' is ignored if the
3380| result is a NaN. The subtraction is performed according to the IEC/IEEE
3381| Standard for Binary Floating-Point Arithmetic.
3382*----------------------------------------------------------------------------*/
3383
3384static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3385{
/* Subtracts the magnitude of `b' from the magnitude of `a' and passes the
 * difference to normalizeRoundAndPackFloat64().  `zSign' is inverted on the
 * paths where `b' turns out to have the larger magnitude. */
94a49d86 3386 int_fast16_t aExp, bExp, zExp;
bb98fe42 3387 uint64_t aSig, bSig, zSig;
94a49d86 3388 int_fast16_t expDiff;
158142c2
FB
3389
3390 aSig = extractFloat64Frac( a );
3391 aExp = extractFloat64Exp( a );
3392 bSig = extractFloat64Frac( b );
3393 bExp = extractFloat64Exp( b );
3394 expDiff = aExp - bExp;
/* Both fractions are moved up by 10 bits before they are aligned. */
3395 aSig <<= 10;
3396 bSig <<= 10;
3397 if ( 0 < expDiff ) goto aExpBigger;
3398 if ( expDiff < 0 ) goto bExpBigger;
/* From here on the two exponent fields are equal. */
3399 if ( aExp == 0x7FF ) {
3400 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
/* Exponent 0x7FF with both fractions zero: raise invalid and return the
 * default NaN. */
3401 float_raise( float_flag_invalid STATUS_VAR);
3402 return float64_default_nan;
3403 }
/* With a zero exponent field, continue with exponent 1 for both operands. */
3404 if ( aExp == 0 ) {
3405 aExp = 1;
3406 bExp = 1;
3407 }
3408 if ( bSig < aSig ) goto aBigger;
3409 if ( aSig < bSig ) goto bBigger;
/* Equal magnitudes: return a zero whose sign bit is set only when the
 * rounding mode is float_round_down. */
3410 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3411 bExpBigger:
3412 if ( bExp == 0x7FF ) {
3413 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
/* `b' has exponent 0x7FF and a zero fraction: return that encoding with the
 * sign inverted. */
3414 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3415 }
/* expDiff is negative here.  A zero exponent field in `a' shortens the
 * alignment shift by one; otherwise bit 62 is set in aSig (presumably the
 * implicit leading one of the shifted fraction). */
3416 if ( aExp == 0 ) {
3417 ++expDiff;
3418 }
3419 else {
3420 aSig |= LIT64( 0x4000000000000000 );
3421 }
3422 shift64RightJamming( aSig, - expDiff, &aSig );
3423 bSig |= LIT64( 0x4000000000000000 );
3424 bBigger:
/* `b' is larger: compute bSig - aSig and invert the result sign. */
3425 zSig = bSig - aSig;
3426 zExp = bExp;
3427 zSign ^= 1;
3428 goto normalizeRoundAndPack;
3429 aExpBigger:
3430 if ( aExp == 0x7FF ) {
3431 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3432 return a;
3433 }
/* Mirror image of the bExpBigger path, with expDiff positive. */
3434 if ( bExp == 0 ) {
3435 --expDiff;
3436 }
3437 else {
3438 bSig |= LIT64( 0x4000000000000000 );
3439 }
3440 shift64RightJamming( bSig, expDiff, &bSig );
3441 aSig |= LIT64( 0x4000000000000000 );
3442 aBigger:
3443 zSig = aSig - bSig;
3444 zExp = aExp;
3445 normalizeRoundAndPack:
/* The exponent is decremented once before normalization and rounding. */
3446 --zExp;
3447 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3448
3449}
3450
3451/*----------------------------------------------------------------------------
3452| Returns the result of adding the double-precision floating-point values `a'
3453| and `b'. The operation is performed according to the IEC/IEEE Standard for
3454| Binary Floating-Point Arithmetic.
3455*----------------------------------------------------------------------------*/
3456
3457float64 float64_add( float64 a, float64 b STATUS_PARAM )
3458{
3459 flag aSign, bSign;
37d18660
PM
3460 a = float64_squash_input_denormal(a STATUS_VAR);
3461 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3462
3463 aSign = extractFloat64Sign( a );
3464 bSign = extractFloat64Sign( b );
3465 if ( aSign == bSign ) {
3466 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3467 }
3468 else {
3469 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3470 }
3471
3472}
3473
3474/*----------------------------------------------------------------------------
3475| Returns the result of subtracting the double-precision floating-point values
3476| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3477| for Binary Floating-Point Arithmetic.
3478*----------------------------------------------------------------------------*/
3479
3480float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3481{
3482 flag aSign, bSign;
37d18660
PM
3483 a = float64_squash_input_denormal(a STATUS_VAR);
3484 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3485
3486 aSign = extractFloat64Sign( a );
3487 bSign = extractFloat64Sign( b );
3488 if ( aSign == bSign ) {
3489 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3490 }
3491 else {
3492 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3493 }
3494
3495}
3496
3497/*----------------------------------------------------------------------------
3498| Returns the result of multiplying the double-precision floating-point values
3499| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3500| for Binary Floating-Point Arithmetic.
3501*----------------------------------------------------------------------------*/
3502
3503float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3504{
/* Multiplies `a' by `b'.  Special encodings (exponent 0x7FF, zeroes) are
 * handled first; the general case multiplies the two significands into a
 * 128-bit product and rounds the high word. */
3505 flag aSign, bSign, zSign;
94a49d86 3506 int_fast16_t aExp, bExp, zExp;
bb98fe42 3507 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3508
37d18660
PM
3509 a = float64_squash_input_denormal(a STATUS_VAR);
3510 b = float64_squash_input_denormal(b STATUS_VAR);
3511
158142c2
FB
3512 aSig = extractFloat64Frac( a );
3513 aExp = extractFloat64Exp( a );
3514 aSign = extractFloat64Sign( a );
3515 bSig = extractFloat64Frac( b );
3516 bExp = extractFloat64Exp( b );
3517 bSign = extractFloat64Sign( b );
/* The result sign is the XOR of the operand signs. */
3518 zSign = aSign ^ bSign;
3519 if ( aExp == 0x7FF ) {
/* A NaN in either operand is propagated. */
3520 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3521 return propagateFloat64NaN( a, b STATUS_VAR );
3522 }
/* `a' has exponent 0x7FF with a zero fraction and `b' is exactly zero:
 * raise invalid and return the default NaN. */
3523 if ( ( bExp | bSig ) == 0 ) {
3524 float_raise( float_flag_invalid STATUS_VAR);
3525 return float64_default_nan;
3526 }
3527 return packFloat64( zSign, 0x7FF, 0 );
3528 }
3529 if ( bExp == 0x7FF ) {
3530 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
/* Same invalid case with the operand roles swapped. */
3531 if ( ( aExp | aSig ) == 0 ) {
3532 float_raise( float_flag_invalid STATUS_VAR);
3533 return float64_default_nan;
3534 }
3535 return packFloat64( zSign, 0x7FF, 0 );
3536 }
/* A zero operand yields a signed zero; a nonzero operand with a zero
 * exponent field goes through normalizeFloat64Subnormal() first. */
3537 if ( aExp == 0 ) {
3538 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3539 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3540 }
3541 if ( bExp == 0 ) {
3542 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3543 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3544 }
3545 zExp = aExp + bExp - 0x3FF;
/* Set bit 52 in each fraction, then shift `a' by 10 and `b' by 11 before
 * the 64x64->128 multiply. */
3546 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3547 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3548 mul64To128( aSig, bSig, &zSig0, &zSig1 );
/* Fold any nonzero bits of the low word into bit 0 of the high word. */
3549 zSig0 |= ( zSig1 != 0 );
/* If bit 62 of the high word is clear, shift left by one and lower the
 * exponent to compensate. */
bb98fe42 3550 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
3551 zSig0 <<= 1;
3552 --zExp;
3553 }
3554 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3555
3556}
3557
3558/*----------------------------------------------------------------------------
3559| Returns the result of dividing the double-precision floating-point value `a'
3560| by the corresponding value `b'. The operation is performed according to
3561| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3562*----------------------------------------------------------------------------*/
3563
3564float64 float64_div( float64 a, float64 b STATUS_PARAM )
3565{
/* Divides `a' by `b'.  Special encodings are handled first; the general
 * case estimates the quotient with estimateDiv128To64() and corrects the
 * estimate when its low bits are close to a rounding boundary. */
3566 flag aSign, bSign, zSign;
94a49d86 3567 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
3568 uint64_t aSig, bSig, zSig;
3569 uint64_t rem0, rem1;
3570 uint64_t term0, term1;
37d18660
PM
3571 a = float64_squash_input_denormal(a STATUS_VAR);
3572 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3573
3574 aSig = extractFloat64Frac( a );
3575 aExp = extractFloat64Exp( a );
3576 aSign = extractFloat64Sign( a );
3577 bSig = extractFloat64Frac( b );
3578 bExp = extractFloat64Exp( b );
3579 bSign = extractFloat64Sign( b );
/* The result sign is the XOR of the operand signs. */
3580 zSign = aSign ^ bSign;
3581 if ( aExp == 0x7FF ) {
3582 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3583 if ( bExp == 0x7FF ) {
3584 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
/* Both operands have exponent 0x7FF and zero fractions: invalid. */
3585 float_raise( float_flag_invalid STATUS_VAR);
3586 return float64_default_nan;
3587 }
3588 return packFloat64( zSign, 0x7FF, 0 );
3589 }
3590 if ( bExp == 0x7FF ) {
3591 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
/* Divisor has exponent 0x7FF and a zero fraction: the result is a signed
 * zero. */
3592 return packFloat64( zSign, 0, 0 );
3593 }
3594 if ( bExp == 0 ) {
3595 if ( bSig == 0 ) {
/* Zero divided by zero raises invalid; any other dividend raises
 * divbyzero and returns exponent 0x7FF with a zero fraction. */
3596 if ( ( aExp | aSig ) == 0 ) {
3597 float_raise( float_flag_invalid STATUS_VAR);
3598 return float64_default_nan;
3599 }
3600 float_raise( float_flag_divbyzero STATUS_VAR);
3601 return packFloat64( zSign, 0x7FF, 0 );
3602 }
3603 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3604 }
3605 if ( aExp == 0 ) {
3606 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3607 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3608 }
3609 zExp = aExp - bExp + 0x3FD;
3610 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3611 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
/* When bSig does not exceed twice aSig, halve aSig and raise the exponent
 * by one before dividing. */
3612 if ( bSig <= ( aSig + aSig ) ) {
3613 aSig >>= 1;
3614 ++zExp;
3615 }
3616 zSig = estimateDiv128To64( aSig, 0, bSig );
/* Only when the low 9 bits of the estimate are <= 2 is the exact remainder
 * computed: zSig is decremented while the remainder is negative, and bit 0
 * is set if the low remainder word is nonzero. */
3617 if ( ( zSig & 0x1FF ) <= 2 ) {
3618 mul64To128( bSig, zSig, &term0, &term1 );
3619 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3620 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3621 --zSig;
3622 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3623 }
3624 zSig |= ( rem1 != 0 );
3625 }
3626 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3627
3628}
3629
3630/*----------------------------------------------------------------------------
3631| Returns the remainder of the double-precision floating-point value `a'
3632| with respect to the corresponding value `b'. The operation is performed
3633| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3634*----------------------------------------------------------------------------*/
3635
3636float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3637{
/* Computes the remainder of `a' with respect to `b'.  Special encodings are
 * handled first; the general case reduces aSig against bSig in chunks and
 * then picks between the last two candidate remainders. */
ed086f3d 3638 flag aSign, zSign;
94a49d86 3639 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
3640 uint64_t aSig, bSig;
3641 uint64_t q, alternateASig;
3642 int64_t sigMean;
158142c2 3643
37d18660
PM
3644 a = float64_squash_input_denormal(a STATUS_VAR);
3645 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3646 aSig = extractFloat64Frac( a );
3647 aExp = extractFloat64Exp( a );
3648 aSign = extractFloat64Sign( a );
3649 bSig = extractFloat64Frac( b );
3650 bExp = extractFloat64Exp( b );
158142c2
FB
3651 if ( aExp == 0x7FF ) {
3652 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3653 return propagateFloat64NaN( a, b STATUS_VAR );
3654 }
/* `a' has exponent 0x7FF with a zero fraction and `b' is not a NaN:
 * raise invalid and return the default NaN. */
3655 float_raise( float_flag_invalid STATUS_VAR);
3656 return float64_default_nan;
3657 }
3658 if ( bExp == 0x7FF ) {
3659 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
/* `b' has exponent 0x7FF and a zero fraction: `a' is returned unchanged. */
3660 return a;
3661 }
3662 if ( bExp == 0 ) {
3663 if ( bSig == 0 ) {
/* Zero divisor: raise invalid and return the default NaN. */
3664 float_raise( float_flag_invalid STATUS_VAR);
3665 return float64_default_nan;
3666 }
3667 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3668 }
3669 if ( aExp == 0 ) {
3670 if ( aSig == 0 ) return a;
3671 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3672 }
3673 expDiff = aExp - bExp;
3674 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3675 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3676 if ( expDiff < 0 ) {
/* An exponent difference below -1 returns `a' unchanged. */
3677 if ( expDiff < -1 ) return a;
3678 aSig >>= 1;
3679 }
/* q starts as 0 or 1 depending on whether bSig fits into aSig once. */
3680 q = ( bSig <= aSig );
3681 if ( q ) aSig -= bSig;
/* Reduce 62 exponent steps per iteration while more than 64 remain. */
3682 expDiff -= 64;
3683 while ( 0 < expDiff ) {
3684 q = estimateDiv128To64( aSig, 0, bSig );
/* The estimate is lowered by 2, clamped at zero. */
3685 q = ( 2 < q ) ? q - 2 : 0;
3686 aSig = - ( ( bSig>>2 ) * q );
3687 expDiff -= 62;
3688 }
3689 expDiff += 64;
3690 if ( 0 < expDiff ) {
3691 q = estimateDiv128To64( aSig, 0, bSig );
3692 q = ( 2 < q ) ? q - 2 : 0;
3693 q >>= 64 - expDiff;
3694 bSig >>= 2;
3695 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3696 }
3697 else {
3698 aSig >>= 2;
3699 bSig >>= 2;
3700 }
/* Subtract bSig until aSig, read as signed, goes negative; alternateASig
 * keeps the value from just before the last subtraction. */
3701 do {
3702 alternateASig = aSig;
3703 ++q;
3704 aSig -= bSig;
bb98fe42 3705 } while ( 0 <= (int64_t) aSig );
158142c2
FB
/* Choose between the two candidates by the sign of their sum; on a zero
 * sum the previous candidate is taken when q is odd. */
3706 sigMean = aSig + alternateASig;
3707 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3708 aSig = alternateASig;
3709 }
/* A negative remainder is negated and its sign folded into the result. */
bb98fe42 3710 zSign = ( (int64_t) aSig < 0 );
158142c2
FB
3711 if ( zSign ) aSig = - aSig;
3712 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3713
3714}
3715
369be8f6
PM
3716/*----------------------------------------------------------------------------
3717| Returns the result of multiplying the double-precision floating-point values
3718| `a' and `b' then adding `c', with no intermediate rounding step after the
3719| multiplication. The operation is performed according to the IEC/IEEE
3720| Standard for Binary Floating-Point Arithmetic 754-2008.
3721| The flags argument allows the caller to select negation of the
3722| addend, the intermediate product, or the final result. (The difference
3723| between this and having the caller do a separate negation is that negating
3724| externally will flip the sign bit on NaNs.)
3725*----------------------------------------------------------------------------*/
3726
3727float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3728{
/* Computes a * b + c with a single rounding at the end.  `flags' is tested
 * against float_muladd_negate_c, float_muladd_negate_product and
 * float_muladd_negate_result. */
3729 flag aSign, bSign, cSign, zSign;
94a49d86 3730 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
3731 uint64_t aSig, bSig, cSig;
3732 flag pInf, pZero, pSign;
3733 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3734 int shiftcount;
3735 flag signflip, infzero;
3736
3737 a = float64_squash_input_denormal(a STATUS_VAR);
3738 b = float64_squash_input_denormal(b STATUS_VAR);
3739 c = float64_squash_input_denormal(c STATUS_VAR);
3740 aSig = extractFloat64Frac(a);
3741 aExp = extractFloat64Exp(a);
3742 aSign = extractFloat64Sign(a);
3743 bSig = extractFloat64Frac(b);
3744 bExp = extractFloat64Exp(b);
3745 bSign = extractFloat64Sign(b);
3746 cSig = extractFloat64Frac(c);
3747 cExp = extractFloat64Exp(c);
3748 cSign = extractFloat64Sign(c);
3749
/* infzero: one multiplicand is an exact zero and the other has exponent
 * 0x7ff with a zero fraction. */
3750 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3751 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3752
3753 /* It is implementation-defined whether the cases of (0,inf,qnan)
3754 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3755 * they return if they do), so we have to hand this information
3756 * off to the target-specific pick-a-NaN routine.
3757 */
3758 if (((aExp == 0x7ff) && aSig) ||
3759 ((bExp == 0x7ff) && bSig) ||
3760 ((cExp == 0x7ff) && cSig)) {
3761 return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3762 }
3763
/* No NaN operand: zero times infinity raises invalid and returns the
 * default NaN. */
3764 if (infzero) {
3765 float_raise(float_flag_invalid STATUS_VAR);
3766 return float64_default_nan;
3767 }
3768
3769 if (flags & float_muladd_negate_c) {
3770 cSign ^= 1;
3771 }
3772
/* signflip is XORed into the sign of every result packed below. */
3773 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3774
3775 /* Work out the sign and type of the product */
3776 pSign = aSign ^ bSign;
3777 if (flags & float_muladd_negate_product) {
3778 pSign ^= 1;
3779 }
3780 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3781 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3782
3783 if (cExp == 0x7ff) {
3784 if (pInf && (pSign ^ cSign)) {
3785 /* addition of opposite-signed infinities => InvalidOperation */
3786 float_raise(float_flag_invalid STATUS_VAR);
3787 return float64_default_nan;
3788 }
3789 /* Otherwise generate an infinity of the same sign */
3790 return packFloat64(cSign ^ signflip, 0x7ff, 0);
3791 }
3792
/* Infinite product with a finite addend: result takes the product sign. */
3793 if (pInf) {
3794 return packFloat64(pSign ^ signflip, 0x7ff, 0);
3795 }
3796
3797 if (pZero) {
3798 if (cExp == 0) {
3799 if (cSig == 0) {
3800 /* Adding two exact zeroes */
3801 if (pSign == cSign) {
3802 zSign = pSign;
3803 } else if (STATUS(float_rounding_mode) == float_round_down) {
3804 zSign = 1;
3805 } else {
3806 zSign = 0;
3807 }
3808 return packFloat64(zSign ^ signflip, 0, 0);
3809 }
3810 /* Exact zero plus a denorm */
3811 if (STATUS(flush_to_zero)) {
3812 float_raise(float_flag_output_denormal STATUS_VAR);
3813 return packFloat64(cSign ^ signflip, 0, 0);
3814 }
3815 }
3816 /* Zero plus something non-zero : just return the something */
a6e7c184 3817 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
3818 }
3819
/* Both multiplicands are nonzero and finite here; those with a zero
 * exponent field are normalized before multiplying. */
3820 if (aExp == 0) {
3821 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3822 }
3823 if (bExp == 0) {
3824 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3825 }
3826
3827 /* Calculate the actual result a * b + c */
3828
3829 /* Multiply first; this is easy. */
3830 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3831 * because we want the true exponent, not the "one-less-than"
3832 * flavour that roundAndPackFloat64() takes.
3833 */
3834 pExp = aExp + bExp - 0x3fe;
3835 aSig = (aSig | LIT64(0x0010000000000000))<<10;
3836 bSig = (bSig | LIT64(0x0010000000000000))<<11;
3837 mul64To128(aSig, bSig, &pSig0, &pSig1);
/* If bit 62 of the high word is clear, shift the 128-bit product left by
 * one and lower the exponent to compensate. */
3838 if ((int64_t)(pSig0 << 1) >= 0) {
3839 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3840 pExp--;
3841 }
3842
3843 zSign = pSign ^ signflip;
3844
3845 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3846 * bit in position 126.
3847 */
3848 if (cExp == 0) {
3849 if (!cSig) {
3850 /* Throw out the special case of c being an exact zero now */
3851 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3852 return roundAndPackFloat64(zSign, pExp - 1,
3853 pSig1 STATUS_VAR);
3854 }
3855 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3856 }
3857
3858 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3859 * significand of the addend, with the explicit bit in position 126.
3860 */
3861 cSig0 = cSig << (126 - 64 - 52);
3862 cSig1 = 0;
3863 cSig0 |= LIT64(0x4000000000000000);
3864 expDiff = pExp - cExp;
3865
3866 if (pSign == cSign) {
3867 /* Addition */
3868 if (expDiff > 0) {
3869 /* scale c to match p */
3870 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3871 zExp = pExp;
3872 } else if (expDiff < 0) {
3873 /* scale p to match c */
3874 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3875 zExp = cExp;
3876 } else {
3877 /* no scaling needed */
3878 zExp = cExp;
3879 }
3880 /* Add significands and make sure explicit bit ends up in posn 126 */
3881 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3882 if ((int64_t)zSig0 < 0) {
3883 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3884 } else {
3885 zExp--;
3886 }
/* Collapse the 128-bit sum into one word (zSig1) for rounding. */
3887 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3888 return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3889 } else {
3890 /* Subtraction */
/* The smaller-exponent operand is shifted right; the result sign is
 * inverted whenever the addend ends up as the minuend. */
3891 if (expDiff > 0) {
3892 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3893 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3894 zExp = pExp;
3895 } else if (expDiff < 0) {
3896 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3897 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3898 zExp = cExp;
3899 zSign ^= 1;
3900 } else {
3901 zExp = pExp;
3902 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3903 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3904 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3905 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3906 zSign ^= 1;
3907 } else {
3908 /* Exact zero */
3909 zSign = signflip;
3910 if (STATUS(float_rounding_mode) == float_round_down) {
3911 zSign ^= 1;
3912 }
3913 return packFloat64(zSign, 0, 0);
3914 }
3915 }
3916 --zExp;
3917 /* Do the equivalent of normalizeRoundAndPackFloat64() but
3918 * starting with the significand in a pair of uint64_t.
3919 */
3920 if (zSig0) {
3921 shiftcount = countLeadingZeros64(zSig0) - 1;
3922 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
/* Any bits left in the low word become a sticky bit in the high word. */
3923 if (zSig1) {
3924 zSig0 |= 1;
3925 }
3926 zExp -= shiftcount;
3927 } else {
e3d142d0
PM
/* The high word is zero: normalize from zSig1 alone. */
3928 shiftcount = countLeadingZeros64(zSig1);
3929 if (shiftcount == 0) {
/* No leading zeros in zSig1: shift right by one and keep the dropped bit
 * as a sticky bit. */
3930 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
3931 zExp -= 63;
3932 } else {
3933 shiftcount--;
3934 zSig0 = zSig1 << shiftcount;
3935 zExp -= (shiftcount + 64);
3936 }
369be8f6
PM
3937 }
3938 return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
3939 }
3940}
3941
158142c2
FB
3942/*----------------------------------------------------------------------------
3943| Returns the square root of the double-precision floating-point value `a'.
3944| The operation is performed according to the IEC/IEEE Standard for Binary
3945| Floating-Point Arithmetic.
3946*----------------------------------------------------------------------------*/
3947
3948float64 float64_sqrt( float64 a STATUS_PARAM )
3949{
/* Computes the square root of `a'.  Special encodings and negative inputs
 * are handled first; the general case builds an estimate with
 * estimateSqrt32() and estimateDiv128To64() and corrects it when its low
 * bits are close to a rounding boundary. */
3950 flag aSign;
94a49d86 3951 int_fast16_t aExp, zExp;
bb98fe42
AF
3952 uint64_t aSig, zSig, doubleZSig;
3953 uint64_t rem0, rem1, term0, term1;
37d18660 3954 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3955
3956 aSig = extractFloat64Frac( a );
3957 aExp = extractFloat64Exp( a );
3958 aSign = extractFloat64Sign( a );
3959 if ( aExp == 0x7FF ) {
3960 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
/* Exponent 0x7FF with a zero fraction: returned unchanged when the sign
 * bit is clear, otherwise invalid. */
3961 if ( ! aSign ) return a;
3962 float_raise( float_flag_invalid STATUS_VAR);
3963 return float64_default_nan;
3964 }
3965 if ( aSign ) {
/* A negative zero is returned unchanged; any other negative input raises
 * invalid and returns the default NaN. */
3966 if ( ( aExp | aSig ) == 0 ) return a;
3967 float_raise( float_flag_invalid STATUS_VAR);
3968 return float64_default_nan;
3969 }
3970 if ( aExp == 0 ) {
f090c9d4 3971 if ( aSig == 0 ) return float64_zero;
158142c2
FB
3972 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3973 }
/* The unbiased exponent is halved and then re-biased with 0x3FE. */
3974 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3975 aSig |= LIT64( 0x0010000000000000 );
3976 zSig = estimateSqrt32( aExp, aSig>>21 );
/* The shift depends on the parity of the exponent: 9 when even, 8 when odd. */
3977 aSig <<= 9 - ( aExp & 1 );
3978 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
/* Only when the low 9 bits of the estimate are <= 5 is the exact remainder
 * aSig - zSig*zSig computed: zSig is decremented while the remainder is
 * negative, and bit 0 is set if any remainder is left. */
3979 if ( ( zSig & 0x1FF ) <= 5 ) {
3980 doubleZSig = zSig<<1;
3981 mul64To128( zSig, zSig, &term0, &term1 );
3982 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3983 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3984 --zSig;
3985 doubleZSig -= 2;
3986 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3987 }
3988 zSig |= ( ( rem0 | rem1 ) != 0 );
3989 }
3990 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3991
3992}
3993
374dfc33
AJ
3994/*----------------------------------------------------------------------------
3995| Returns the binary log of the double-precision floating-point value `a'.
3996| The operation is performed according to the IEC/IEEE Standard for Binary
3997| Floating-Point Arithmetic.
3998*----------------------------------------------------------------------------*/
3999float64 float64_log2( float64 a STATUS_PARAM )
4000{
4001 flag aSign, zSign;
94a49d86 4002 int_fast16_t aExp;
bb98fe42 4003 uint64_t aSig, aSig0, aSig1, zSig, i;
37d18660 4004 a = float64_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
4005
4006 aSig = extractFloat64Frac( a );
4007 aExp = extractFloat64Exp( a );
4008 aSign = extractFloat64Sign( a );
4009
4010 if ( aExp == 0 ) {
4011 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4012 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4013 }
4014 if ( aSign ) {
4015 float_raise( float_flag_invalid STATUS_VAR);
4016 return float64_default_nan;
4017 }
4018 if ( aExp == 0x7FF ) {
4019 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4020 return a;
4021 }
4022
4023 aExp -= 0x3FF;
4024 aSig |= LIT64( 0x0010000000000000 );
4025 zSign = aExp < 0;
bb98fe42 4026 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4027 for (i = 1LL << 51; i > 0; i >>= 1) {
4028 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4029 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4030 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4031 aSig >>= 1;
4032 zSig |= i;
4033 }
4034 }
4035
4036 if ( zSign )
4037 zSig = -zSig;
4038 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4039}
4040
158142c2
FB
4041/*----------------------------------------------------------------------------
4042| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4043| corresponding value `b', and 0 otherwise. The invalid exception is raised
4044| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4045| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4046*----------------------------------------------------------------------------*/
4047
b689362d 4048int float64_eq( float64 a, float64 b STATUS_PARAM )
158142c2 4049{
bb98fe42 4050 uint64_t av, bv;
37d18660
PM
4051 a = float64_squash_input_denormal(a STATUS_VAR);
4052 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4053
4054 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4055 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4056 ) {
b689362d 4057 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
4058 return 0;
4059 }
f090c9d4 4060 av = float64_val(a);
a1b91bb4 4061 bv = float64_val(b);
bb98fe42 4062 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4063
4064}
4065
4066/*----------------------------------------------------------------------------
4067| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4068| equal to the corresponding value `b', and 0 otherwise. The invalid
4069| exception is raised if either operand is a NaN. The comparison is performed
4070| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4071*----------------------------------------------------------------------------*/
4072
750afe93 4073int float64_le( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4074{
4075 flag aSign, bSign;
bb98fe42 4076 uint64_t av, bv;
37d18660
PM
4077 a = float64_squash_input_denormal(a STATUS_VAR);
4078 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4079
4080 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4081 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4082 ) {
4083 float_raise( float_flag_invalid STATUS_VAR);
4084 return 0;
4085 }
4086 aSign = extractFloat64Sign( a );
4087 bSign = extractFloat64Sign( b );
f090c9d4 4088 av = float64_val(a);
a1b91bb4 4089 bv = float64_val(b);
bb98fe42 4090 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4091 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4092
4093}
4094
4095/*----------------------------------------------------------------------------
4096| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4097| the corresponding value `b', and 0 otherwise. The invalid exception is
4098| raised if either operand is a NaN. The comparison is performed according
4099| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4100*----------------------------------------------------------------------------*/
4101
750afe93 4102int float64_lt( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4103{
4104 flag aSign, bSign;
bb98fe42 4105 uint64_t av, bv;
158142c2 4106
37d18660
PM
4107 a = float64_squash_input_denormal(a STATUS_VAR);
4108 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4109 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4110 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4111 ) {
4112 float_raise( float_flag_invalid STATUS_VAR);
4113 return 0;
4114 }
4115 aSign = extractFloat64Sign( a );
4116 bSign = extractFloat64Sign( b );
f090c9d4 4117 av = float64_val(a);
a1b91bb4 4118 bv = float64_val(b);
bb98fe42 4119 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4120 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4121
4122}
4123
67b7861d
AJ
4124/*----------------------------------------------------------------------------
4125| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4126| be compared, and 0 otherwise. The invalid exception is raised if either
4127| operand is a NaN. The comparison is performed according to the IEC/IEEE
4128| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4129*----------------------------------------------------------------------------*/
4130
4131int float64_unordered( float64 a, float64 b STATUS_PARAM )
4132{
4133 a = float64_squash_input_denormal(a STATUS_VAR);
4134 b = float64_squash_input_denormal(b STATUS_VAR);
4135
4136 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4137 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4138 ) {
4139 float_raise( float_flag_invalid STATUS_VAR);
4140 return 1;
4141 }
4142 return 0;
4143}
4144
158142c2
FB
4145/*----------------------------------------------------------------------------
4146| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4147| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4148| exception.The comparison is performed according to the IEC/IEEE Standard
4149| for Binary Floating-Point Arithmetic.
158142c2
FB
4150*----------------------------------------------------------------------------*/
4151
b689362d 4152int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
158142c2 4153{
bb98fe42 4154 uint64_t av, bv;
37d18660
PM
4155 a = float64_squash_input_denormal(a STATUS_VAR);
4156 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4157
4158 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4159 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4160 ) {
b689362d
AJ
4161 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4162 float_raise( float_flag_invalid STATUS_VAR);
4163 }
158142c2
FB
4164 return 0;
4165 }
f090c9d4 4166 av = float64_val(a);
a1b91bb4 4167 bv = float64_val(b);
bb98fe42 4168 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4169
4170}
4171
4172/*----------------------------------------------------------------------------
4173| Returns 1 if the double-precision floating-point value `a' is less than or
4174| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4175| cause an exception. Otherwise, the comparison is performed according to the
4176| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4177*----------------------------------------------------------------------------*/
4178
750afe93 4179int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4180{
4181 flag aSign, bSign;
bb98fe42 4182 uint64_t av, bv;
37d18660
PM
4183 a = float64_squash_input_denormal(a STATUS_VAR);
4184 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4185
4186 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4187 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4188 ) {
4189 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4190 float_raise( float_flag_invalid STATUS_VAR);
4191 }
4192 return 0;
4193 }
4194 aSign = extractFloat64Sign( a );
4195 bSign = extractFloat64Sign( b );
f090c9d4 4196 av = float64_val(a);
a1b91bb4 4197 bv = float64_val(b);
bb98fe42 4198 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4199 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4200
4201}
4202
4203/*----------------------------------------------------------------------------
4204| Returns 1 if the double-precision floating-point value `a' is less than
4205| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4206| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4207| Standard for Binary Floating-Point Arithmetic.
4208*----------------------------------------------------------------------------*/
4209
750afe93 4210int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4211{
4212 flag aSign, bSign;
bb98fe42 4213 uint64_t av, bv;
37d18660
PM
4214 a = float64_squash_input_denormal(a STATUS_VAR);
4215 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4216
4217 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4218 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4219 ) {
4220 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4221 float_raise( float_flag_invalid STATUS_VAR);
4222 }
4223 return 0;
4224 }
4225 aSign = extractFloat64Sign( a );
4226 bSign = extractFloat64Sign( b );
f090c9d4 4227 av = float64_val(a);
a1b91bb4 4228 bv = float64_val(b);
bb98fe42 4229 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4230 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4231
4232}
4233
67b7861d
AJ
4234/*----------------------------------------------------------------------------
4235| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4236| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4237| comparison is performed according to the IEC/IEEE Standard for Binary
4238| Floating-Point Arithmetic.
4239*----------------------------------------------------------------------------*/
4240
4241int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4242{
4243 a = float64_squash_input_denormal(a STATUS_VAR);
4244 b = float64_squash_input_denormal(b STATUS_VAR);
4245
4246 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4247 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4248 ) {
4249 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4250 float_raise( float_flag_invalid STATUS_VAR);
4251 }
4252 return 1;
4253 }
4254 return 0;
4255}
4256
158142c2
FB
4257/*----------------------------------------------------------------------------
4258| Returns the result of converting the extended double-precision floating-
4259| point value `a' to the 32-bit two's complement integer format. The
4260| conversion is performed according to the IEC/IEEE Standard for Binary
4261| Floating-Point Arithmetic---which means in particular that the conversion
4262| is rounded according to the current rounding mode. If `a' is a NaN, the
4263| largest positive integer is returned. Otherwise, if the conversion
4264| overflows, the largest integer with the same sign as `a' is returned.
4265*----------------------------------------------------------------------------*/
4266
4267int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4268{
4269 flag aSign;
4270 int32 aExp, shiftCount;
bb98fe42 4271 uint64_t aSig;
158142c2
FB
4272
4273 aSig = extractFloatx80Frac( a );
4274 aExp = extractFloatx80Exp( a );
4275 aSign = extractFloatx80Sign( a );
bb98fe42 4276 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4277 shiftCount = 0x4037 - aExp;
4278 if ( shiftCount <= 0 ) shiftCount = 1;
4279 shift64RightJamming( aSig, shiftCount, &aSig );
4280 return roundAndPackInt32( aSign, aSig STATUS_VAR );
4281
4282}
4283
4284/*----------------------------------------------------------------------------
4285| Returns the result of converting the extended double-precision floating-
4286| point value `a' to the 32-bit two's complement integer format. The
4287| conversion is performed according to the IEC/IEEE Standard for Binary
4288| Floating-Point Arithmetic, except that the conversion is always rounded
4289| toward zero. If `a' is a NaN, the largest positive integer is returned.
4290| Otherwise, if the conversion overflows, the largest integer with the same
4291| sign as `a' is returned.
4292*----------------------------------------------------------------------------*/
4293
4294int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4295{
4296 flag aSign;
4297 int32 aExp, shiftCount;
bb98fe42 4298 uint64_t aSig, savedASig;
b3a6a2e0 4299 int32_t z;
158142c2
FB
4300
4301 aSig = extractFloatx80Frac( a );
4302 aExp = extractFloatx80Exp( a );
4303 aSign = extractFloatx80Sign( a );
4304 if ( 0x401E < aExp ) {
bb98fe42 4305 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4306 goto invalid;
4307 }
4308 else if ( aExp < 0x3FFF ) {
4309 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4310 return 0;
4311 }
4312 shiftCount = 0x403E - aExp;
4313 savedASig = aSig;
4314 aSig >>= shiftCount;
4315 z = aSig;
4316 if ( aSign ) z = - z;
4317 if ( ( z < 0 ) ^ aSign ) {
4318 invalid:
4319 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 4320 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4321 }
4322 if ( ( aSig<<shiftCount ) != savedASig ) {
4323 STATUS(float_exception_flags) |= float_flag_inexact;
4324 }
4325 return z;
4326
4327}
4328
4329/*----------------------------------------------------------------------------
4330| Returns the result of converting the extended double-precision floating-
4331| point value `a' to the 64-bit two's complement integer format. The
4332| conversion is performed according to the IEC/IEEE Standard for Binary
4333| Floating-Point Arithmetic---which means in particular that the conversion
4334| is rounded according to the current rounding mode. If `a' is a NaN,
4335| the largest positive integer is returned. Otherwise, if the conversion
4336| overflows, the largest integer with the same sign as `a' is returned.
4337*----------------------------------------------------------------------------*/
4338
4339int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4340{
4341 flag aSign;
4342 int32 aExp, shiftCount;
bb98fe42 4343 uint64_t aSig, aSigExtra;
158142c2
FB
4344
4345 aSig = extractFloatx80Frac( a );
4346 aExp = extractFloatx80Exp( a );
4347 aSign = extractFloatx80Sign( a );
4348 shiftCount = 0x403E - aExp;
4349 if ( shiftCount <= 0 ) {
4350 if ( shiftCount ) {
4351 float_raise( float_flag_invalid STATUS_VAR);
4352 if ( ! aSign
4353 || ( ( aExp == 0x7FFF )
4354 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4355 ) {
4356 return LIT64( 0x7FFFFFFFFFFFFFFF );
4357 }
bb98fe42 4358 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4359 }
4360 aSigExtra = 0;
4361 }
4362 else {
4363 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4364 }
4365 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4366
4367}
4368
4369/*----------------------------------------------------------------------------
4370| Returns the result of converting the extended double-precision floating-
4371| point value `a' to the 64-bit two's complement integer format. The
4372| conversion is performed according to the IEC/IEEE Standard for Binary
4373| Floating-Point Arithmetic, except that the conversion is always rounded
4374| toward zero. If `a' is a NaN, the largest positive integer is returned.
4375| Otherwise, if the conversion overflows, the largest integer with the same
4376| sign as `a' is returned.
4377*----------------------------------------------------------------------------*/
4378
4379int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4380{
4381 flag aSign;
4382 int32 aExp, shiftCount;
bb98fe42 4383 uint64_t aSig;
158142c2
FB
4384 int64 z;
4385
4386 aSig = extractFloatx80Frac( a );
4387 aExp = extractFloatx80Exp( a );
4388 aSign = extractFloatx80Sign( a );
4389 shiftCount = aExp - 0x403E;
4390 if ( 0 <= shiftCount ) {
4391 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4392 if ( ( a.high != 0xC03E ) || aSig ) {
4393 float_raise( float_flag_invalid STATUS_VAR);
4394 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4395 return LIT64( 0x7FFFFFFFFFFFFFFF );
4396 }
4397 }
bb98fe42 4398 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4399 }
4400 else if ( aExp < 0x3FFF ) {
4401 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4402 return 0;
4403 }
4404 z = aSig>>( - shiftCount );
bb98fe42 4405 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
4406 STATUS(float_exception_flags) |= float_flag_inexact;
4407 }
4408 if ( aSign ) z = - z;
4409 return z;
4410
4411}
4412
4413/*----------------------------------------------------------------------------
4414| Returns the result of converting the extended double-precision floating-
4415| point value `a' to the single-precision floating-point format. The
4416| conversion is performed according to the IEC/IEEE Standard for Binary
4417| Floating-Point Arithmetic.
4418*----------------------------------------------------------------------------*/
4419
4420float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4421{
4422 flag aSign;
4423 int32 aExp;
bb98fe42 4424 uint64_t aSig;
158142c2
FB
4425
4426 aSig = extractFloatx80Frac( a );
4427 aExp = extractFloatx80Exp( a );
4428 aSign = extractFloatx80Sign( a );
4429 if ( aExp == 0x7FFF ) {
bb98fe42 4430 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4431 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4432 }
4433 return packFloat32( aSign, 0xFF, 0 );
4434 }
4435 shift64RightJamming( aSig, 33, &aSig );
4436 if ( aExp || aSig ) aExp -= 0x3F81;
4437 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4438
4439}
4440
4441/*----------------------------------------------------------------------------
4442| Returns the result of converting the extended double-precision floating-
4443| point value `a' to the double-precision floating-point format. The
4444| conversion is performed according to the IEC/IEEE Standard for Binary
4445| Floating-Point Arithmetic.
4446*----------------------------------------------------------------------------*/
4447
4448float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4449{
4450 flag aSign;
4451 int32 aExp;
bb98fe42 4452 uint64_t aSig, zSig;
158142c2
FB
4453
4454 aSig = extractFloatx80Frac( a );
4455 aExp = extractFloatx80Exp( a );
4456 aSign = extractFloatx80Sign( a );
4457 if ( aExp == 0x7FFF ) {
bb98fe42 4458 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4459 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4460 }
4461 return packFloat64( aSign, 0x7FF, 0 );
4462 }
4463 shift64RightJamming( aSig, 1, &zSig );
4464 if ( aExp || aSig ) aExp -= 0x3C01;
4465 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4466
4467}
4468
158142c2
FB
4469/*----------------------------------------------------------------------------
4470| Returns the result of converting the extended double-precision floating-
4471| point value `a' to the quadruple-precision floating-point format. The
4472| conversion is performed according to the IEC/IEEE Standard for Binary
4473| Floating-Point Arithmetic.
4474*----------------------------------------------------------------------------*/
4475
4476float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4477{
4478 flag aSign;
94a49d86 4479 int_fast16_t aExp;
bb98fe42 4480 uint64_t aSig, zSig0, zSig1;
158142c2
FB
4481
4482 aSig = extractFloatx80Frac( a );
4483 aExp = extractFloatx80Exp( a );
4484 aSign = extractFloatx80Sign( a );
bb98fe42 4485 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4486 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4487 }
4488 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4489 return packFloat128( aSign, aExp, zSig0, zSig1 );
4490
4491}
4492
158142c2
FB
4493/*----------------------------------------------------------------------------
4494| Rounds the extended double-precision floating-point value `a' to an integer,
4495| and returns the result as an extended quadruple-precision floating-point
4496| value. The operation is performed according to the IEC/IEEE Standard for
4497| Binary Floating-Point Arithmetic.
4498*----------------------------------------------------------------------------*/
4499
4500floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4501{
4502 flag aSign;
4503 int32 aExp;
bb98fe42 4504 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4505 int8 roundingMode;
4506 floatx80 z;
4507
4508 aExp = extractFloatx80Exp( a );
4509 if ( 0x403E <= aExp ) {
bb98fe42 4510 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
158142c2
FB
4511 return propagateFloatx80NaN( a, a STATUS_VAR );
4512 }
4513 return a;
4514 }
4515 if ( aExp < 0x3FFF ) {
4516 if ( ( aExp == 0 )
bb98fe42 4517 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4518 return a;
4519 }
4520 STATUS(float_exception_flags) |= float_flag_inexact;
4521 aSign = extractFloatx80Sign( a );
4522 switch ( STATUS(float_rounding_mode) ) {
4523 case float_round_nearest_even:
bb98fe42 4524 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4525 ) {
4526 return
4527 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4528 }
4529 break;
4530 case float_round_down:
4531 return
4532 aSign ?
4533 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4534 : packFloatx80( 0, 0, 0 );
4535 case float_round_up:
4536 return
4537 aSign ? packFloatx80( 1, 0, 0 )
4538 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4539 }
4540 return packFloatx80( aSign, 0, 0 );
4541 }
4542 lastBitMask = 1;
4543 lastBitMask <<= 0x403E - aExp;
4544 roundBitsMask = lastBitMask - 1;
4545 z = a;
4546 roundingMode = STATUS(float_rounding_mode);
4547 if ( roundingMode == float_round_nearest_even ) {
4548 z.low += lastBitMask>>1;
4549 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4550 }
4551 else if ( roundingMode != float_round_to_zero ) {
4552 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4553 z.low += roundBitsMask;
4554 }
4555 }
4556 z.low &= ~ roundBitsMask;
4557 if ( z.low == 0 ) {
4558 ++z.high;
4559 z.low = LIT64( 0x8000000000000000 );
4560 }
4561 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4562 return z;
4563
4564}
4565
4566/*----------------------------------------------------------------------------
4567| Returns the result of adding the absolute values of the extended double-
4568| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4569| negated before being returned. `zSign' is ignored if the result is a NaN.
4570| The addition is performed according to the IEC/IEEE Standard for Binary
4571| Floating-Point Arithmetic.
4572*----------------------------------------------------------------------------*/
4573
4574static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4575{
4576 int32 aExp, bExp, zExp;
bb98fe42 4577 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4578 int32 expDiff;
4579
4580 aSig = extractFloatx80Frac( a );
4581 aExp = extractFloatx80Exp( a );
4582 bSig = extractFloatx80Frac( b );
4583 bExp = extractFloatx80Exp( b );
4584 expDiff = aExp - bExp;
4585 if ( 0 < expDiff ) {
4586 if ( aExp == 0x7FFF ) {
bb98fe42 4587 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4588 return a;
4589 }
4590 if ( bExp == 0 ) --expDiff;
4591 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4592 zExp = aExp;
4593 }
4594 else if ( expDiff < 0 ) {
4595 if ( bExp == 0x7FFF ) {
bb98fe42 4596 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4597 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4598 }
4599 if ( aExp == 0 ) ++expDiff;
4600 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4601 zExp = bExp;
4602 }
4603 else {
4604 if ( aExp == 0x7FFF ) {
bb98fe42 4605 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4606 return propagateFloatx80NaN( a, b STATUS_VAR );
4607 }
4608 return a;
4609 }
4610 zSig1 = 0;
4611 zSig0 = aSig + bSig;
4612 if ( aExp == 0 ) {
4613 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4614 goto roundAndPack;
4615 }
4616 zExp = aExp;
4617 goto shiftRight1;
4618 }
4619 zSig0 = aSig + bSig;
bb98fe42 4620 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
4621 shiftRight1:
4622 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4623 zSig0 |= LIT64( 0x8000000000000000 );
4624 ++zExp;
4625 roundAndPack:
4626 return
4627 roundAndPackFloatx80(
4628 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4629
4630}
4631
4632/*----------------------------------------------------------------------------
4633| Returns the result of subtracting the absolute values of the extended
4634| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4635| difference is negated before being returned. `zSign' is ignored if the
4636| result is a NaN. The subtraction is performed according to the IEC/IEEE
4637| Standard for Binary Floating-Point Arithmetic.
4638*----------------------------------------------------------------------------*/
4639
4640static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4641{
4642 int32 aExp, bExp, zExp;
bb98fe42 4643 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4644 int32 expDiff;
4645 floatx80 z;
4646
4647 aSig = extractFloatx80Frac( a );
4648 aExp = extractFloatx80Exp( a );
4649 bSig = extractFloatx80Frac( b );
4650 bExp = extractFloatx80Exp( b );
4651 expDiff = aExp - bExp;
4652 if ( 0 < expDiff ) goto aExpBigger;
4653 if ( expDiff < 0 ) goto bExpBigger;
4654 if ( aExp == 0x7FFF ) {
bb98fe42 4655 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4656 return propagateFloatx80NaN( a, b STATUS_VAR );
4657 }
4658 float_raise( float_flag_invalid STATUS_VAR);
4659 z.low = floatx80_default_nan_low;
4660 z.high = floatx80_default_nan_high;
4661 return z;
4662 }
4663 if ( aExp == 0 ) {
4664 aExp = 1;
4665 bExp = 1;
4666 }
4667 zSig1 = 0;
4668 if ( bSig < aSig ) goto aBigger;
4669 if ( aSig < bSig ) goto bBigger;
4670 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4671 bExpBigger:
4672 if ( bExp == 0x7FFF ) {
bb98fe42 4673 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4674 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4675 }
4676 if ( aExp == 0 ) ++expDiff;
4677 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4678 bBigger:
4679 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4680 zExp = bExp;
4681 zSign ^= 1;
4682 goto normalizeRoundAndPack;
4683 aExpBigger:
4684 if ( aExp == 0x7FFF ) {
bb98fe42 4685 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4686 return a;
4687 }
4688 if ( bExp == 0 ) --expDiff;
4689 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4690 aBigger:
4691 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4692 zExp = aExp;
4693 normalizeRoundAndPack:
4694 return
4695 normalizeRoundAndPackFloatx80(
4696 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4697
4698}
4699
4700/*----------------------------------------------------------------------------
4701| Returns the result of adding the extended double-precision floating-point
4702| values `a' and `b'. The operation is performed according to the IEC/IEEE
4703| Standard for Binary Floating-Point Arithmetic.
4704*----------------------------------------------------------------------------*/
4705
4706floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4707{
4708 flag aSign, bSign;
4709
4710 aSign = extractFloatx80Sign( a );
4711 bSign = extractFloatx80Sign( b );
4712 if ( aSign == bSign ) {
4713 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4714 }
4715 else {
4716 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4717 }
4718
4719}
4720
4721/*----------------------------------------------------------------------------
4722| Returns the result of subtracting the extended double-precision floating-
4723| point values `a' and `b'. The operation is performed according to the
4724| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4725*----------------------------------------------------------------------------*/
4726
4727floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4728{
4729 flag aSign, bSign;
4730
4731 aSign = extractFloatx80Sign( a );
4732 bSign = extractFloatx80Sign( b );
4733 if ( aSign == bSign ) {
4734 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4735 }
4736 else {
4737 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4738 }
4739
4740}
4741
4742/*----------------------------------------------------------------------------
4743| Returns the result of multiplying the extended double-precision floating-
4744| point values `a' and `b'. The operation is performed according to the
4745| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4746*----------------------------------------------------------------------------*/
4747
4748floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4749{
4750 flag aSign, bSign, zSign;
4751 int32 aExp, bExp, zExp;
bb98fe42 4752 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4753 floatx80 z;
4754
4755 aSig = extractFloatx80Frac( a );
4756 aExp = extractFloatx80Exp( a );
4757 aSign = extractFloatx80Sign( a );
4758 bSig = extractFloatx80Frac( b );
4759 bExp = extractFloatx80Exp( b );
4760 bSign = extractFloatx80Sign( b );
4761 zSign = aSign ^ bSign;
4762 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4763 if ( (uint64_t) ( aSig<<1 )
4764 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4765 return propagateFloatx80NaN( a, b STATUS_VAR );
4766 }
4767 if ( ( bExp | bSig ) == 0 ) goto invalid;
4768 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4769 }
4770 if ( bExp == 0x7FFF ) {
bb98fe42 4771 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4772 if ( ( aExp | aSig ) == 0 ) {
4773 invalid:
4774 float_raise( float_flag_invalid STATUS_VAR);
4775 z.low = floatx80_default_nan_low;
4776 z.high = floatx80_default_nan_high;
4777 return z;
4778 }
4779 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4780 }
4781 if ( aExp == 0 ) {
4782 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4783 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4784 }
4785 if ( bExp == 0 ) {
4786 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4787 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4788 }
4789 zExp = aExp + bExp - 0x3FFE;
4790 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 4791 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
4792 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4793 --zExp;
4794 }
4795 return
4796 roundAndPackFloatx80(
4797 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4798
4799}
4800
4801/*----------------------------------------------------------------------------
4802| Returns the result of dividing the extended double-precision floating-point
4803| value `a' by the corresponding value `b'. The operation is performed
4804| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4805*----------------------------------------------------------------------------*/
4806
4807floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4808{
4809 flag aSign, bSign, zSign;
4810 int32 aExp, bExp, zExp;
bb98fe42
AF
4811 uint64_t aSig, bSig, zSig0, zSig1;
4812 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
4813 floatx80 z;
4814
4815 aSig = extractFloatx80Frac( a );
4816 aExp = extractFloatx80Exp( a );
4817 aSign = extractFloatx80Sign( a );
4818 bSig = extractFloatx80Frac( b );
4819 bExp = extractFloatx80Exp( b );
4820 bSign = extractFloatx80Sign( b );
4821 zSign = aSign ^ bSign;
4822 if ( aExp == 0x7FFF ) {
bb98fe42 4823 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2 4824 if ( bExp == 0x7FFF ) {
bb98fe42 4825 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4826 goto invalid;
4827 }
4828 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4829 }
4830 if ( bExp == 0x7FFF ) {
bb98fe42 4831 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4832 return packFloatx80( zSign, 0, 0 );
4833 }
4834 if ( bExp == 0 ) {
4835 if ( bSig == 0 ) {
4836 if ( ( aExp | aSig ) == 0 ) {
4837 invalid:
4838 float_raise( float_flag_invalid STATUS_VAR);
4839 z.low = floatx80_default_nan_low;
4840 z.high = floatx80_default_nan_high;
4841 return z;
4842 }
4843 float_raise( float_flag_divbyzero STATUS_VAR);
4844 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4845 }
4846 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4847 }
4848 if ( aExp == 0 ) {
4849 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4850 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4851 }
4852 zExp = aExp - bExp + 0x3FFE;
4853 rem1 = 0;
4854 if ( bSig <= aSig ) {
4855 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4856 ++zExp;
4857 }
4858 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4859 mul64To128( bSig, zSig0, &term0, &term1 );
4860 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 4861 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4862 --zSig0;
4863 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4864 }
4865 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 4866 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
4867 mul64To128( bSig, zSig1, &term1, &term2 );
4868 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 4869 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
4870 --zSig1;
4871 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4872 }
4873 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4874 }
4875 return
4876 roundAndPackFloatx80(
4877 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4878
4879}
4880
4881/*----------------------------------------------------------------------------
4882| Returns the remainder of the extended double-precision floating-point value
4883| `a' with respect to the corresponding value `b'. The operation is performed
4884| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4885*----------------------------------------------------------------------------*/
4886
4887floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4888{
ed086f3d 4889 flag aSign, zSign;
158142c2 4890 int32 aExp, bExp, expDiff;
bb98fe42
AF
4891 uint64_t aSig0, aSig1, bSig;
4892 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
4893 floatx80 z;
4894
4895 aSig0 = extractFloatx80Frac( a );
4896 aExp = extractFloatx80Exp( a );
4897 aSign = extractFloatx80Sign( a );
4898 bSig = extractFloatx80Frac( b );
4899 bExp = extractFloatx80Exp( b );
158142c2 4900 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4901 if ( (uint64_t) ( aSig0<<1 )
4902 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4903 return propagateFloatx80NaN( a, b STATUS_VAR );
4904 }
4905 goto invalid;
4906 }
4907 if ( bExp == 0x7FFF ) {
bb98fe42 4908 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4909 return a;
4910 }
4911 if ( bExp == 0 ) {
4912 if ( bSig == 0 ) {
4913 invalid:
4914 float_raise( float_flag_invalid STATUS_VAR);
4915 z.low = floatx80_default_nan_low;
4916 z.high = floatx80_default_nan_high;
4917 return z;
4918 }
4919 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4920 }
4921 if ( aExp == 0 ) {
bb98fe42 4922 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
4923 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4924 }
4925 bSig |= LIT64( 0x8000000000000000 );
4926 zSign = aSign;
4927 expDiff = aExp - bExp;
4928 aSig1 = 0;
4929 if ( expDiff < 0 ) {
4930 if ( expDiff < -1 ) return a;
4931 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4932 expDiff = 0;
4933 }
4934 q = ( bSig <= aSig0 );
4935 if ( q ) aSig0 -= bSig;
4936 expDiff -= 64;
4937 while ( 0 < expDiff ) {
4938 q = estimateDiv128To64( aSig0, aSig1, bSig );
4939 q = ( 2 < q ) ? q - 2 : 0;
4940 mul64To128( bSig, q, &term0, &term1 );
4941 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4942 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4943 expDiff -= 62;
4944 }
4945 expDiff += 64;
4946 if ( 0 < expDiff ) {
4947 q = estimateDiv128To64( aSig0, aSig1, bSig );
4948 q = ( 2 < q ) ? q - 2 : 0;
4949 q >>= 64 - expDiff;
4950 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4951 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4952 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4953 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4954 ++q;
4955 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4956 }
4957 }
4958 else {
4959 term1 = 0;
4960 term0 = bSig;
4961 }
4962 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4963 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4964 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4965 && ( q & 1 ) )
4966 ) {
4967 aSig0 = alternateASig0;
4968 aSig1 = alternateASig1;
4969 zSign = ! zSign;
4970 }
4971 return
4972 normalizeRoundAndPackFloatx80(
4973 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4974
4975}
4976
4977/*----------------------------------------------------------------------------
4978| Returns the square root of the extended double-precision floating-point
4979| value `a'. The operation is performed according to the IEC/IEEE Standard
4980| for Binary Floating-Point Arithmetic.
4981*----------------------------------------------------------------------------*/
4982
4983floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4984{
4985 flag aSign;
4986 int32 aExp, zExp;
bb98fe42
AF
4987 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4988 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
4989 floatx80 z;
4990
4991 aSig0 = extractFloatx80Frac( a );
4992 aExp = extractFloatx80Exp( a );
4993 aSign = extractFloatx80Sign( a );
4994 if ( aExp == 0x7FFF ) {
bb98fe42 4995 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
158142c2
FB
4996 if ( ! aSign ) return a;
4997 goto invalid;
4998 }
4999 if ( aSign ) {
5000 if ( ( aExp | aSig0 ) == 0 ) return a;
5001 invalid:
5002 float_raise( float_flag_invalid STATUS_VAR);
5003 z.low = floatx80_default_nan_low;
5004 z.high = floatx80_default_nan_high;
5005 return z;
5006 }
5007 if ( aExp == 0 ) {
5008 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5009 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5010 }
5011 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5012 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5013 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5014 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5015 doubleZSig0 = zSig0<<1;
5016 mul64To128( zSig0, zSig0, &term0, &term1 );
5017 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5018 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5019 --zSig0;
5020 doubleZSig0 -= 2;
5021 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5022 }
5023 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5024 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5025 if ( zSig1 == 0 ) zSig1 = 1;
5026 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5027 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5028 mul64To128( zSig1, zSig1, &term2, &term3 );
5029 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5030 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5031 --zSig1;
5032 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5033 term3 |= 1;
5034 term2 |= doubleZSig0;
5035 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5036 }
5037 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5038 }
5039 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5040 zSig0 |= doubleZSig0;
5041 return
5042 roundAndPackFloatx80(
5043 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5044
5045}
5046
5047/*----------------------------------------------------------------------------
b689362d
AJ
5048| Returns 1 if the extended double-precision floating-point value `a' is equal
5049| to the corresponding value `b', and 0 otherwise. The invalid exception is
5050| raised if either operand is a NaN. Otherwise, the comparison is performed
5051| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5052*----------------------------------------------------------------------------*/
5053
b689362d 5054int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5055{
5056
5057 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5058 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5059 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5060 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5061 ) {
b689362d 5062 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
5063 return 0;
5064 }
5065 return
5066 ( a.low == b.low )
5067 && ( ( a.high == b.high )
5068 || ( ( a.low == 0 )
bb98fe42 5069 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5070 );
5071
5072}
5073
5074/*----------------------------------------------------------------------------
5075| Returns 1 if the extended double-precision floating-point value `a' is
5076| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5077| invalid exception is raised if either operand is a NaN. The comparison is
5078| performed according to the IEC/IEEE Standard for Binary Floating-Point
5079| Arithmetic.
158142c2
FB
5080*----------------------------------------------------------------------------*/
5081
750afe93 5082int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5083{
5084 flag aSign, bSign;
5085
5086 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5087 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5088 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5089 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5090 ) {
5091 float_raise( float_flag_invalid STATUS_VAR);
5092 return 0;
5093 }
5094 aSign = extractFloatx80Sign( a );
5095 bSign = extractFloatx80Sign( b );
5096 if ( aSign != bSign ) {
5097 return
5098 aSign
bb98fe42 5099 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5100 == 0 );
5101 }
5102 return
5103 aSign ? le128( b.high, b.low, a.high, a.low )
5104 : le128( a.high, a.low, b.high, b.low );
5105
5106}
5107
5108/*----------------------------------------------------------------------------
5109| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5110| less than the corresponding value `b', and 0 otherwise. The invalid
5111| exception is raised if either operand is a NaN. The comparison is performed
5112| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5113*----------------------------------------------------------------------------*/
5114
750afe93 5115int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5116{
5117 flag aSign, bSign;
5118
5119 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5120 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5121 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5122 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5123 ) {
5124 float_raise( float_flag_invalid STATUS_VAR);
5125 return 0;
5126 }
5127 aSign = extractFloatx80Sign( a );
5128 bSign = extractFloatx80Sign( b );
5129 if ( aSign != bSign ) {
5130 return
5131 aSign
bb98fe42 5132 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5133 != 0 );
5134 }
5135 return
5136 aSign ? lt128( b.high, b.low, a.high, a.low )
5137 : lt128( a.high, a.low, b.high, b.low );
5138
5139}
5140
67b7861d
AJ
5141/*----------------------------------------------------------------------------
5142| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5143| cannot be compared, and 0 otherwise. The invalid exception is raised if
5144| either operand is a NaN. The comparison is performed according to the
5145| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
5146*----------------------------------------------------------------------------*/
5147int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5148{
5149 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5150 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5151 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5152 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5153 ) {
5154 float_raise( float_flag_invalid STATUS_VAR);
5155 return 1;
5156 }
5157 return 0;
5158}
5159
158142c2 5160/*----------------------------------------------------------------------------
b689362d 5161| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5162| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5163| cause an exception. The comparison is performed according to the IEC/IEEE
5164| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5165*----------------------------------------------------------------------------*/
5166
b689362d 5167int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5168{
5169
5170 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5171 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5172 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5173 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5174 ) {
b689362d
AJ
5175 if ( floatx80_is_signaling_nan( a )
5176 || floatx80_is_signaling_nan( b ) ) {
5177 float_raise( float_flag_invalid STATUS_VAR);
5178 }
158142c2
FB
5179 return 0;
5180 }
5181 return
5182 ( a.low == b.low )
5183 && ( ( a.high == b.high )
5184 || ( ( a.low == 0 )
bb98fe42 5185 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5186 );
5187
5188}
5189
5190/*----------------------------------------------------------------------------
5191| Returns 1 if the extended double-precision floating-point value `a' is less
5192| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5193| do not cause an exception. Otherwise, the comparison is performed according
5194| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5195*----------------------------------------------------------------------------*/
5196
750afe93 5197int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5198{
5199 flag aSign, bSign;
5200
5201 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5202 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5203 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5204 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5205 ) {
5206 if ( floatx80_is_signaling_nan( a )
5207 || floatx80_is_signaling_nan( b ) ) {
5208 float_raise( float_flag_invalid STATUS_VAR);
5209 }
5210 return 0;
5211 }
5212 aSign = extractFloatx80Sign( a );
5213 bSign = extractFloatx80Sign( b );
5214 if ( aSign != bSign ) {
5215 return
5216 aSign
bb98fe42 5217 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5218 == 0 );
5219 }
5220 return
5221 aSign ? le128( b.high, b.low, a.high, a.low )
5222 : le128( a.high, a.low, b.high, b.low );
5223
5224}
5225
5226/*----------------------------------------------------------------------------
5227| Returns 1 if the extended double-precision floating-point value `a' is less
5228| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5229| an exception. Otherwise, the comparison is performed according to the
5230| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5231*----------------------------------------------------------------------------*/
5232
750afe93 5233int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5234{
5235 flag aSign, bSign;
5236
5237 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5238 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5239 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5240 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5241 ) {
5242 if ( floatx80_is_signaling_nan( a )
5243 || floatx80_is_signaling_nan( b ) ) {
5244 float_raise( float_flag_invalid STATUS_VAR);
5245 }
5246 return 0;
5247 }
5248 aSign = extractFloatx80Sign( a );
5249 bSign = extractFloatx80Sign( b );
5250 if ( aSign != bSign ) {
5251 return
5252 aSign
bb98fe42 5253 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5254 != 0 );
5255 }
5256 return
5257 aSign ? lt128( b.high, b.low, a.high, a.low )
5258 : lt128( a.high, a.low, b.high, b.low );
5259
5260}
5261
67b7861d
AJ
5262/*----------------------------------------------------------------------------
5263| Returns 1 if the extended double-precision floating-point values `a' and `b'
5264| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5265| The comparison is performed according to the IEC/IEEE Standard for Binary
5266| Floating-Point Arithmetic.
5267*----------------------------------------------------------------------------*/
5268int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5269{
5270 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5271 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5272 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5273 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5274 ) {
5275 if ( floatx80_is_signaling_nan( a )
5276 || floatx80_is_signaling_nan( b ) ) {
5277 float_raise( float_flag_invalid STATUS_VAR);
5278 }
5279 return 1;
5280 }
5281 return 0;
5282}
5283
158142c2
FB
5284/*----------------------------------------------------------------------------
5285| Returns the result of converting the quadruple-precision floating-point
5286| value `a' to the 32-bit two's complement integer format. The conversion
5287| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5288| Arithmetic---which means in particular that the conversion is rounded
5289| according to the current rounding mode. If `a' is a NaN, the largest
5290| positive integer is returned. Otherwise, if the conversion overflows, the
5291| largest integer with the same sign as `a' is returned.
5292*----------------------------------------------------------------------------*/
5293
5294int32 float128_to_int32( float128 a STATUS_PARAM )
5295{
5296 flag aSign;
5297 int32 aExp, shiftCount;
bb98fe42 5298 uint64_t aSig0, aSig1;
158142c2
FB
5299
5300 aSig1 = extractFloat128Frac1( a );
5301 aSig0 = extractFloat128Frac0( a );
5302 aExp = extractFloat128Exp( a );
5303 aSign = extractFloat128Sign( a );
5304 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5305 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5306 aSig0 |= ( aSig1 != 0 );
5307 shiftCount = 0x4028 - aExp;
5308 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5309 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5310
5311}
5312
5313/*----------------------------------------------------------------------------
5314| Returns the result of converting the quadruple-precision floating-point
5315| value `a' to the 32-bit two's complement integer format. The conversion
5316| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5317| Arithmetic, except that the conversion is always rounded toward zero. If
5318| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5319| conversion overflows, the largest integer with the same sign as `a' is
5320| returned.
5321*----------------------------------------------------------------------------*/
5322
5323int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5324{
5325 flag aSign;
5326 int32 aExp, shiftCount;
bb98fe42 5327 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5328 int32_t z;
158142c2
FB
5329
5330 aSig1 = extractFloat128Frac1( a );
5331 aSig0 = extractFloat128Frac0( a );
5332 aExp = extractFloat128Exp( a );
5333 aSign = extractFloat128Sign( a );
5334 aSig0 |= ( aSig1 != 0 );
5335 if ( 0x401E < aExp ) {
5336 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5337 goto invalid;
5338 }
5339 else if ( aExp < 0x3FFF ) {
5340 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5341 return 0;
5342 }
5343 aSig0 |= LIT64( 0x0001000000000000 );
5344 shiftCount = 0x402F - aExp;
5345 savedASig = aSig0;
5346 aSig0 >>= shiftCount;
5347 z = aSig0;
5348 if ( aSign ) z = - z;
5349 if ( ( z < 0 ) ^ aSign ) {
5350 invalid:
5351 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 5352 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5353 }
5354 if ( ( aSig0<<shiftCount ) != savedASig ) {
5355 STATUS(float_exception_flags) |= float_flag_inexact;
5356 }
5357 return z;
5358
5359}
5360
5361/*----------------------------------------------------------------------------
5362| Returns the result of converting the quadruple-precision floating-point
5363| value `a' to the 64-bit two's complement integer format. The conversion
5364| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5365| Arithmetic---which means in particular that the conversion is rounded
5366| according to the current rounding mode. If `a' is a NaN, the largest
5367| positive integer is returned. Otherwise, if the conversion overflows, the
5368| largest integer with the same sign as `a' is returned.
5369*----------------------------------------------------------------------------*/
5370
5371int64 float128_to_int64( float128 a STATUS_PARAM )
5372{
5373 flag aSign;
5374 int32 aExp, shiftCount;
bb98fe42 5375 uint64_t aSig0, aSig1;
158142c2
FB
5376
5377 aSig1 = extractFloat128Frac1( a );
5378 aSig0 = extractFloat128Frac0( a );
5379 aExp = extractFloat128Exp( a );
5380 aSign = extractFloat128Sign( a );
5381 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5382 shiftCount = 0x402F - aExp;
5383 if ( shiftCount <= 0 ) {
5384 if ( 0x403E < aExp ) {
5385 float_raise( float_flag_invalid STATUS_VAR);
5386 if ( ! aSign
5387 || ( ( aExp == 0x7FFF )
5388 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5389 )
5390 ) {
5391 return LIT64( 0x7FFFFFFFFFFFFFFF );
5392 }
bb98fe42 5393 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5394 }
5395 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5396 }
5397 else {
5398 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5399 }
5400 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5401
5402}
5403
5404/*----------------------------------------------------------------------------
5405| Returns the result of converting the quadruple-precision floating-point
5406| value `a' to the 64-bit two's complement integer format. The conversion
5407| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5408| Arithmetic, except that the conversion is always rounded toward zero.
5409| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5410| the conversion overflows, the largest integer with the same sign as `a' is
5411| returned.
5412*----------------------------------------------------------------------------*/
5413
5414int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5415{
5416 flag aSign;
5417 int32 aExp, shiftCount;
bb98fe42 5418 uint64_t aSig0, aSig1;
158142c2
FB
5419 int64 z;
5420
5421 aSig1 = extractFloat128Frac1( a );
5422 aSig0 = extractFloat128Frac0( a );
5423 aExp = extractFloat128Exp( a );
5424 aSign = extractFloat128Sign( a );
5425 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5426 shiftCount = aExp - 0x402F;
5427 if ( 0 < shiftCount ) {
5428 if ( 0x403E <= aExp ) {
5429 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5430 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5431 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5432 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5433 }
5434 else {
5435 float_raise( float_flag_invalid STATUS_VAR);
5436 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5437 return LIT64( 0x7FFFFFFFFFFFFFFF );
5438 }
5439 }
bb98fe42 5440 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5441 }
5442 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 5443 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
158142c2
FB
5444 STATUS(float_exception_flags) |= float_flag_inexact;
5445 }
5446 }
5447 else {
5448 if ( aExp < 0x3FFF ) {
5449 if ( aExp | aSig0 | aSig1 ) {
5450 STATUS(float_exception_flags) |= float_flag_inexact;
5451 }
5452 return 0;
5453 }
5454 z = aSig0>>( - shiftCount );
5455 if ( aSig1
bb98fe42 5456 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
158142c2
FB
5457 STATUS(float_exception_flags) |= float_flag_inexact;
5458 }
5459 }
5460 if ( aSign ) z = - z;
5461 return z;
5462
5463}
5464
5465/*----------------------------------------------------------------------------
5466| Returns the result of converting the quadruple-precision floating-point
5467| value `a' to the single-precision floating-point format. The conversion
5468| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5469| Arithmetic.
5470*----------------------------------------------------------------------------*/
5471
5472float32 float128_to_float32( float128 a STATUS_PARAM )
5473{
5474 flag aSign;
5475 int32 aExp;
bb98fe42
AF
5476 uint64_t aSig0, aSig1;
5477 uint32_t zSig;
158142c2
FB
5478
5479 aSig1 = extractFloat128Frac1( a );
5480 aSig0 = extractFloat128Frac0( a );
5481 aExp = extractFloat128Exp( a );
5482 aSign = extractFloat128Sign( a );
5483 if ( aExp == 0x7FFF ) {
5484 if ( aSig0 | aSig1 ) {
bcd4d9af 5485 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5486 }
5487 return packFloat32( aSign, 0xFF, 0 );
5488 }
5489 aSig0 |= ( aSig1 != 0 );
5490 shift64RightJamming( aSig0, 18, &aSig0 );
5491 zSig = aSig0;
5492 if ( aExp || zSig ) {
5493 zSig |= 0x40000000;
5494 aExp -= 0x3F81;
5495 }
5496 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5497
5498}
5499
5500/*----------------------------------------------------------------------------
5501| Returns the result of converting the quadruple-precision floating-point
5502| value `a' to the double-precision floating-point format. The conversion
5503| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5504| Arithmetic.
5505*----------------------------------------------------------------------------*/
5506
5507float64 float128_to_float64( float128 a STATUS_PARAM )
5508{
5509 flag aSign;
5510 int32 aExp;
bb98fe42 5511 uint64_t aSig0, aSig1;
158142c2
FB
5512
5513 aSig1 = extractFloat128Frac1( a );
5514 aSig0 = extractFloat128Frac0( a );
5515 aExp = extractFloat128Exp( a );
5516 aSign = extractFloat128Sign( a );
5517 if ( aExp == 0x7FFF ) {
5518 if ( aSig0 | aSig1 ) {
bcd4d9af 5519 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5520 }
5521 return packFloat64( aSign, 0x7FF, 0 );
5522 }
5523 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5524 aSig0 |= ( aSig1 != 0 );
5525 if ( aExp || aSig0 ) {
5526 aSig0 |= LIT64( 0x4000000000000000 );
5527 aExp -= 0x3C01;
5528 }
5529 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5530
5531}
5532
158142c2
FB
5533/*----------------------------------------------------------------------------
5534| Returns the result of converting the quadruple-precision floating-point
5535| value `a' to the extended double-precision floating-point format. The
5536| conversion is performed according to the IEC/IEEE Standard for Binary
5537| Floating-Point Arithmetic.
5538*----------------------------------------------------------------------------*/
5539
5540floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5541{
5542 flag aSign;
5543 int32 aExp;
bb98fe42 5544 uint64_t aSig0, aSig1;
158142c2
FB
5545
5546 aSig1 = extractFloat128Frac1( a );
5547 aSig0 = extractFloat128Frac0( a );
5548 aExp = extractFloat128Exp( a );
5549 aSign = extractFloat128Sign( a );
5550 if ( aExp == 0x7FFF ) {
5551 if ( aSig0 | aSig1 ) {
bcd4d9af 5552 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5553 }
5554 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5555 }
5556 if ( aExp == 0 ) {
5557 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5558 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5559 }
5560 else {
5561 aSig0 |= LIT64( 0x0001000000000000 );
5562 }
5563 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5564 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5565
5566}
5567
158142c2
FB
5568/*----------------------------------------------------------------------------
5569| Rounds the quadruple-precision floating-point value `a' to an integer, and
5570| returns the result as a quadruple-precision floating-point value. The
5571| operation is performed according to the IEC/IEEE Standard for Binary
5572| Floating-Point Arithmetic.
5573*----------------------------------------------------------------------------*/
5574
5575float128 float128_round_to_int( float128 a STATUS_PARAM )
5576{
5577 flag aSign;
5578 int32 aExp;
bb98fe42 5579 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5580 int8 roundingMode;
5581 float128 z;
5582
5583 aExp = extractFloat128Exp( a );
5584 if ( 0x402F <= aExp ) {
5585 if ( 0x406F <= aExp ) {
5586 if ( ( aExp == 0x7FFF )
5587 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5588 ) {
5589 return propagateFloat128NaN( a, a STATUS_VAR );
5590 }
5591 return a;
5592 }
5593 lastBitMask = 1;
5594 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5595 roundBitsMask = lastBitMask - 1;
5596 z = a;
5597 roundingMode = STATUS(float_rounding_mode);
5598 if ( roundingMode == float_round_nearest_even ) {
5599 if ( lastBitMask ) {
5600 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5601 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5602 }
5603 else {
bb98fe42 5604 if ( (int64_t) z.low < 0 ) {
158142c2 5605 ++z.high;
bb98fe42 5606 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
5607 }
5608 }
5609 }
5610 else if ( roundingMode != float_round_to_zero ) {
5611 if ( extractFloat128Sign( z )
5612 ^ ( roundingMode == float_round_up ) ) {
5613 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5614 }
5615 }
5616 z.low &= ~ roundBitsMask;
5617 }
5618 else {
5619 if ( aExp < 0x3FFF ) {
bb98fe42 5620 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
158142c2
FB
5621 STATUS(float_exception_flags) |= float_flag_inexact;
5622 aSign = extractFloat128Sign( a );
5623 switch ( STATUS(float_rounding_mode) ) {
5624 case float_round_nearest_even:
5625 if ( ( aExp == 0x3FFE )
5626 && ( extractFloat128Frac0( a )
5627 | extractFloat128Frac1( a ) )
5628 ) {
5629 return packFloat128( aSign, 0x3FFF, 0, 0 );
5630 }
5631 break;
5632 case float_round_down:
5633 return
5634 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5635 : packFloat128( 0, 0, 0, 0 );
5636 case float_round_up:
5637 return
5638 aSign ? packFloat128( 1, 0, 0, 0 )
5639 : packFloat128( 0, 0x3FFF, 0, 0 );
5640 }
5641 return packFloat128( aSign, 0, 0, 0 );
5642 }
5643 lastBitMask = 1;
5644 lastBitMask <<= 0x402F - aExp;
5645 roundBitsMask = lastBitMask - 1;
5646 z.low = 0;
5647 z.high = a.high;
5648 roundingMode = STATUS(float_rounding_mode);
5649 if ( roundingMode == float_round_nearest_even ) {
5650 z.high += lastBitMask>>1;
5651 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5652 z.high &= ~ lastBitMask;
5653 }
5654 }
5655 else if ( roundingMode != float_round_to_zero ) {
5656 if ( extractFloat128Sign( z )
5657 ^ ( roundingMode == float_round_up ) ) {
5658 z.high |= ( a.low != 0 );
5659 z.high += roundBitsMask;
5660 }
5661 }
5662 z.high &= ~ roundBitsMask;
5663 }
5664 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5665 STATUS(float_exception_flags) |= float_flag_inexact;
5666 }
5667 return z;
5668
5669}
5670
5671/*----------------------------------------------------------------------------
5672| Returns the result of adding the absolute values of the quadruple-precision
5673| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5674| before being returned. `zSign' is ignored if the result is a NaN.
5675| The addition is performed according to the IEC/IEEE Standard for Binary
5676| Floating-Point Arithmetic.
5677*----------------------------------------------------------------------------*/
5678
5679static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5680{
5681 int32 aExp, bExp, zExp;
bb98fe42 5682 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
158142c2
FB
5683 int32 expDiff;
5684
5685 aSig1 = extractFloat128Frac1( a );
5686 aSig0 = extractFloat128Frac0( a );
5687 aExp = extractFloat128Exp( a );
5688 bSig1 = extractFloat128Frac1( b );
5689 bSig0 = extractFloat128Frac0( b );
5690 bExp = extractFloat128Exp( b );
5691 expDiff = aExp - bExp;
5692 if ( 0 < expDiff ) {
5693 if ( aExp == 0x7FFF ) {
5694 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5695 return a;
5696 }
5697 if ( bExp == 0 ) {
5698 --expDiff;
5699 }
5700 else {
5701 bSig0 |= LIT64( 0x0001000000000000 );
5702 }
5703 shift128ExtraRightJamming(
5704 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5705 zExp = aExp;
5706 }
5707 else if ( expDiff < 0 ) {
5708 if ( bExp == 0x7FFF ) {
5709 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5710 return packFloat128( zSign, 0x7FFF, 0, 0 );
5711 }
5712 if ( aExp == 0 ) {
5713 ++expDiff;
5714 }
5715 else {
5716 aSig0 |= LIT64( 0x0001000000000000 );
5717 }
5718 shift128ExtraRightJamming(
5719 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5720 zExp = bExp;
5721 }
5722 else {
5723 if ( aExp == 0x7FFF ) {
5724 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5725 return propagateFloat128NaN( a, b STATUS_VAR );
5726 }
5727 return a;
5728 }
5729 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 5730 if ( aExp == 0 ) {
e6afc87f
PM
5731 if (STATUS(flush_to_zero)) {
5732 if (zSig0 | zSig1) {
5733 float_raise(float_flag_output_denormal STATUS_VAR);
5734 }
5735 return packFloat128(zSign, 0, 0, 0);
5736 }
fe76d976
PB
5737 return packFloat128( zSign, 0, zSig0, zSig1 );
5738 }
158142c2
FB
5739 zSig2 = 0;
5740 zSig0 |= LIT64( 0x0002000000000000 );
5741 zExp = aExp;
5742 goto shiftRight1;
5743 }
5744 aSig0 |= LIT64( 0x0001000000000000 );
5745 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5746 --zExp;
5747 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5748 ++zExp;
5749 shiftRight1:
5750 shift128ExtraRightJamming(
5751 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5752 roundAndPack:
5753 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5754
5755}
5756
5757/*----------------------------------------------------------------------------
5758| Returns the result of subtracting the absolute values of the quadruple-
5759| precision floating-point values `a' and `b'. If `zSign' is 1, the
5760| difference is negated before being returned. `zSign' is ignored if the
5761| result is a NaN. The subtraction is performed according to the IEC/IEEE
5762| Standard for Binary Floating-Point Arithmetic.
5763*----------------------------------------------------------------------------*/
5764
5765static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5766{
5767 int32 aExp, bExp, zExp;
bb98fe42 5768 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
158142c2
FB
5769 int32 expDiff;
5770 float128 z;
5771
5772 aSig1 = extractFloat128Frac1( a );
5773 aSig0 = extractFloat128Frac0( a );
5774 aExp = extractFloat128Exp( a );
5775 bSig1 = extractFloat128Frac1( b );
5776 bSig0 = extractFloat128Frac0( b );
5777 bExp = extractFloat128Exp( b );
5778 expDiff = aExp - bExp;
5779 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5780 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5781 if ( 0 < expDiff ) goto aExpBigger;
5782 if ( expDiff < 0 ) goto bExpBigger;
5783 if ( aExp == 0x7FFF ) {
5784 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5785 return propagateFloat128NaN( a, b STATUS_VAR );
5786 }
5787 float_raise( float_flag_invalid STATUS_VAR);
5788 z.low = float128_default_nan_low;
5789 z.high = float128_default_nan_high;
5790 return z;
5791 }
5792 if ( aExp == 0 ) {
5793 aExp = 1;
5794 bExp = 1;
5795 }
5796 if ( bSig0 < aSig0 ) goto aBigger;
5797 if ( aSig0 < bSig0 ) goto bBigger;
5798 if ( bSig1 < aSig1 ) goto aBigger;
5799 if ( aSig1 < bSig1 ) goto bBigger;
5800 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5801 bExpBigger:
5802 if ( bExp == 0x7FFF ) {
5803 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5804 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5805 }
5806 if ( aExp == 0 ) {
5807 ++expDiff;
5808 }
5809 else {
5810 aSig0 |= LIT64( 0x4000000000000000 );
5811 }
5812 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5813 bSig0 |= LIT64( 0x4000000000000000 );
5814 bBigger:
5815 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5816 zExp = bExp;
5817 zSign ^= 1;
5818 goto normalizeRoundAndPack;
5819 aExpBigger:
5820 if ( aExp == 0x7FFF ) {
5821 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5822 return a;
5823 }
5824 if ( bExp == 0 ) {
5825 --expDiff;
5826 }
5827 else {
5828 bSig0 |= LIT64( 0x4000000000000000 );
5829 }
5830 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5831 aSig0 |= LIT64( 0x4000000000000000 );
5832 aBigger:
5833 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5834 zExp = aExp;
5835 normalizeRoundAndPack:
5836 --zExp;
5837 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5838
5839}
5840
5841/*----------------------------------------------------------------------------
5842| Returns the result of adding the quadruple-precision floating-point values
5843| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5844| for Binary Floating-Point Arithmetic.
5845*----------------------------------------------------------------------------*/
5846
5847float128 float128_add( float128 a, float128 b STATUS_PARAM )
5848{
5849 flag aSign, bSign;
5850
5851 aSign = extractFloat128Sign( a );
5852 bSign = extractFloat128Sign( b );
5853 if ( aSign == bSign ) {
5854 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5855 }
5856 else {
5857 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5858 }
5859
5860}
5861
5862/*----------------------------------------------------------------------------
5863| Returns the result of subtracting the quadruple-precision floating-point
5864| values `a' and `b'. The operation is performed according to the IEC/IEEE
5865| Standard for Binary Floating-Point Arithmetic.
5866*----------------------------------------------------------------------------*/
5867
5868float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5869{
5870 flag aSign, bSign;
5871
5872 aSign = extractFloat128Sign( a );
5873 bSign = extractFloat128Sign( b );
5874 if ( aSign == bSign ) {
5875 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5876 }
5877 else {
5878 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5879 }
5880
5881}
5882
5883/*----------------------------------------------------------------------------
5884| Returns the result of multiplying the quadruple-precision floating-point
5885| values `a' and `b'. The operation is performed according to the IEC/IEEE
5886| Standard for Binary Floating-Point Arithmetic.
5887*----------------------------------------------------------------------------*/
5888
5889float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5890{
5891 flag aSign, bSign, zSign;
5892 int32 aExp, bExp, zExp;
bb98fe42 5893 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
5894 float128 z;
5895
5896 aSig1 = extractFloat128Frac1( a );
5897 aSig0 = extractFloat128Frac0( a );
5898 aExp = extractFloat128Exp( a );
5899 aSign = extractFloat128Sign( a );
5900 bSig1 = extractFloat128Frac1( b );
5901 bSig0 = extractFloat128Frac0( b );
5902 bExp = extractFloat128Exp( b );
5903 bSign = extractFloat128Sign( b );
5904 zSign = aSign ^ bSign;
5905 if ( aExp == 0x7FFF ) {
5906 if ( ( aSig0 | aSig1 )
5907 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5908 return propagateFloat128NaN( a, b STATUS_VAR );
5909 }
5910 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5911 return packFloat128( zSign, 0x7FFF, 0, 0 );
5912 }
5913 if ( bExp == 0x7FFF ) {
5914 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5915 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5916 invalid:
5917 float_raise( float_flag_invalid STATUS_VAR);
5918 z.low = float128_default_nan_low;
5919 z.high = float128_default_nan_high;
5920 return z;
5921 }
5922 return packFloat128( zSign, 0x7FFF, 0, 0 );
5923 }
5924 if ( aExp == 0 ) {
5925 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5926 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5927 }
5928 if ( bExp == 0 ) {
5929 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5930 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5931 }
5932 zExp = aExp + bExp - 0x4000;
5933 aSig0 |= LIT64( 0x0001000000000000 );
5934 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5935 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5936 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5937 zSig2 |= ( zSig3 != 0 );
5938 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5939 shift128ExtraRightJamming(
5940 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5941 ++zExp;
5942 }
5943 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5944
5945}
5946
5947/*----------------------------------------------------------------------------
5948| Returns the result of dividing the quadruple-precision floating-point value
5949| `a' by the corresponding value `b'. The operation is performed according to
5950| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5951*----------------------------------------------------------------------------*/
5952
5953float128 float128_div( float128 a, float128 b STATUS_PARAM )
5954{
5955 flag aSign, bSign, zSign;
5956 int32 aExp, bExp, zExp;
bb98fe42
AF
5957 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5958 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5959 float128 z;
5960
5961 aSig1 = extractFloat128Frac1( a );
5962 aSig0 = extractFloat128Frac0( a );
5963 aExp = extractFloat128Exp( a );
5964 aSign = extractFloat128Sign( a );
5965 bSig1 = extractFloat128Frac1( b );
5966 bSig0 = extractFloat128Frac0( b );
5967 bExp = extractFloat128Exp( b );
5968 bSign = extractFloat128Sign( b );
5969 zSign = aSign ^ bSign;
5970 if ( aExp == 0x7FFF ) {
5971 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5972 if ( bExp == 0x7FFF ) {
5973 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5974 goto invalid;
5975 }
5976 return packFloat128( zSign, 0x7FFF, 0, 0 );
5977 }
5978 if ( bExp == 0x7FFF ) {
5979 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5980 return packFloat128( zSign, 0, 0, 0 );
5981 }
5982 if ( bExp == 0 ) {
5983 if ( ( bSig0 | bSig1 ) == 0 ) {
5984 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5985 invalid:
5986 float_raise( float_flag_invalid STATUS_VAR);
5987 z.low = float128_default_nan_low;
5988 z.high = float128_default_nan_high;
5989 return z;
5990 }
5991 float_raise( float_flag_divbyzero STATUS_VAR);
5992 return packFloat128( zSign, 0x7FFF, 0, 0 );
5993 }
5994 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5995 }
5996 if ( aExp == 0 ) {
5997 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5998 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5999 }
6000 zExp = aExp - bExp + 0x3FFD;
6001 shortShift128Left(
6002 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6003 shortShift128Left(
6004 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6005 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6006 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6007 ++zExp;
6008 }
6009 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6010 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6011 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6012 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6013 --zSig0;
6014 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6015 }
6016 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6017 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6018 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6019 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6020 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6021 --zSig1;
6022 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6023 }
6024 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6025 }
6026 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6027 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6028
6029}
6030
6031/*----------------------------------------------------------------------------
6032| Returns the remainder of the quadruple-precision floating-point value `a'
6033| with respect to the corresponding value `b'. The operation is performed
6034| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6035*----------------------------------------------------------------------------*/
6036
6037float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6038{
ed086f3d 6039 flag aSign, zSign;
158142c2 6040 int32 aExp, bExp, expDiff;
bb98fe42
AF
6041 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6042 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6043 int64_t sigMean0;
158142c2
FB
6044 float128 z;
6045
6046 aSig1 = extractFloat128Frac1( a );
6047 aSig0 = extractFloat128Frac0( a );
6048 aExp = extractFloat128Exp( a );
6049 aSign = extractFloat128Sign( a );
6050 bSig1 = extractFloat128Frac1( b );
6051 bSig0 = extractFloat128Frac0( b );
6052 bExp = extractFloat128Exp( b );
158142c2
FB
6053 if ( aExp == 0x7FFF ) {
6054 if ( ( aSig0 | aSig1 )
6055 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6056 return propagateFloat128NaN( a, b STATUS_VAR );
6057 }
6058 goto invalid;
6059 }
6060 if ( bExp == 0x7FFF ) {
6061 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6062 return a;
6063 }
6064 if ( bExp == 0 ) {
6065 if ( ( bSig0 | bSig1 ) == 0 ) {
6066 invalid:
6067 float_raise( float_flag_invalid STATUS_VAR);
6068 z.low = float128_default_nan_low;
6069 z.high = float128_default_nan_high;
6070 return z;
6071 }
6072 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6073 }
6074 if ( aExp == 0 ) {
6075 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6076 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6077 }
6078 expDiff = aExp - bExp;
6079 if ( expDiff < -1 ) return a;
6080 shortShift128Left(
6081 aSig0 | LIT64( 0x0001000000000000 ),
6082 aSig1,
6083 15 - ( expDiff < 0 ),
6084 &aSig0,
6085 &aSig1
6086 );
6087 shortShift128Left(
6088 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6089 q = le128( bSig0, bSig1, aSig0, aSig1 );
6090 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6091 expDiff -= 64;
6092 while ( 0 < expDiff ) {
6093 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6094 q = ( 4 < q ) ? q - 4 : 0;
6095 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6096 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6097 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6098 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6099 expDiff -= 61;
6100 }
6101 if ( -64 < expDiff ) {
6102 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6103 q = ( 4 < q ) ? q - 4 : 0;
6104 q >>= - expDiff;
6105 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6106 expDiff += 52;
6107 if ( expDiff < 0 ) {
6108 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6109 }
6110 else {
6111 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6112 }
6113 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6114 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6115 }
6116 else {
6117 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6118 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6119 }
6120 do {
6121 alternateASig0 = aSig0;
6122 alternateASig1 = aSig1;
6123 ++q;
6124 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6125 } while ( 0 <= (int64_t) aSig0 );
158142c2 6126 add128(
bb98fe42 6127 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6128 if ( ( sigMean0 < 0 )
6129 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6130 aSig0 = alternateASig0;
6131 aSig1 = alternateASig1;
6132 }
bb98fe42 6133 zSign = ( (int64_t) aSig0 < 0 );
158142c2
FB
6134 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6135 return
6136 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6137
6138}
6139
6140/*----------------------------------------------------------------------------
6141| Returns the square root of the quadruple-precision floating-point value `a'.
6142| The operation is performed according to the IEC/IEEE Standard for Binary
6143| Floating-Point Arithmetic.
6144*----------------------------------------------------------------------------*/
6145
6146float128 float128_sqrt( float128 a STATUS_PARAM )
6147{
6148 flag aSign;
6149 int32 aExp, zExp;
bb98fe42
AF
6150 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6151 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6152 float128 z;
6153
6154 aSig1 = extractFloat128Frac1( a );
6155 aSig0 = extractFloat128Frac0( a );
6156 aExp = extractFloat128Exp( a );
6157 aSign = extractFloat128Sign( a );
6158 if ( aExp == 0x7FFF ) {
6159 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6160 if ( ! aSign ) return a;
6161 goto invalid;
6162 }
6163 if ( aSign ) {
6164 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6165 invalid:
6166 float_raise( float_flag_invalid STATUS_VAR);
6167 z.low = float128_default_nan_low;
6168 z.high = float128_default_nan_high;
6169 return z;
6170 }
6171 if ( aExp == 0 ) {
6172 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6173 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6174 }
6175 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6176 aSig0 |= LIT64( 0x0001000000000000 );
6177 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6178 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6179 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6180 doubleZSig0 = zSig0<<1;
6181 mul64To128( zSig0, zSig0, &term0, &term1 );
6182 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6183 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6184 --zSig0;
6185 doubleZSig0 -= 2;
6186 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6187 }
6188 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6189 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6190 if ( zSig1 == 0 ) zSig1 = 1;
6191 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6192 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6193 mul64To128( zSig1, zSig1, &term2, &term3 );
6194 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6195 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6196 --zSig1;
6197 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6198 term3 |= 1;
6199 term2 |= doubleZSig0;
6200 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6201 }
6202 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6203 }
6204 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6205 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6206
6207}
6208
6209/*----------------------------------------------------------------------------
6210| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6211| the corresponding value `b', and 0 otherwise. The invalid exception is
6212| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6213| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6214*----------------------------------------------------------------------------*/
6215
b689362d 6216int float128_eq( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6217{
6218
6219 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6220 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6221 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6222 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6223 ) {
b689362d 6224 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
6225 return 0;
6226 }
6227 return
6228 ( a.low == b.low )
6229 && ( ( a.high == b.high )
6230 || ( ( a.low == 0 )
bb98fe42 6231 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6232 );
6233
6234}
6235
6236/*----------------------------------------------------------------------------
6237| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6238| or equal to the corresponding value `b', and 0 otherwise. The invalid
6239| exception is raised if either operand is a NaN. The comparison is performed
6240| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6241*----------------------------------------------------------------------------*/
6242
750afe93 6243int float128_le( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6244{
6245 flag aSign, bSign;
6246
6247 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6248 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6249 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6250 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6251 ) {
6252 float_raise( float_flag_invalid STATUS_VAR);
6253 return 0;
6254 }
6255 aSign = extractFloat128Sign( a );
6256 bSign = extractFloat128Sign( b );
6257 if ( aSign != bSign ) {
6258 return
6259 aSign
bb98fe42 6260 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6261 == 0 );
6262 }
6263 return
6264 aSign ? le128( b.high, b.low, a.high, a.low )
6265 : le128( a.high, a.low, b.high, b.low );
6266
6267}
6268
6269/*----------------------------------------------------------------------------
6270| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6271| the corresponding value `b', and 0 otherwise. The invalid exception is
6272| raised if either operand is a NaN. The comparison is performed according
6273| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6274*----------------------------------------------------------------------------*/
6275
750afe93 6276int float128_lt( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6277{
6278 flag aSign, bSign;
6279
6280 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6281 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6282 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6283 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6284 ) {
6285 float_raise( float_flag_invalid STATUS_VAR);
6286 return 0;
6287 }
6288 aSign = extractFloat128Sign( a );
6289 bSign = extractFloat128Sign( b );
6290 if ( aSign != bSign ) {
6291 return
6292 aSign
bb98fe42 6293 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6294 != 0 );
6295 }
6296 return
6297 aSign ? lt128( b.high, b.low, a.high, a.low )
6298 : lt128( a.high, a.low, b.high, b.low );
6299
6300}
6301
67b7861d
AJ
6302/*----------------------------------------------------------------------------
6303| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6304| be compared, and 0 otherwise. The invalid exception is raised if either
6305| operand is a NaN. The comparison is performed according to the IEC/IEEE
6306| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6307*----------------------------------------------------------------------------*/
6308
6309int float128_unordered( float128 a, float128 b STATUS_PARAM )
6310{
6311 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6312 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6313 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6314 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6315 ) {
6316 float_raise( float_flag_invalid STATUS_VAR);
6317 return 1;
6318 }
6319 return 0;
6320}
6321
158142c2
FB
6322/*----------------------------------------------------------------------------
6323| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6324| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6325| exception. The comparison is performed according to the IEC/IEEE Standard
6326| for Binary Floating-Point Arithmetic.
158142c2
FB
6327*----------------------------------------------------------------------------*/
6328
b689362d 6329int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6330{
6331
6332 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6333 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6334 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6335 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6336 ) {
b689362d
AJ
6337 if ( float128_is_signaling_nan( a )
6338 || float128_is_signaling_nan( b ) ) {
6339 float_raise( float_flag_invalid STATUS_VAR);
6340 }
158142c2
FB
6341 return 0;
6342 }
6343 return
6344 ( a.low == b.low )
6345 && ( ( a.high == b.high )
6346 || ( ( a.low == 0 )
bb98fe42 6347 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6348 );
6349
6350}
6351
6352/*----------------------------------------------------------------------------
6353| Returns 1 if the quadruple-precision floating-point value `a' is less than
6354| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6355| cause an exception. Otherwise, the comparison is performed according to the
6356| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6357*----------------------------------------------------------------------------*/
6358
750afe93 6359int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6360{
6361 flag aSign, bSign;
6362
6363 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6364 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6365 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6366 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6367 ) {
6368 if ( float128_is_signaling_nan( a )
6369 || float128_is_signaling_nan( b ) ) {
6370 float_raise( float_flag_invalid STATUS_VAR);
6371 }
6372 return 0;
6373 }
6374 aSign = extractFloat128Sign( a );
6375 bSign = extractFloat128Sign( b );
6376 if ( aSign != bSign ) {
6377 return
6378 aSign
bb98fe42 6379 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6380 == 0 );
6381 }
6382 return
6383 aSign ? le128( b.high, b.low, a.high, a.low )
6384 : le128( a.high, a.low, b.high, b.low );
6385
6386}
6387
6388/*----------------------------------------------------------------------------
6389| Returns 1 if the quadruple-precision floating-point value `a' is less than
6390| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6391| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6392| Standard for Binary Floating-Point Arithmetic.
6393*----------------------------------------------------------------------------*/
6394
750afe93 6395int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6396{
6397 flag aSign, bSign;
6398
6399 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6400 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6401 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6402 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6403 ) {
6404 if ( float128_is_signaling_nan( a )
6405 || float128_is_signaling_nan( b ) ) {
6406 float_raise( float_flag_invalid STATUS_VAR);
6407 }
6408 return 0;
6409 }
6410 aSign = extractFloat128Sign( a );
6411 bSign = extractFloat128Sign( b );
6412 if ( aSign != bSign ) {
6413 return
6414 aSign
bb98fe42 6415 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6416 != 0 );
6417 }
6418 return
6419 aSign ? lt128( b.high, b.low, a.high, a.low )
6420 : lt128( a.high, a.low, b.high, b.low );
6421
6422}
6423
67b7861d
AJ
6424/*----------------------------------------------------------------------------
6425| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6426| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6427| comparison is performed according to the IEC/IEEE Standard for Binary
6428| Floating-Point Arithmetic.
6429*----------------------------------------------------------------------------*/
6430
6431int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6432{
6433 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6434 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6435 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6436 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6437 ) {
6438 if ( float128_is_signaling_nan( a )
6439 || float128_is_signaling_nan( b ) ) {
6440 float_raise( float_flag_invalid STATUS_VAR);
6441 }
6442 return 1;
6443 }
6444 return 0;
6445}
6446
1d6bda35 6447/* misc functions */
9f8d2a09 6448float32 uint32_to_float32( uint32 a STATUS_PARAM )
1d6bda35
FB
6449{
6450 return int64_to_float32(a STATUS_VAR);
6451}
6452
9f8d2a09 6453float64 uint32_to_float64( uint32 a STATUS_PARAM )
1d6bda35
FB
6454{
6455 return int64_to_float64(a STATUS_VAR);
6456}
6457
9f8d2a09 6458uint32 float32_to_uint32( float32 a STATUS_PARAM )
1d6bda35
FB
6459{
6460 int64_t v;
9f8d2a09 6461 uint32 res;
1d6bda35
FB
6462
6463 v = float32_to_int64(a STATUS_VAR);
6464 if (v < 0) {
6465 res = 0;
6466 float_raise( float_flag_invalid STATUS_VAR);
6467 } else if (v > 0xffffffff) {
6468 res = 0xffffffff;
6469 float_raise( float_flag_invalid STATUS_VAR);
6470 } else {
6471 res = v;
6472 }
6473 return res;
6474}
6475
9f8d2a09 6476uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
1d6bda35
FB
6477{
6478 int64_t v;
9f8d2a09 6479 uint32 res;
1d6bda35
FB
6480
6481 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6482 if (v < 0) {
6483 res = 0;
6484 float_raise( float_flag_invalid STATUS_VAR);
6485 } else if (v > 0xffffffff) {
6486 res = 0xffffffff;
6487 float_raise( float_flag_invalid STATUS_VAR);
6488 } else {
6489 res = v;
6490 }
6491 return res;
6492}
6493
5aea4c58 6494uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
6495{
6496 int64_t v;
5aea4c58 6497 uint_fast16_t res;
cbcef455
PM
6498
6499 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6500 if (v < 0) {
6501 res = 0;
6502 float_raise( float_flag_invalid STATUS_VAR);
6503 } else if (v > 0xffff) {
6504 res = 0xffff;
6505 float_raise( float_flag_invalid STATUS_VAR);
6506 } else {
6507 res = v;
6508 }
6509 return res;
6510}
6511
9f8d2a09 6512uint32 float64_to_uint32( float64 a STATUS_PARAM )
1d6bda35
FB
6513{
6514 int64_t v;
9f8d2a09 6515 uint32 res;
1d6bda35
FB
6516
6517 v = float64_to_int64(a STATUS_VAR);
6518 if (v < 0) {
6519 res = 0;
6520 float_raise( float_flag_invalid STATUS_VAR);
6521 } else if (v > 0xffffffff) {
6522 res = 0xffffffff;
6523 float_raise( float_flag_invalid STATUS_VAR);
6524 } else {
6525 res = v;
6526 }
6527 return res;
6528}
6529
9f8d2a09 6530uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
1d6bda35
FB
6531{
6532 int64_t v;
9f8d2a09 6533 uint32 res;
1d6bda35
FB
6534
6535 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6536 if (v < 0) {
6537 res = 0;
6538 float_raise( float_flag_invalid STATUS_VAR);
6539 } else if (v > 0xffffffff) {
6540 res = 0xffffffff;
6541 float_raise( float_flag_invalid STATUS_VAR);
6542 } else {
6543 res = v;
6544 }
6545 return res;
6546}
6547
5aea4c58 6548uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
6549{
6550 int64_t v;
5aea4c58 6551 uint_fast16_t res;
cbcef455
PM
6552
6553 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6554 if (v < 0) {
6555 res = 0;
6556 float_raise( float_flag_invalid STATUS_VAR);
6557 } else if (v > 0xffff) {
6558 res = 0xffff;
6559 float_raise( float_flag_invalid STATUS_VAR);
6560 } else {
6561 res = v;
6562 }
6563 return res;
6564}
6565
f090c9d4 6566/* FIXME: This looks broken. */
75d62a58
JM
/*
 * Intended to convert the double-precision value `a' to an unsigned 64-bit
 * integer by biasing the input by INT64_MIN, converting with
 * float64_to_int64(), and removing the bias again.
 *
 * NOTE(review): the bias is applied with integer arithmetic on the values
 * returned by float64_val() (presumably the raw float64 encodings), not with
 * a floating-point addition, so the sum is not the encoding of a + INT64_MIN.
 * This is consistent with the "looks broken" FIXME; do not rely on the
 * result until the function is reimplemented.
 */
6567uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6568{
6569 int64_t v;
6570
f090c9d4
PB
/* float64_val() of INT64_MIN converted to float64. */
6571 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
/* Integer addition of float64_val(a); see the NOTE(review) above. */
6572 v += float64_val(a);
/* Reinterpret the sum as a float64 and convert it to a signed integer. */
6573 v = float64_to_int64(make_float64(v) STATUS_VAR);
75d62a58
JM
6574
/* NOTE(review): v and INT64_MIN are both signed, so this subtraction
 * overflows int64_t whenever v >= 0. */
6575 return v - INT64_MIN;
6576}
6577
/*
 * Intended to convert the double-precision value `a' to an unsigned 64-bit
 * integer, truncating toward zero, by biasing the input by INT64_MIN,
 * converting with float64_to_int64_round_to_zero(), and removing the bias
 * again.
 *
 * NOTE(review): the bias is applied with integer arithmetic on the values
 * returned by float64_val() (presumably the raw float64 encodings), not with
 * a floating-point addition, so the sum is not the encoding of a + INT64_MIN.
 * Do not rely on the result until the function is reimplemented.
 */
6578uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6579{
6580 int64_t v;
6581
f090c9d4
PB
/* float64_val() of INT64_MIN converted to float64. */
6582 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
/* Integer addition of float64_val(a); see the NOTE(review) above. */
6583 v += float64_val(a);
/* Reinterpret the sum as a float64 and convert it, truncating. */
6584 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
75d62a58
JM
6585
/* NOTE(review): v and INT64_MIN are both signed, so this subtraction
 * overflows int64_t whenever v >= 0. */
6586 return v - INT64_MIN;
6587}
6588
1d6bda35 6589#define COMPARE(s, nan_exp) \
750afe93 6590INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
1d6bda35
FB
6591 int is_quiet STATUS_PARAM ) \
6592{ \
6593 flag aSign, bSign; \
bb98fe42 6594 uint ## s ## _t av, bv; \
37d18660
PM
6595 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6596 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
1d6bda35
FB
6597 \
6598 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6599 extractFloat ## s ## Frac( a ) ) || \
6600 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6601 extractFloat ## s ## Frac( b ) )) { \
6602 if (!is_quiet || \
6603 float ## s ## _is_signaling_nan( a ) || \
6604 float ## s ## _is_signaling_nan( b ) ) { \
6605 float_raise( float_flag_invalid STATUS_VAR); \
6606 } \
6607 return float_relation_unordered; \
6608 } \
6609 aSign = extractFloat ## s ## Sign( a ); \
6610 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 6611 av = float ## s ## _val(a); \
cd8a2533 6612 bv = float ## s ## _val(b); \
1d6bda35 6613 if ( aSign != bSign ) { \
bb98fe42 6614 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
6615 /* zero case */ \
6616 return float_relation_equal; \
6617 } else { \
6618 return 1 - (2 * aSign); \
6619 } \
6620 } else { \
f090c9d4 6621 if (av == bv) { \
1d6bda35
FB
6622 return float_relation_equal; \
6623 } else { \
f090c9d4 6624 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
6625 } \
6626 } \
6627} \
6628 \
750afe93 6629int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6630{ \
6631 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6632} \
6633 \
750afe93 6634int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6635{ \
6636 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6637}
6638
6639COMPARE(32, 0xff)
6640COMPARE(64, 0x7ff)
9ee6e8bb 6641
f6714d36
AJ
6642INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6643 int is_quiet STATUS_PARAM )
6644{
6645 flag aSign, bSign;
6646
6647 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6648 ( extractFloatx80Frac( a )<<1 ) ) ||
6649 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6650 ( extractFloatx80Frac( b )<<1 ) )) {
6651 if (!is_quiet ||
6652 floatx80_is_signaling_nan( a ) ||
6653 floatx80_is_signaling_nan( b ) ) {
6654 float_raise( float_flag_invalid STATUS_VAR);
6655 }
6656 return float_relation_unordered;
6657 }
6658 aSign = extractFloatx80Sign( a );
6659 bSign = extractFloatx80Sign( b );
6660 if ( aSign != bSign ) {
6661
6662 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6663 ( ( a.low | b.low ) == 0 ) ) {
6664 /* zero case */
6665 return float_relation_equal;
6666 } else {
6667 return 1 - (2 * aSign);
6668 }
6669 } else {
6670 if (a.low == b.low && a.high == b.high) {
6671 return float_relation_equal;
6672 } else {
6673 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6674 }
6675 }
6676}
6677
6678int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6679{
6680 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6681}
6682
6683int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6684{
6685 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6686}
6687
1f587329
BS
6688INLINE int float128_compare_internal( float128 a, float128 b,
6689 int is_quiet STATUS_PARAM )
6690{
6691 flag aSign, bSign;
6692
6693 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6694 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6695 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6696 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6697 if (!is_quiet ||
6698 float128_is_signaling_nan( a ) ||
6699 float128_is_signaling_nan( b ) ) {
6700 float_raise( float_flag_invalid STATUS_VAR);
6701 }
6702 return float_relation_unordered;
6703 }
6704 aSign = extractFloat128Sign( a );
6705 bSign = extractFloat128Sign( b );
6706 if ( aSign != bSign ) {
6707 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6708 /* zero case */
6709 return float_relation_equal;
6710 } else {
6711 return 1 - (2 * aSign);
6712 }
6713 } else {
6714 if (a.low == b.low && a.high == b.high) {
6715 return float_relation_equal;
6716 } else {
6717 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6718 }
6719 }
6720}
6721
6722int float128_compare( float128 a, float128 b STATUS_PARAM )
6723{
6724 return float128_compare_internal(a, b, 0 STATUS_VAR);
6725}
6726
6727int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6728{
6729 return float128_compare_internal(a, b, 1 STATUS_VAR);
6730}
6731
274f1b04
PM
6732/* min() and max() functions. These can't be implemented as
6733 * 'compare and pick one input' because that would mishandle
6734 * NaNs and +0 vs -0.
e17ab310
WN
6735 *
6736 * minnum() and maxnum() functions. These are similar to the min()
6737 * and max() functions but if one of the arguments is a QNaN and
6738 * the other is numerical then the numerical argument is returned.
6739 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
6740 * and maxNum() operations. min() and max() are the typical min/max
6741 * semantics provided by many CPUs which predate that specification.
274f1b04 6742 */
e70614ea 6743#define MINMAX(s) \
274f1b04 6744INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
e17ab310 6745 int ismin, int isieee STATUS_PARAM) \
274f1b04
PM
6746{ \
6747 flag aSign, bSign; \
6748 uint ## s ## _t av, bv; \
6749 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6750 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6751 if (float ## s ## _is_any_nan(a) || \
6752 float ## s ## _is_any_nan(b)) { \
e17ab310
WN
6753 if (isieee) { \
6754 if (float ## s ## _is_quiet_nan(a) && \
6755 !float ## s ##_is_any_nan(b)) { \
6756 return b; \
6757 } else if (float ## s ## _is_quiet_nan(b) && \
6758 !float ## s ## _is_any_nan(a)) { \
6759 return a; \
6760 } \
6761 } \
274f1b04
PM
6762 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6763 } \
6764 aSign = extractFloat ## s ## Sign(a); \
6765 bSign = extractFloat ## s ## Sign(b); \
6766 av = float ## s ## _val(a); \
6767 bv = float ## s ## _val(b); \
6768 if (aSign != bSign) { \
6769 if (ismin) { \
6770 return aSign ? a : b; \
6771 } else { \
6772 return aSign ? b : a; \
6773 } \
6774 } else { \
6775 if (ismin) { \
6776 return (aSign ^ (av < bv)) ? a : b; \
6777 } else { \
6778 return (aSign ^ (av < bv)) ? b : a; \
6779 } \
6780 } \
6781} \
6782 \
6783float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6784{ \
e17ab310 6785 return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
274f1b04
PM
6786} \
6787 \
6788float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6789{ \
e17ab310
WN
6790 return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
6791} \
6792 \
6793float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
6794{ \
6795 return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
6796} \
6797 \
6798float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
6799{ \
6800 return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
274f1b04
PM
6801}
6802
e70614ea
WN
6803MINMAX(32)
6804MINMAX(64)
274f1b04
PM
6805
6806
9ee6e8bb
PB
6807/* Multiply A by 2 raised to the power N. */
6808float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6809{
6810 flag aSign;
326b9e98 6811 int16_t aExp;
bb98fe42 6812 uint32_t aSig;
9ee6e8bb 6813
37d18660 6814 a = float32_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
6815 aSig = extractFloat32Frac( a );
6816 aExp = extractFloat32Exp( a );
6817 aSign = extractFloat32Sign( a );
6818
6819 if ( aExp == 0xFF ) {
326b9e98
AJ
6820 if ( aSig ) {
6821 return propagateFloat32NaN( a, a STATUS_VAR );
6822 }
9ee6e8bb
PB
6823 return a;
6824 }
69397542
PB
6825 if ( aExp != 0 )
6826 aSig |= 0x00800000;
6827 else if ( aSig == 0 )
6828 return a;
6829
326b9e98
AJ
6830 if (n > 0x200) {
6831 n = 0x200;
6832 } else if (n < -0x200) {
6833 n = -0x200;
6834 }
6835
69397542
PB
6836 aExp += n - 1;
6837 aSig <<= 7;
6838 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
6839}
6840
6841float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6842{
6843 flag aSign;
326b9e98 6844 int16_t aExp;
bb98fe42 6845 uint64_t aSig;
9ee6e8bb 6846
37d18660 6847 a = float64_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
6848 aSig = extractFloat64Frac( a );
6849 aExp = extractFloat64Exp( a );
6850 aSign = extractFloat64Sign( a );
6851
6852 if ( aExp == 0x7FF ) {
326b9e98
AJ
6853 if ( aSig ) {
6854 return propagateFloat64NaN( a, a STATUS_VAR );
6855 }
9ee6e8bb
PB
6856 return a;
6857 }
69397542
PB
6858 if ( aExp != 0 )
6859 aSig |= LIT64( 0x0010000000000000 );
6860 else if ( aSig == 0 )
6861 return a;
6862
326b9e98
AJ
6863 if (n > 0x1000) {
6864 n = 0x1000;
6865 } else if (n < -0x1000) {
6866 n = -0x1000;
6867 }
6868
69397542
PB
6869 aExp += n - 1;
6870 aSig <<= 10;
6871 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
6872}
6873
9ee6e8bb
PB
6874floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6875{
6876 flag aSign;
326b9e98 6877 int32_t aExp;
bb98fe42 6878 uint64_t aSig;
9ee6e8bb
PB
6879
6880 aSig = extractFloatx80Frac( a );
6881 aExp = extractFloatx80Exp( a );
6882 aSign = extractFloatx80Sign( a );
6883
326b9e98
AJ
6884 if ( aExp == 0x7FFF ) {
6885 if ( aSig<<1 ) {
6886 return propagateFloatx80NaN( a, a STATUS_VAR );
6887 }
9ee6e8bb
PB
6888 return a;
6889 }
326b9e98 6890
69397542
PB
6891 if (aExp == 0 && aSig == 0)
6892 return a;
6893
326b9e98
AJ
6894 if (n > 0x10000) {
6895 n = 0x10000;
6896 } else if (n < -0x10000) {
6897 n = -0x10000;
6898 }
6899
9ee6e8bb 6900 aExp += n;
69397542
PB
6901 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6902 aSign, aExp, aSig, 0 STATUS_VAR );
9ee6e8bb 6903}
9ee6e8bb 6904
9ee6e8bb
PB
6905float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6906{
6907 flag aSign;
326b9e98 6908 int32_t aExp;
bb98fe42 6909 uint64_t aSig0, aSig1;
9ee6e8bb
PB
6910
6911 aSig1 = extractFloat128Frac1( a );
6912 aSig0 = extractFloat128Frac0( a );
6913 aExp = extractFloat128Exp( a );
6914 aSign = extractFloat128Sign( a );
6915 if ( aExp == 0x7FFF ) {
326b9e98
AJ
6916 if ( aSig0 | aSig1 ) {
6917 return propagateFloat128NaN( a, a STATUS_VAR );
6918 }
9ee6e8bb
PB
6919 return a;
6920 }
69397542
PB
6921 if ( aExp != 0 )
6922 aSig0 |= LIT64( 0x0001000000000000 );
6923 else if ( aSig0 == 0 && aSig1 == 0 )
6924 return a;
6925
326b9e98
AJ
6926 if (n > 0x10000) {
6927 n = 0x10000;
6928 } else if (n < -0x10000) {
6929 n = -0x10000;
6930 }
6931
69397542
PB
6932 aExp += n - 1;
6933 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6934 STATUS_VAR );
9ee6e8bb
PB
6935
6936}