]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
softfloat: Fix float64_to_uint64
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
158142c2
FB
6
7/*============================================================================
8
9This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10Package, Release 2b.
11
12Written by John R. Hauser. This work was made possible in part by the
13International Computer Science Institute, located at Suite 600, 1947 Center
14Street, Berkeley, California 94704. Funding was partially provided by the
15National Science Foundation under grant MIP-9311980. The original version
16of this code was written as part of a project to build a fixed-point vector
17processor in collaboration with the University of California at Berkeley,
18overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20arithmetic/SoftFloat.html'.
21
22THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31Derivative works are acceptable, even for commercial purposes, so long as
32(1) the source code for the derivative work includes prominent notice that
33the work is derivative, and (2) the source code includes prominent notice with
34these four paragraphs for those parts of this code that are retained.
35
36=============================================================================*/
37
2ac8bd03
PM
38/* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
40 */
41#include "config.h"
42
6b4c305c 43#include "fpu/softfloat.h"
158142c2
FB
44
45/*----------------------------------------------------------------------------
46| Primitive arithmetic functions, including multi-word arithmetic, and
47| division and square root approximations. (Can be specialized to target if
48| desired.)
49*----------------------------------------------------------------------------*/
50#include "softfloat-macros.h"
51
52/*----------------------------------------------------------------------------
53| Functions and definitions to determine: (1) whether tininess for underflow
54| is detected before or after rounding by default, (2) what (if anything)
55| happens when exceptions are raised, (3) how signaling NaNs are distinguished
56| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57| are propagated from function inputs to output. These details are target-
58| specific.
59*----------------------------------------------------------------------------*/
60#include "softfloat-specialize.h"
61
62void set_float_rounding_mode(int val STATUS_PARAM)
63{
64 STATUS(float_rounding_mode) = val;
65}
66
1d6bda35
FB
67void set_float_exception_flags(int val STATUS_PARAM)
68{
69 STATUS(float_exception_flags) = val;
70}
71
158142c2
FB
72void set_floatx80_rounding_precision(int val STATUS_PARAM)
73{
74 STATUS(floatx80_rounding_precision) = val;
75}
158142c2 76
bb4d4bb3
PM
77/*----------------------------------------------------------------------------
78| Returns the fraction bits of the half-precision floating-point value `a'.
79*----------------------------------------------------------------------------*/
80
81INLINE uint32_t extractFloat16Frac(float16 a)
82{
83 return float16_val(a) & 0x3ff;
84}
85
86/*----------------------------------------------------------------------------
87| Returns the exponent bits of the half-precision floating-point value `a'.
88*----------------------------------------------------------------------------*/
89
94a49d86 90INLINE int_fast16_t extractFloat16Exp(float16 a)
bb4d4bb3
PM
91{
92 return (float16_val(a) >> 10) & 0x1f;
93}
94
95/*----------------------------------------------------------------------------
96| Returns the sign bit of the half-precision floating-point value `a'.
97*----------------------------------------------------------------------------*/
98
99INLINE flag extractFloat16Sign(float16 a)
100{
101 return float16_val(a)>>15;
102}
103
158142c2
FB
104/*----------------------------------------------------------------------------
105| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106| and 7, and returns the properly rounded 32-bit integer corresponding to the
107| input. If `zSign' is 1, the input is negated before being converted to an
108| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
109| is simply rounded to an integer, with the inexact exception raised if the
110| input cannot be represented exactly as an integer. However, if the fixed-
111| point input is too large, the invalid exception is raised and the largest
112| positive or negative integer is returned.
113*----------------------------------------------------------------------------*/
114
bb98fe42 115static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
158142c2
FB
116{
117 int8 roundingMode;
118 flag roundNearestEven;
119 int8 roundIncrement, roundBits;
760e1416 120 int32_t z;
158142c2
FB
121
122 roundingMode = STATUS(float_rounding_mode);
123 roundNearestEven = ( roundingMode == float_round_nearest_even );
124 roundIncrement = 0x40;
125 if ( ! roundNearestEven ) {
126 if ( roundingMode == float_round_to_zero ) {
127 roundIncrement = 0;
128 }
129 else {
130 roundIncrement = 0x7F;
131 if ( zSign ) {
132 if ( roundingMode == float_round_up ) roundIncrement = 0;
133 }
134 else {
135 if ( roundingMode == float_round_down ) roundIncrement = 0;
136 }
137 }
138 }
139 roundBits = absZ & 0x7F;
140 absZ = ( absZ + roundIncrement )>>7;
141 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
142 z = absZ;
143 if ( zSign ) z = - z;
144 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 146 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
147 }
148 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
149 return z;
150
151}
152
153/*----------------------------------------------------------------------------
154| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155| `absZ1', with binary point between bits 63 and 64 (between the input words),
156| and returns the properly rounded 64-bit integer corresponding to the input.
157| If `zSign' is 1, the input is negated before being converted to an integer.
158| Ordinarily, the fixed-point input is simply rounded to an integer, with
159| the inexact exception raised if the input cannot be represented exactly as
160| an integer. However, if the fixed-point input is too large, the invalid
161| exception is raised and the largest positive or negative integer is
162| returned.
163*----------------------------------------------------------------------------*/
164
bb98fe42 165static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
158142c2
FB
166{
167 int8 roundingMode;
168 flag roundNearestEven, increment;
760e1416 169 int64_t z;
158142c2
FB
170
171 roundingMode = STATUS(float_rounding_mode);
172 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 173 increment = ( (int64_t) absZ1 < 0 );
158142c2
FB
174 if ( ! roundNearestEven ) {
175 if ( roundingMode == float_round_to_zero ) {
176 increment = 0;
177 }
178 else {
179 if ( zSign ) {
180 increment = ( roundingMode == float_round_down ) && absZ1;
181 }
182 else {
183 increment = ( roundingMode == float_round_up ) && absZ1;
184 }
185 }
186 }
187 if ( increment ) {
188 ++absZ0;
189 if ( absZ0 == 0 ) goto overflow;
bb98fe42 190 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
191 }
192 z = absZ0;
193 if ( zSign ) z = - z;
194 if ( z && ( ( z < 0 ) ^ zSign ) ) {
195 overflow:
196 float_raise( float_flag_invalid STATUS_VAR);
197 return
bb98fe42 198 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
199 : LIT64( 0x7FFFFFFFFFFFFFFF );
200 }
201 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
202 return z;
203
204}
205
fb3ea83a
TM
206/*----------------------------------------------------------------------------
207| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
208| `absZ1', with binary point between bits 63 and 64 (between the input words),
209| and returns the properly rounded 64-bit unsigned integer corresponding to the
210| input. Ordinarily, the fixed-point input is simply rounded to an integer,
211| with the inexact exception raised if the input cannot be represented exactly
212| as an integer. However, if the fixed-point input is too large, the invalid
213| exception is raised and the largest unsigned integer is returned.
214*----------------------------------------------------------------------------*/
215
216static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
217 uint64_t absZ1 STATUS_PARAM)
218{
219 int8 roundingMode;
220 flag roundNearestEven, increment;
221
222 roundingMode = STATUS(float_rounding_mode);
223 roundNearestEven = (roundingMode == float_round_nearest_even);
224 increment = ((int64_t)absZ1 < 0);
225 if (!roundNearestEven) {
226 if (roundingMode == float_round_to_zero) {
227 increment = 0;
228 } else if (absZ1) {
229 if (zSign) {
230 increment = (roundingMode == float_round_down) && absZ1;
231 } else {
232 increment = (roundingMode == float_round_up) && absZ1;
233 }
234 }
235 }
236 if (increment) {
237 ++absZ0;
238 if (absZ0 == 0) {
239 float_raise(float_flag_invalid STATUS_VAR);
240 return LIT64(0xFFFFFFFFFFFFFFFF);
241 }
242 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
243 }
244
245 if (zSign && absZ0) {
246 float_raise(float_flag_invalid STATUS_VAR);
247 return 0;
248 }
249
250 if (absZ1) {
251 STATUS(float_exception_flags) |= float_flag_inexact;
252 }
253 return absZ0;
254}
255
158142c2
FB
256/*----------------------------------------------------------------------------
257| Returns the fraction bits of the single-precision floating-point value `a'.
258*----------------------------------------------------------------------------*/
259
bb98fe42 260INLINE uint32_t extractFloat32Frac( float32 a )
158142c2
FB
261{
262
f090c9d4 263 return float32_val(a) & 0x007FFFFF;
158142c2
FB
264
265}
266
267/*----------------------------------------------------------------------------
268| Returns the exponent bits of the single-precision floating-point value `a'.
269*----------------------------------------------------------------------------*/
270
94a49d86 271INLINE int_fast16_t extractFloat32Exp(float32 a)
158142c2
FB
272{
273
f090c9d4 274 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
275
276}
277
278/*----------------------------------------------------------------------------
279| Returns the sign bit of the single-precision floating-point value `a'.
280*----------------------------------------------------------------------------*/
281
282INLINE flag extractFloat32Sign( float32 a )
283{
284
f090c9d4 285 return float32_val(a)>>31;
158142c2
FB
286
287}
288
37d18660
PM
289/*----------------------------------------------------------------------------
290| If `a' is denormal and we are in flush-to-zero mode then set the
291| input-denormal exception and return zero. Otherwise just return the value.
292*----------------------------------------------------------------------------*/
293static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
294{
295 if (STATUS(flush_inputs_to_zero)) {
296 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
297 float_raise(float_flag_input_denormal STATUS_VAR);
298 return make_float32(float32_val(a) & 0x80000000);
299 }
300 }
301 return a;
302}
303
158142c2
FB
304/*----------------------------------------------------------------------------
305| Normalizes the subnormal single-precision floating-point value represented
306| by the denormalized significand `aSig'. The normalized exponent and
307| significand are stored at the locations pointed to by `zExpPtr' and
308| `zSigPtr', respectively.
309*----------------------------------------------------------------------------*/
310
311static void
94a49d86 312 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
158142c2
FB
313{
314 int8 shiftCount;
315
316 shiftCount = countLeadingZeros32( aSig ) - 8;
317 *zSigPtr = aSig<<shiftCount;
318 *zExpPtr = 1 - shiftCount;
319
320}
321
322/*----------------------------------------------------------------------------
323| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
324| single-precision floating-point value, returning the result. After being
325| shifted into the proper positions, the three fields are simply added
326| together to form the result. This means that any integer portion of `zSig'
327| will be added into the exponent. Since a properly normalized significand
328| will have an integer portion equal to 1, the `zExp' input should be 1 less
329| than the desired result exponent whenever `zSig' is a complete, normalized
330| significand.
331*----------------------------------------------------------------------------*/
332
94a49d86 333INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
158142c2
FB
334{
335
f090c9d4 336 return make_float32(
bb98fe42 337 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
338
339}
340
341/*----------------------------------------------------------------------------
342| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
343| and significand `zSig', and returns the proper single-precision floating-
344| point value corresponding to the abstract input. Ordinarily, the abstract
345| value is simply rounded and packed into the single-precision format, with
346| the inexact exception raised if the abstract input cannot be represented
347| exactly. However, if the abstract value is too large, the overflow and
348| inexact exceptions are raised and an infinity or maximal finite value is
349| returned. If the abstract value is too small, the input value is rounded to
350| a subnormal number, and the underflow and inexact exceptions are raised if
351| the abstract input cannot be represented exactly as a subnormal single-
352| precision floating-point number.
353| The input significand `zSig' has its binary point between bits 30
354| and 29, which is 7 bits to the left of the usual location. This shifted
355| significand must be normalized or smaller. If `zSig' is not normalized,
356| `zExp' must be 0; in that case, the result returned is a subnormal number,
357| and it must not require rounding. In the usual case that `zSig' is
358| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
359| The handling of underflow and overflow follows the IEC/IEEE Standard for
360| Binary Floating-Point Arithmetic.
361*----------------------------------------------------------------------------*/
362
94a49d86 363static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
364{
365 int8 roundingMode;
366 flag roundNearestEven;
367 int8 roundIncrement, roundBits;
368 flag isTiny;
369
370 roundingMode = STATUS(float_rounding_mode);
371 roundNearestEven = ( roundingMode == float_round_nearest_even );
372 roundIncrement = 0x40;
373 if ( ! roundNearestEven ) {
374 if ( roundingMode == float_round_to_zero ) {
375 roundIncrement = 0;
376 }
377 else {
378 roundIncrement = 0x7F;
379 if ( zSign ) {
380 if ( roundingMode == float_round_up ) roundIncrement = 0;
381 }
382 else {
383 if ( roundingMode == float_round_down ) roundIncrement = 0;
384 }
385 }
386 }
387 roundBits = zSig & 0x7F;
bb98fe42 388 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
389 if ( ( 0xFD < zExp )
390 || ( ( zExp == 0xFD )
bb98fe42 391 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
392 ) {
393 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 394 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
395 }
396 if ( zExp < 0 ) {
e6afc87f
PM
397 if (STATUS(flush_to_zero)) {
398 float_raise(float_flag_output_denormal STATUS_VAR);
399 return packFloat32(zSign, 0, 0);
400 }
158142c2
FB
401 isTiny =
402 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
403 || ( zExp < -1 )
404 || ( zSig + roundIncrement < 0x80000000 );
405 shift32RightJamming( zSig, - zExp, &zSig );
406 zExp = 0;
407 roundBits = zSig & 0x7F;
408 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
409 }
410 }
411 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
412 zSig = ( zSig + roundIncrement )>>7;
413 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
414 if ( zSig == 0 ) zExp = 0;
415 return packFloat32( zSign, zExp, zSig );
416
417}
418
419/*----------------------------------------------------------------------------
420| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
421| and significand `zSig', and returns the proper single-precision floating-
422| point value corresponding to the abstract input. This routine is just like
423| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
424| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
425| floating-point exponent.
426*----------------------------------------------------------------------------*/
427
428static float32
94a49d86 429 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
430{
431 int8 shiftCount;
432
433 shiftCount = countLeadingZeros32( zSig ) - 1;
434 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
435
436}
437
438/*----------------------------------------------------------------------------
439| Returns the fraction bits of the double-precision floating-point value `a'.
440*----------------------------------------------------------------------------*/
441
bb98fe42 442INLINE uint64_t extractFloat64Frac( float64 a )
158142c2
FB
443{
444
f090c9d4 445 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
446
447}
448
449/*----------------------------------------------------------------------------
450| Returns the exponent bits of the double-precision floating-point value `a'.
451*----------------------------------------------------------------------------*/
452
94a49d86 453INLINE int_fast16_t extractFloat64Exp(float64 a)
158142c2
FB
454{
455
f090c9d4 456 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
457
458}
459
460/*----------------------------------------------------------------------------
461| Returns the sign bit of the double-precision floating-point value `a'.
462*----------------------------------------------------------------------------*/
463
464INLINE flag extractFloat64Sign( float64 a )
465{
466
f090c9d4 467 return float64_val(a)>>63;
158142c2
FB
468
469}
470
37d18660
PM
471/*----------------------------------------------------------------------------
472| If `a' is denormal and we are in flush-to-zero mode then set the
473| input-denormal exception and return zero. Otherwise just return the value.
474*----------------------------------------------------------------------------*/
475static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
476{
477 if (STATUS(flush_inputs_to_zero)) {
478 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
479 float_raise(float_flag_input_denormal STATUS_VAR);
480 return make_float64(float64_val(a) & (1ULL << 63));
481 }
482 }
483 return a;
484}
485
158142c2
FB
486/*----------------------------------------------------------------------------
487| Normalizes the subnormal double-precision floating-point value represented
488| by the denormalized significand `aSig'. The normalized exponent and
489| significand are stored at the locations pointed to by `zExpPtr' and
490| `zSigPtr', respectively.
491*----------------------------------------------------------------------------*/
492
493static void
94a49d86 494 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
158142c2
FB
495{
496 int8 shiftCount;
497
498 shiftCount = countLeadingZeros64( aSig ) - 11;
499 *zSigPtr = aSig<<shiftCount;
500 *zExpPtr = 1 - shiftCount;
501
502}
503
504/*----------------------------------------------------------------------------
505| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
506| double-precision floating-point value, returning the result. After being
507| shifted into the proper positions, the three fields are simply added
508| together to form the result. This means that any integer portion of `zSig'
509| will be added into the exponent. Since a properly normalized significand
510| will have an integer portion equal to 1, the `zExp' input should be 1 less
511| than the desired result exponent whenever `zSig' is a complete, normalized
512| significand.
513*----------------------------------------------------------------------------*/
514
94a49d86 515INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
158142c2
FB
516{
517
f090c9d4 518 return make_float64(
bb98fe42 519 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
520
521}
522
523/*----------------------------------------------------------------------------
524| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
525| and significand `zSig', and returns the proper double-precision floating-
526| point value corresponding to the abstract input. Ordinarily, the abstract
527| value is simply rounded and packed into the double-precision format, with
528| the inexact exception raised if the abstract input cannot be represented
529| exactly. However, if the abstract value is too large, the overflow and
530| inexact exceptions are raised and an infinity or maximal finite value is
531| returned. If the abstract value is too small, the input value is rounded
532| to a subnormal number, and the underflow and inexact exceptions are raised
533| if the abstract input cannot be represented exactly as a subnormal double-
534| precision floating-point number.
535| The input significand `zSig' has its binary point between bits 62
536| and 61, which is 10 bits to the left of the usual location. This shifted
537| significand must be normalized or smaller. If `zSig' is not normalized,
538| `zExp' must be 0; in that case, the result returned is a subnormal number,
539| and it must not require rounding. In the usual case that `zSig' is
540| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
541| The handling of underflow and overflow follows the IEC/IEEE Standard for
542| Binary Floating-Point Arithmetic.
543*----------------------------------------------------------------------------*/
544
94a49d86 545static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
546{
547 int8 roundingMode;
548 flag roundNearestEven;
94a49d86 549 int_fast16_t roundIncrement, roundBits;
158142c2
FB
550 flag isTiny;
551
552 roundingMode = STATUS(float_rounding_mode);
553 roundNearestEven = ( roundingMode == float_round_nearest_even );
554 roundIncrement = 0x200;
555 if ( ! roundNearestEven ) {
556 if ( roundingMode == float_round_to_zero ) {
557 roundIncrement = 0;
558 }
559 else {
560 roundIncrement = 0x3FF;
561 if ( zSign ) {
562 if ( roundingMode == float_round_up ) roundIncrement = 0;
563 }
564 else {
565 if ( roundingMode == float_round_down ) roundIncrement = 0;
566 }
567 }
568 }
569 roundBits = zSig & 0x3FF;
bb98fe42 570 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
571 if ( ( 0x7FD < zExp )
572 || ( ( zExp == 0x7FD )
bb98fe42 573 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
574 ) {
575 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 576 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
577 }
578 if ( zExp < 0 ) {
e6afc87f
PM
579 if (STATUS(flush_to_zero)) {
580 float_raise(float_flag_output_denormal STATUS_VAR);
581 return packFloat64(zSign, 0, 0);
582 }
158142c2
FB
583 isTiny =
584 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
585 || ( zExp < -1 )
586 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
587 shift64RightJamming( zSig, - zExp, &zSig );
588 zExp = 0;
589 roundBits = zSig & 0x3FF;
590 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
591 }
592 }
593 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
594 zSig = ( zSig + roundIncrement )>>10;
595 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
596 if ( zSig == 0 ) zExp = 0;
597 return packFloat64( zSign, zExp, zSig );
598
599}
600
601/*----------------------------------------------------------------------------
602| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
603| and significand `zSig', and returns the proper double-precision floating-
604| point value corresponding to the abstract input. This routine is just like
605| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
606| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
607| floating-point exponent.
608*----------------------------------------------------------------------------*/
609
610static float64
94a49d86 611 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
612{
613 int8 shiftCount;
614
615 shiftCount = countLeadingZeros64( zSig ) - 1;
616 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
617
618}
619
158142c2
FB
620/*----------------------------------------------------------------------------
621| Returns the fraction bits of the extended double-precision floating-point
622| value `a'.
623*----------------------------------------------------------------------------*/
624
bb98fe42 625INLINE uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
626{
627
628 return a.low;
629
630}
631
632/*----------------------------------------------------------------------------
633| Returns the exponent bits of the extended double-precision floating-point
634| value `a'.
635*----------------------------------------------------------------------------*/
636
637INLINE int32 extractFloatx80Exp( floatx80 a )
638{
639
640 return a.high & 0x7FFF;
641
642}
643
644/*----------------------------------------------------------------------------
645| Returns the sign bit of the extended double-precision floating-point value
646| `a'.
647*----------------------------------------------------------------------------*/
648
649INLINE flag extractFloatx80Sign( floatx80 a )
650{
651
652 return a.high>>15;
653
654}
655
656/*----------------------------------------------------------------------------
657| Normalizes the subnormal extended double-precision floating-point value
658| represented by the denormalized significand `aSig'. The normalized exponent
659| and significand are stored at the locations pointed to by `zExpPtr' and
660| `zSigPtr', respectively.
661*----------------------------------------------------------------------------*/
662
663static void
bb98fe42 664 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
665{
666 int8 shiftCount;
667
668 shiftCount = countLeadingZeros64( aSig );
669 *zSigPtr = aSig<<shiftCount;
670 *zExpPtr = 1 - shiftCount;
671
672}
673
674/*----------------------------------------------------------------------------
675| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
676| extended double-precision floating-point value, returning the result.
677*----------------------------------------------------------------------------*/
678
bb98fe42 679INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
158142c2
FB
680{
681 floatx80 z;
682
683 z.low = zSig;
bb98fe42 684 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
685 return z;
686
687}
688
689/*----------------------------------------------------------------------------
690| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
691| and extended significand formed by the concatenation of `zSig0' and `zSig1',
692| and returns the proper extended double-precision floating-point value
693| corresponding to the abstract input. Ordinarily, the abstract value is
694| rounded and packed into the extended double-precision format, with the
695| inexact exception raised if the abstract input cannot be represented
696| exactly. However, if the abstract value is too large, the overflow and
697| inexact exceptions are raised and an infinity or maximal finite value is
698| returned. If the abstract value is too small, the input value is rounded to
699| a subnormal number, and the underflow and inexact exceptions are raised if
700| the abstract input cannot be represented exactly as a subnormal extended
701| double-precision floating-point number.
702| If `roundingPrecision' is 32 or 64, the result is rounded to the same
703| number of bits as single or double precision, respectively. Otherwise, the
704| result is rounded to the full precision of the extended double-precision
705| format.
706| The input significand must be normalized or smaller. If the input
707| significand is not normalized, `zExp' must be 0; in that case, the result
708| returned is a subnormal number, and it must not require rounding. The
709| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
710| Floating-Point Arithmetic.
711*----------------------------------------------------------------------------*/
712
713static floatx80
714 roundAndPackFloatx80(
bb98fe42 715 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
716 STATUS_PARAM)
717{
718 int8 roundingMode;
719 flag roundNearestEven, increment, isTiny;
720 int64 roundIncrement, roundMask, roundBits;
721
722 roundingMode = STATUS(float_rounding_mode);
723 roundNearestEven = ( roundingMode == float_round_nearest_even );
724 if ( roundingPrecision == 80 ) goto precision80;
725 if ( roundingPrecision == 64 ) {
726 roundIncrement = LIT64( 0x0000000000000400 );
727 roundMask = LIT64( 0x00000000000007FF );
728 }
729 else if ( roundingPrecision == 32 ) {
730 roundIncrement = LIT64( 0x0000008000000000 );
731 roundMask = LIT64( 0x000000FFFFFFFFFF );
732 }
733 else {
734 goto precision80;
735 }
736 zSig0 |= ( zSig1 != 0 );
737 if ( ! roundNearestEven ) {
738 if ( roundingMode == float_round_to_zero ) {
739 roundIncrement = 0;
740 }
741 else {
742 roundIncrement = roundMask;
743 if ( zSign ) {
744 if ( roundingMode == float_round_up ) roundIncrement = 0;
745 }
746 else {
747 if ( roundingMode == float_round_down ) roundIncrement = 0;
748 }
749 }
750 }
751 roundBits = zSig0 & roundMask;
bb98fe42 752 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
753 if ( ( 0x7FFE < zExp )
754 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
755 ) {
756 goto overflow;
757 }
758 if ( zExp <= 0 ) {
e6afc87f
PM
759 if (STATUS(flush_to_zero)) {
760 float_raise(float_flag_output_denormal STATUS_VAR);
761 return packFloatx80(zSign, 0, 0);
762 }
158142c2
FB
763 isTiny =
764 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
765 || ( zExp < 0 )
766 || ( zSig0 <= zSig0 + roundIncrement );
767 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
768 zExp = 0;
769 roundBits = zSig0 & roundMask;
770 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
771 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
772 zSig0 += roundIncrement;
bb98fe42 773 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
774 roundIncrement = roundMask + 1;
775 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
776 roundMask |= roundIncrement;
777 }
778 zSig0 &= ~ roundMask;
779 return packFloatx80( zSign, zExp, zSig0 );
780 }
781 }
782 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
783 zSig0 += roundIncrement;
784 if ( zSig0 < roundIncrement ) {
785 ++zExp;
786 zSig0 = LIT64( 0x8000000000000000 );
787 }
788 roundIncrement = roundMask + 1;
789 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
790 roundMask |= roundIncrement;
791 }
792 zSig0 &= ~ roundMask;
793 if ( zSig0 == 0 ) zExp = 0;
794 return packFloatx80( zSign, zExp, zSig0 );
795 precision80:
bb98fe42 796 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
797 if ( ! roundNearestEven ) {
798 if ( roundingMode == float_round_to_zero ) {
799 increment = 0;
800 }
801 else {
802 if ( zSign ) {
803 increment = ( roundingMode == float_round_down ) && zSig1;
804 }
805 else {
806 increment = ( roundingMode == float_round_up ) && zSig1;
807 }
808 }
809 }
bb98fe42 810 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
811 if ( ( 0x7FFE < zExp )
812 || ( ( zExp == 0x7FFE )
813 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
814 && increment
815 )
816 ) {
817 roundMask = 0;
818 overflow:
819 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
820 if ( ( roundingMode == float_round_to_zero )
821 || ( zSign && ( roundingMode == float_round_up ) )
822 || ( ! zSign && ( roundingMode == float_round_down ) )
823 ) {
824 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
825 }
826 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
827 }
828 if ( zExp <= 0 ) {
829 isTiny =
830 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
831 || ( zExp < 0 )
832 || ! increment
833 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
834 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
835 zExp = 0;
836 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
837 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
838 if ( roundNearestEven ) {
bb98fe42 839 increment = ( (int64_t) zSig1 < 0 );
158142c2
FB
840 }
841 else {
842 if ( zSign ) {
843 increment = ( roundingMode == float_round_down ) && zSig1;
844 }
845 else {
846 increment = ( roundingMode == float_round_up ) && zSig1;
847 }
848 }
849 if ( increment ) {
850 ++zSig0;
851 zSig0 &=
bb98fe42
AF
852 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
853 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
854 }
855 return packFloatx80( zSign, zExp, zSig0 );
856 }
857 }
858 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
859 if ( increment ) {
860 ++zSig0;
861 if ( zSig0 == 0 ) {
862 ++zExp;
863 zSig0 = LIT64( 0x8000000000000000 );
864 }
865 else {
bb98fe42 866 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
867 }
868 }
869 else {
870 if ( zSig0 == 0 ) zExp = 0;
871 }
872 return packFloatx80( zSign, zExp, zSig0 );
873
874}
875
876/*----------------------------------------------------------------------------
877| Takes an abstract floating-point value having sign `zSign', exponent
878| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
879| and returns the proper extended double-precision floating-point value
880| corresponding to the abstract input. This routine is just like
881| `roundAndPackFloatx80' except that the input significand does not have to be
882| normalized.
883*----------------------------------------------------------------------------*/
884
885static floatx80
886 normalizeRoundAndPackFloatx80(
bb98fe42 887 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
888 STATUS_PARAM)
889{
890 int8 shiftCount;
891
892 if ( zSig0 == 0 ) {
893 zSig0 = zSig1;
894 zSig1 = 0;
895 zExp -= 64;
896 }
897 shiftCount = countLeadingZeros64( zSig0 );
898 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
899 zExp -= shiftCount;
900 return
901 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
902
903}
904
158142c2
FB
905/*----------------------------------------------------------------------------
906| Returns the least-significant 64 fraction bits of the quadruple-precision
907| floating-point value `a'.
908*----------------------------------------------------------------------------*/
909
bb98fe42 910INLINE uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
911{
912
913 return a.low;
914
915}
916
917/*----------------------------------------------------------------------------
918| Returns the most-significant 48 fraction bits of the quadruple-precision
919| floating-point value `a'.
920*----------------------------------------------------------------------------*/
921
bb98fe42 922INLINE uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
923{
924
925 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
926
927}
928
929/*----------------------------------------------------------------------------
930| Returns the exponent bits of the quadruple-precision floating-point value
931| `a'.
932*----------------------------------------------------------------------------*/
933
934INLINE int32 extractFloat128Exp( float128 a )
935{
936
937 return ( a.high>>48 ) & 0x7FFF;
938
939}
940
941/*----------------------------------------------------------------------------
942| Returns the sign bit of the quadruple-precision floating-point value `a'.
943*----------------------------------------------------------------------------*/
944
945INLINE flag extractFloat128Sign( float128 a )
946{
947
948 return a.high>>63;
949
950}
951
952/*----------------------------------------------------------------------------
953| Normalizes the subnormal quadruple-precision floating-point value
954| represented by the denormalized significand formed by the concatenation of
955| `aSig0' and `aSig1'. The normalized exponent is stored at the location
956| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
957| significand are stored at the location pointed to by `zSig0Ptr', and the
958| least significant 64 bits of the normalized significand are stored at the
959| location pointed to by `zSig1Ptr'.
960*----------------------------------------------------------------------------*/
961
962static void
963 normalizeFloat128Subnormal(
bb98fe42
AF
964 uint64_t aSig0,
965 uint64_t aSig1,
158142c2 966 int32 *zExpPtr,
bb98fe42
AF
967 uint64_t *zSig0Ptr,
968 uint64_t *zSig1Ptr
158142c2
FB
969 )
970{
971 int8 shiftCount;
972
973 if ( aSig0 == 0 ) {
974 shiftCount = countLeadingZeros64( aSig1 ) - 15;
975 if ( shiftCount < 0 ) {
976 *zSig0Ptr = aSig1>>( - shiftCount );
977 *zSig1Ptr = aSig1<<( shiftCount & 63 );
978 }
979 else {
980 *zSig0Ptr = aSig1<<shiftCount;
981 *zSig1Ptr = 0;
982 }
983 *zExpPtr = - shiftCount - 63;
984 }
985 else {
986 shiftCount = countLeadingZeros64( aSig0 ) - 15;
987 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
988 *zExpPtr = 1 - shiftCount;
989 }
990
991}
992
993/*----------------------------------------------------------------------------
994| Packs the sign `zSign', the exponent `zExp', and the significand formed
995| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
996| floating-point value, returning the result. After being shifted into the
997| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
998| added together to form the most significant 64 bits of the result. This
999| means that any integer portion of `zSig0' will be added into the exponent.
1000| Since a properly normalized significand will have an integer portion equal
1001| to 1, the `zExp' input should be 1 less than the desired result exponent
1002| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1003| significand.
1004*----------------------------------------------------------------------------*/
1005
1006INLINE float128
bb98fe42 1007 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
1008{
1009 float128 z;
1010
1011 z.low = zSig1;
bb98fe42 1012 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
1013 return z;
1014
1015}
1016
1017/*----------------------------------------------------------------------------
1018| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1019| and extended significand formed by the concatenation of `zSig0', `zSig1',
1020| and `zSig2', and returns the proper quadruple-precision floating-point value
1021| corresponding to the abstract input. Ordinarily, the abstract value is
1022| simply rounded and packed into the quadruple-precision format, with the
1023| inexact exception raised if the abstract input cannot be represented
1024| exactly. However, if the abstract value is too large, the overflow and
1025| inexact exceptions are raised and an infinity or maximal finite value is
1026| returned. If the abstract value is too small, the input value is rounded to
1027| a subnormal number, and the underflow and inexact exceptions are raised if
1028| the abstract input cannot be represented exactly as a subnormal quadruple-
1029| precision floating-point number.
1030| The input significand must be normalized or smaller. If the input
1031| significand is not normalized, `zExp' must be 0; in that case, the result
1032| returned is a subnormal number, and it must not require rounding. In the
1033| usual case that the input significand is normalized, `zExp' must be 1 less
1034| than the ``true'' floating-point exponent. The handling of underflow and
1035| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1036*----------------------------------------------------------------------------*/
1037
1038static float128
1039 roundAndPackFloat128(
bb98fe42 1040 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
158142c2
FB
1041{
1042 int8 roundingMode;
1043 flag roundNearestEven, increment, isTiny;
1044
1045 roundingMode = STATUS(float_rounding_mode);
1046 roundNearestEven = ( roundingMode == float_round_nearest_even );
bb98fe42 1047 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
1048 if ( ! roundNearestEven ) {
1049 if ( roundingMode == float_round_to_zero ) {
1050 increment = 0;
1051 }
1052 else {
1053 if ( zSign ) {
1054 increment = ( roundingMode == float_round_down ) && zSig2;
1055 }
1056 else {
1057 increment = ( roundingMode == float_round_up ) && zSig2;
1058 }
1059 }
1060 }
bb98fe42 1061 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1062 if ( ( 0x7FFD < zExp )
1063 || ( ( zExp == 0x7FFD )
1064 && eq128(
1065 LIT64( 0x0001FFFFFFFFFFFF ),
1066 LIT64( 0xFFFFFFFFFFFFFFFF ),
1067 zSig0,
1068 zSig1
1069 )
1070 && increment
1071 )
1072 ) {
1073 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1074 if ( ( roundingMode == float_round_to_zero )
1075 || ( zSign && ( roundingMode == float_round_up ) )
1076 || ( ! zSign && ( roundingMode == float_round_down ) )
1077 ) {
1078 return
1079 packFloat128(
1080 zSign,
1081 0x7FFE,
1082 LIT64( 0x0000FFFFFFFFFFFF ),
1083 LIT64( 0xFFFFFFFFFFFFFFFF )
1084 );
1085 }
1086 return packFloat128( zSign, 0x7FFF, 0, 0 );
1087 }
1088 if ( zExp < 0 ) {
e6afc87f
PM
1089 if (STATUS(flush_to_zero)) {
1090 float_raise(float_flag_output_denormal STATUS_VAR);
1091 return packFloat128(zSign, 0, 0, 0);
1092 }
158142c2
FB
1093 isTiny =
1094 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1095 || ( zExp < -1 )
1096 || ! increment
1097 || lt128(
1098 zSig0,
1099 zSig1,
1100 LIT64( 0x0001FFFFFFFFFFFF ),
1101 LIT64( 0xFFFFFFFFFFFFFFFF )
1102 );
1103 shift128ExtraRightJamming(
1104 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1105 zExp = 0;
1106 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1107 if ( roundNearestEven ) {
bb98fe42 1108 increment = ( (int64_t) zSig2 < 0 );
158142c2
FB
1109 }
1110 else {
1111 if ( zSign ) {
1112 increment = ( roundingMode == float_round_down ) && zSig2;
1113 }
1114 else {
1115 increment = ( roundingMode == float_round_up ) && zSig2;
1116 }
1117 }
1118 }
1119 }
1120 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1121 if ( increment ) {
1122 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1123 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1124 }
1125 else {
1126 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1127 }
1128 return packFloat128( zSign, zExp, zSig0, zSig1 );
1129
1130}
1131
1132/*----------------------------------------------------------------------------
1133| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1134| and significand formed by the concatenation of `zSig0' and `zSig1', and
1135| returns the proper quadruple-precision floating-point value corresponding
1136| to the abstract input. This routine is just like `roundAndPackFloat128'
1137| except that the input significand has fewer bits and does not have to be
1138| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1139| point exponent.
1140*----------------------------------------------------------------------------*/
1141
1142static float128
1143 normalizeRoundAndPackFloat128(
bb98fe42 1144 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
158142c2
FB
1145{
1146 int8 shiftCount;
bb98fe42 1147 uint64_t zSig2;
158142c2
FB
1148
1149 if ( zSig0 == 0 ) {
1150 zSig0 = zSig1;
1151 zSig1 = 0;
1152 zExp -= 64;
1153 }
1154 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1155 if ( 0 <= shiftCount ) {
1156 zSig2 = 0;
1157 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1158 }
1159 else {
1160 shift128ExtraRightJamming(
1161 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1162 }
1163 zExp -= shiftCount;
1164 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1165
1166}
1167
158142c2
FB
1168/*----------------------------------------------------------------------------
1169| Returns the result of converting the 32-bit two's complement integer `a'
1170| to the single-precision floating-point format. The conversion is performed
1171| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1172*----------------------------------------------------------------------------*/
1173
c4850f9e 1174float32 int32_to_float32(int32_t a STATUS_PARAM)
158142c2
FB
1175{
1176 flag zSign;
1177
f090c9d4 1178 if ( a == 0 ) return float32_zero;
bb98fe42 1179 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2
FB
1180 zSign = ( a < 0 );
1181 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1182
1183}
1184
1185/*----------------------------------------------------------------------------
1186| Returns the result of converting the 32-bit two's complement integer `a'
1187| to the double-precision floating-point format. The conversion is performed
1188| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1189*----------------------------------------------------------------------------*/
1190
c4850f9e 1191float64 int32_to_float64(int32_t a STATUS_PARAM)
158142c2
FB
1192{
1193 flag zSign;
1194 uint32 absA;
1195 int8 shiftCount;
bb98fe42 1196 uint64_t zSig;
158142c2 1197
f090c9d4 1198 if ( a == 0 ) return float64_zero;
158142c2
FB
1199 zSign = ( a < 0 );
1200 absA = zSign ? - a : a;
1201 shiftCount = countLeadingZeros32( absA ) + 21;
1202 zSig = absA;
1203 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1204
1205}
1206
158142c2
FB
1207/*----------------------------------------------------------------------------
1208| Returns the result of converting the 32-bit two's complement integer `a'
1209| to the extended double-precision floating-point format. The conversion
1210| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1211| Arithmetic.
1212*----------------------------------------------------------------------------*/
1213
c4850f9e 1214floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
158142c2
FB
1215{
1216 flag zSign;
1217 uint32 absA;
1218 int8 shiftCount;
bb98fe42 1219 uint64_t zSig;
158142c2
FB
1220
1221 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1222 zSign = ( a < 0 );
1223 absA = zSign ? - a : a;
1224 shiftCount = countLeadingZeros32( absA ) + 32;
1225 zSig = absA;
1226 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1227
1228}
1229
158142c2
FB
1230/*----------------------------------------------------------------------------
1231| Returns the result of converting the 32-bit two's complement integer `a' to
1232| the quadruple-precision floating-point format. The conversion is performed
1233| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1234*----------------------------------------------------------------------------*/
1235
c4850f9e 1236float128 int32_to_float128(int32_t a STATUS_PARAM)
158142c2
FB
1237{
1238 flag zSign;
1239 uint32 absA;
1240 int8 shiftCount;
bb98fe42 1241 uint64_t zSig0;
158142c2
FB
1242
1243 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1244 zSign = ( a < 0 );
1245 absA = zSign ? - a : a;
1246 shiftCount = countLeadingZeros32( absA ) + 17;
1247 zSig0 = absA;
1248 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1249
1250}
1251
158142c2
FB
1252/*----------------------------------------------------------------------------
1253| Returns the result of converting the 64-bit two's complement integer `a'
1254| to the single-precision floating-point format. The conversion is performed
1255| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256*----------------------------------------------------------------------------*/
1257
c4850f9e 1258float32 int64_to_float32(int64_t a STATUS_PARAM)
158142c2
FB
1259{
1260 flag zSign;
1261 uint64 absA;
1262 int8 shiftCount;
1263
f090c9d4 1264 if ( a == 0 ) return float32_zero;
158142c2
FB
1265 zSign = ( a < 0 );
1266 absA = zSign ? - a : a;
1267 shiftCount = countLeadingZeros64( absA ) - 40;
1268 if ( 0 <= shiftCount ) {
1269 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1270 }
1271 else {
1272 shiftCount += 7;
1273 if ( shiftCount < 0 ) {
1274 shift64RightJamming( absA, - shiftCount, &absA );
1275 }
1276 else {
1277 absA <<= shiftCount;
1278 }
1279 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1280 }
1281
1282}
1283
c4850f9e 1284float32 uint64_to_float32(uint64_t a STATUS_PARAM)
75d62a58
JM
1285{
1286 int8 shiftCount;
1287
f090c9d4 1288 if ( a == 0 ) return float32_zero;
75d62a58
JM
1289 shiftCount = countLeadingZeros64( a ) - 40;
1290 if ( 0 <= shiftCount ) {
e744c06f 1291 return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
75d62a58
JM
1292 }
1293 else {
1294 shiftCount += 7;
1295 if ( shiftCount < 0 ) {
1296 shift64RightJamming( a, - shiftCount, &a );
1297 }
1298 else {
1299 a <<= shiftCount;
1300 }
e744c06f 1301 return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
75d62a58
JM
1302 }
1303}
1304
158142c2
FB
1305/*----------------------------------------------------------------------------
1306| Returns the result of converting the 64-bit two's complement integer `a'
1307| to the double-precision floating-point format. The conversion is performed
1308| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1309*----------------------------------------------------------------------------*/
1310
c4850f9e 1311float64 int64_to_float64(int64_t a STATUS_PARAM)
158142c2
FB
1312{
1313 flag zSign;
1314
f090c9d4 1315 if ( a == 0 ) return float64_zero;
bb98fe42 1316 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1317 return packFloat64( 1, 0x43E, 0 );
1318 }
1319 zSign = ( a < 0 );
1320 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1321
1322}
1323
c4850f9e 1324float64 uint64_to_float64(uint64_t a STATUS_PARAM)
75d62a58 1325{
17ed2293 1326 int exp = 0x43C;
75d62a58 1327
17ed2293
RH
1328 if (a == 0) {
1329 return float64_zero;
1330 }
1331 if ((int64_t)a < 0) {
1332 shift64RightJamming(a, 1, &a);
1333 exp += 1;
1334 }
1335 return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
75d62a58
JM
1336}
1337
158142c2
FB
1338/*----------------------------------------------------------------------------
1339| Returns the result of converting the 64-bit two's complement integer `a'
1340| to the extended double-precision floating-point format. The conversion
1341| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1342| Arithmetic.
1343*----------------------------------------------------------------------------*/
1344
c4850f9e 1345floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
158142c2
FB
1346{
1347 flag zSign;
1348 uint64 absA;
1349 int8 shiftCount;
1350
1351 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1352 zSign = ( a < 0 );
1353 absA = zSign ? - a : a;
1354 shiftCount = countLeadingZeros64( absA );
1355 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1356
1357}
1358
158142c2
FB
1359/*----------------------------------------------------------------------------
1360| Returns the result of converting the 64-bit two's complement integer `a' to
1361| the quadruple-precision floating-point format. The conversion is performed
1362| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1363*----------------------------------------------------------------------------*/
1364
c4850f9e 1365float128 int64_to_float128(int64_t a STATUS_PARAM)
158142c2
FB
1366{
1367 flag zSign;
1368 uint64 absA;
1369 int8 shiftCount;
1370 int32 zExp;
bb98fe42 1371 uint64_t zSig0, zSig1;
158142c2
FB
1372
1373 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1374 zSign = ( a < 0 );
1375 absA = zSign ? - a : a;
1376 shiftCount = countLeadingZeros64( absA ) + 49;
1377 zExp = 0x406E - shiftCount;
1378 if ( 64 <= shiftCount ) {
1379 zSig1 = 0;
1380 zSig0 = absA;
1381 shiftCount -= 64;
1382 }
1383 else {
1384 zSig1 = absA;
1385 zSig0 = 0;
1386 }
1387 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1388 return packFloat128( zSign, zExp, zSig0, zSig1 );
1389
1390}
1391
c4850f9e 1392float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1e397ead
RH
1393{
1394 if (a == 0) {
1395 return float128_zero;
1396 }
1397 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1398}
1399
158142c2
FB
1400/*----------------------------------------------------------------------------
1401| Returns the result of converting the single-precision floating-point value
1402| `a' to the 32-bit two's complement integer format. The conversion is
1403| performed according to the IEC/IEEE Standard for Binary Floating-Point
1404| Arithmetic---which means in particular that the conversion is rounded
1405| according to the current rounding mode. If `a' is a NaN, the largest
1406| positive integer is returned. Otherwise, if the conversion overflows, the
1407| largest integer with the same sign as `a' is returned.
1408*----------------------------------------------------------------------------*/
1409
1410int32 float32_to_int32( float32 a STATUS_PARAM )
1411{
1412 flag aSign;
94a49d86 1413 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1414 uint32_t aSig;
1415 uint64_t aSig64;
158142c2 1416
37d18660 1417 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1418 aSig = extractFloat32Frac( a );
1419 aExp = extractFloat32Exp( a );
1420 aSign = extractFloat32Sign( a );
1421 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1422 if ( aExp ) aSig |= 0x00800000;
1423 shiftCount = 0xAF - aExp;
1424 aSig64 = aSig;
1425 aSig64 <<= 32;
1426 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1427 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1428
1429}
1430
1431/*----------------------------------------------------------------------------
1432| Returns the result of converting the single-precision floating-point value
1433| `a' to the 32-bit two's complement integer format. The conversion is
1434| performed according to the IEC/IEEE Standard for Binary Floating-Point
1435| Arithmetic, except that the conversion is always rounded toward zero.
1436| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1437| the conversion overflows, the largest integer with the same sign as `a' is
1438| returned.
1439*----------------------------------------------------------------------------*/
1440
1441int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1442{
1443 flag aSign;
94a49d86 1444 int_fast16_t aExp, shiftCount;
bb98fe42 1445 uint32_t aSig;
b3a6a2e0 1446 int32_t z;
37d18660 1447 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1448
1449 aSig = extractFloat32Frac( a );
1450 aExp = extractFloat32Exp( a );
1451 aSign = extractFloat32Sign( a );
1452 shiftCount = aExp - 0x9E;
1453 if ( 0 <= shiftCount ) {
f090c9d4 1454 if ( float32_val(a) != 0xCF000000 ) {
158142c2
FB
1455 float_raise( float_flag_invalid STATUS_VAR);
1456 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1457 }
bb98fe42 1458 return (int32_t) 0x80000000;
158142c2
FB
1459 }
1460 else if ( aExp <= 0x7E ) {
1461 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1462 return 0;
1463 }
1464 aSig = ( aSig | 0x00800000 )<<8;
1465 z = aSig>>( - shiftCount );
bb98fe42 1466 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
158142c2
FB
1467 STATUS(float_exception_flags) |= float_flag_inexact;
1468 }
1469 if ( aSign ) z = - z;
1470 return z;
1471
1472}
1473
cbcef455
PM
1474/*----------------------------------------------------------------------------
1475| Returns the result of converting the single-precision floating-point value
1476| `a' to the 16-bit two's complement integer format. The conversion is
1477| performed according to the IEC/IEEE Standard for Binary Floating-Point
1478| Arithmetic, except that the conversion is always rounded toward zero.
1479| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1480| the conversion overflows, the largest integer with the same sign as `a' is
1481| returned.
1482*----------------------------------------------------------------------------*/
1483
94a49d86 1484int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
1485{
1486 flag aSign;
94a49d86 1487 int_fast16_t aExp, shiftCount;
bb98fe42 1488 uint32_t aSig;
cbcef455
PM
1489 int32 z;
1490
1491 aSig = extractFloat32Frac( a );
1492 aExp = extractFloat32Exp( a );
1493 aSign = extractFloat32Sign( a );
1494 shiftCount = aExp - 0x8E;
1495 if ( 0 <= shiftCount ) {
1496 if ( float32_val(a) != 0xC7000000 ) {
1497 float_raise( float_flag_invalid STATUS_VAR);
1498 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1499 return 0x7FFF;
1500 }
1501 }
bb98fe42 1502 return (int32_t) 0xffff8000;
cbcef455
PM
1503 }
1504 else if ( aExp <= 0x7E ) {
1505 if ( aExp | aSig ) {
1506 STATUS(float_exception_flags) |= float_flag_inexact;
1507 }
1508 return 0;
1509 }
1510 shiftCount -= 0x10;
1511 aSig = ( aSig | 0x00800000 )<<8;
1512 z = aSig>>( - shiftCount );
bb98fe42 1513 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
cbcef455
PM
1514 STATUS(float_exception_flags) |= float_flag_inexact;
1515 }
1516 if ( aSign ) {
1517 z = - z;
1518 }
1519 return z;
1520
1521}
1522
158142c2
FB
1523/*----------------------------------------------------------------------------
1524| Returns the result of converting the single-precision floating-point value
1525| `a' to the 64-bit two's complement integer format. The conversion is
1526| performed according to the IEC/IEEE Standard for Binary Floating-Point
1527| Arithmetic---which means in particular that the conversion is rounded
1528| according to the current rounding mode. If `a' is a NaN, the largest
1529| positive integer is returned. Otherwise, if the conversion overflows, the
1530| largest integer with the same sign as `a' is returned.
1531*----------------------------------------------------------------------------*/
1532
1533int64 float32_to_int64( float32 a STATUS_PARAM )
1534{
1535 flag aSign;
94a49d86 1536 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1537 uint32_t aSig;
1538 uint64_t aSig64, aSigExtra;
37d18660 1539 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1540
1541 aSig = extractFloat32Frac( a );
1542 aExp = extractFloat32Exp( a );
1543 aSign = extractFloat32Sign( a );
1544 shiftCount = 0xBE - aExp;
1545 if ( shiftCount < 0 ) {
1546 float_raise( float_flag_invalid STATUS_VAR);
1547 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1548 return LIT64( 0x7FFFFFFFFFFFFFFF );
1549 }
bb98fe42 1550 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1551 }
1552 if ( aExp ) aSig |= 0x00800000;
1553 aSig64 = aSig;
1554 aSig64 <<= 40;
1555 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1556 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1557
1558}
1559
1560/*----------------------------------------------------------------------------
1561| Returns the result of converting the single-precision floating-point value
1562| `a' to the 64-bit two's complement integer format. The conversion is
1563| performed according to the IEC/IEEE Standard for Binary Floating-Point
1564| Arithmetic, except that the conversion is always rounded toward zero. If
1565| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1566| conversion overflows, the largest integer with the same sign as `a' is
1567| returned.
1568*----------------------------------------------------------------------------*/
1569
1570int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1571{
1572 flag aSign;
94a49d86 1573 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1574 uint32_t aSig;
1575 uint64_t aSig64;
158142c2 1576 int64 z;
37d18660 1577 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1578
1579 aSig = extractFloat32Frac( a );
1580 aExp = extractFloat32Exp( a );
1581 aSign = extractFloat32Sign( a );
1582 shiftCount = aExp - 0xBE;
1583 if ( 0 <= shiftCount ) {
f090c9d4 1584 if ( float32_val(a) != 0xDF000000 ) {
158142c2
FB
1585 float_raise( float_flag_invalid STATUS_VAR);
1586 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1587 return LIT64( 0x7FFFFFFFFFFFFFFF );
1588 }
1589 }
bb98fe42 1590 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1591 }
1592 else if ( aExp <= 0x7E ) {
1593 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1594 return 0;
1595 }
1596 aSig64 = aSig | 0x00800000;
1597 aSig64 <<= 40;
1598 z = aSig64>>( - shiftCount );
bb98fe42 1599 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
158142c2
FB
1600 STATUS(float_exception_flags) |= float_flag_inexact;
1601 }
1602 if ( aSign ) z = - z;
1603 return z;
1604
1605}
1606
1607/*----------------------------------------------------------------------------
1608| Returns the result of converting the single-precision floating-point value
1609| `a' to the double-precision floating-point format. The conversion is
1610| performed according to the IEC/IEEE Standard for Binary Floating-Point
1611| Arithmetic.
1612*----------------------------------------------------------------------------*/
1613
1614float64 float32_to_float64( float32 a STATUS_PARAM )
1615{
1616 flag aSign;
94a49d86 1617 int_fast16_t aExp;
bb98fe42 1618 uint32_t aSig;
37d18660 1619 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1620
1621 aSig = extractFloat32Frac( a );
1622 aExp = extractFloat32Exp( a );
1623 aSign = extractFloat32Sign( a );
1624 if ( aExp == 0xFF ) {
bcd4d9af 1625 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1626 return packFloat64( aSign, 0x7FF, 0 );
1627 }
1628 if ( aExp == 0 ) {
1629 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1630 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1631 --aExp;
1632 }
bb98fe42 1633 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1634
1635}
1636
158142c2
FB
1637/*----------------------------------------------------------------------------
1638| Returns the result of converting the single-precision floating-point value
1639| `a' to the extended double-precision floating-point format. The conversion
1640| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1641| Arithmetic.
1642*----------------------------------------------------------------------------*/
1643
1644floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1645{
1646 flag aSign;
94a49d86 1647 int_fast16_t aExp;
bb98fe42 1648 uint32_t aSig;
158142c2 1649
37d18660 1650 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1651 aSig = extractFloat32Frac( a );
1652 aExp = extractFloat32Exp( a );
1653 aSign = extractFloat32Sign( a );
1654 if ( aExp == 0xFF ) {
bcd4d9af 1655 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1656 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1657 }
1658 if ( aExp == 0 ) {
1659 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1660 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1661 }
1662 aSig |= 0x00800000;
bb98fe42 1663 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1664
1665}
1666
158142c2
FB
1667/*----------------------------------------------------------------------------
1668| Returns the result of converting the single-precision floating-point value
1669| `a' to the quadruple-precision floating-point format. The conversion is
1670| performed according to the IEC/IEEE Standard for Binary Floating-Point
1671| Arithmetic.
1672*----------------------------------------------------------------------------*/
1673
1674float128 float32_to_float128( float32 a STATUS_PARAM )
1675{
1676 flag aSign;
94a49d86 1677 int_fast16_t aExp;
bb98fe42 1678 uint32_t aSig;
158142c2 1679
37d18660 1680 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1681 aSig = extractFloat32Frac( a );
1682 aExp = extractFloat32Exp( a );
1683 aSign = extractFloat32Sign( a );
1684 if ( aExp == 0xFF ) {
bcd4d9af 1685 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1686 return packFloat128( aSign, 0x7FFF, 0, 0 );
1687 }
1688 if ( aExp == 0 ) {
1689 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1690 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1691 --aExp;
1692 }
bb98fe42 1693 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1694
1695}
1696
158142c2
FB
1697/*----------------------------------------------------------------------------
1698| Rounds the single-precision floating-point value `a' to an integer, and
1699| returns the result as a single-precision floating-point value. The
1700| operation is performed according to the IEC/IEEE Standard for Binary
1701| Floating-Point Arithmetic.
1702*----------------------------------------------------------------------------*/
1703
1704float32 float32_round_to_int( float32 a STATUS_PARAM)
1705{
1706 flag aSign;
94a49d86 1707 int_fast16_t aExp;
bb98fe42 1708 uint32_t lastBitMask, roundBitsMask;
158142c2 1709 int8 roundingMode;
bb98fe42 1710 uint32_t z;
37d18660 1711 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1712
1713 aExp = extractFloat32Exp( a );
1714 if ( 0x96 <= aExp ) {
1715 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1716 return propagateFloat32NaN( a, a STATUS_VAR );
1717 }
1718 return a;
1719 }
1720 if ( aExp <= 0x7E ) {
bb98fe42 1721 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
158142c2
FB
1722 STATUS(float_exception_flags) |= float_flag_inexact;
1723 aSign = extractFloat32Sign( a );
1724 switch ( STATUS(float_rounding_mode) ) {
1725 case float_round_nearest_even:
1726 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1727 return packFloat32( aSign, 0x7F, 0 );
1728 }
1729 break;
1730 case float_round_down:
f090c9d4 1731 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1732 case float_round_up:
f090c9d4 1733 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1734 }
1735 return packFloat32( aSign, 0, 0 );
1736 }
1737 lastBitMask = 1;
1738 lastBitMask <<= 0x96 - aExp;
1739 roundBitsMask = lastBitMask - 1;
f090c9d4 1740 z = float32_val(a);
158142c2
FB
1741 roundingMode = STATUS(float_rounding_mode);
1742 if ( roundingMode == float_round_nearest_even ) {
1743 z += lastBitMask>>1;
1744 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1745 }
1746 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 1747 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
1748 z += roundBitsMask;
1749 }
1750 }
1751 z &= ~ roundBitsMask;
f090c9d4
PB
1752 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1753 return make_float32(z);
158142c2
FB
1754
1755}
1756
1757/*----------------------------------------------------------------------------
1758| Returns the result of adding the absolute values of the single-precision
1759| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1760| before being returned. `zSign' is ignored if the result is a NaN.
1761| The addition is performed according to the IEC/IEEE Standard for Binary
1762| Floating-Point Arithmetic.
1763*----------------------------------------------------------------------------*/
1764
1765static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1766{
94a49d86 1767 int_fast16_t aExp, bExp, zExp;
bb98fe42 1768 uint32_t aSig, bSig, zSig;
94a49d86 1769 int_fast16_t expDiff;
158142c2
FB
1770
1771 aSig = extractFloat32Frac( a );
1772 aExp = extractFloat32Exp( a );
1773 bSig = extractFloat32Frac( b );
1774 bExp = extractFloat32Exp( b );
1775 expDiff = aExp - bExp;
1776 aSig <<= 6;
1777 bSig <<= 6;
1778 if ( 0 < expDiff ) {
1779 if ( aExp == 0xFF ) {
1780 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1781 return a;
1782 }
1783 if ( bExp == 0 ) {
1784 --expDiff;
1785 }
1786 else {
1787 bSig |= 0x20000000;
1788 }
1789 shift32RightJamming( bSig, expDiff, &bSig );
1790 zExp = aExp;
1791 }
1792 else if ( expDiff < 0 ) {
1793 if ( bExp == 0xFF ) {
1794 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1795 return packFloat32( zSign, 0xFF, 0 );
1796 }
1797 if ( aExp == 0 ) {
1798 ++expDiff;
1799 }
1800 else {
1801 aSig |= 0x20000000;
1802 }
1803 shift32RightJamming( aSig, - expDiff, &aSig );
1804 zExp = bExp;
1805 }
1806 else {
1807 if ( aExp == 0xFF ) {
1808 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1809 return a;
1810 }
fe76d976 1811 if ( aExp == 0 ) {
e6afc87f
PM
1812 if (STATUS(flush_to_zero)) {
1813 if (aSig | bSig) {
1814 float_raise(float_flag_output_denormal STATUS_VAR);
1815 }
1816 return packFloat32(zSign, 0, 0);
1817 }
fe76d976
PB
1818 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1819 }
158142c2
FB
1820 zSig = 0x40000000 + aSig + bSig;
1821 zExp = aExp;
1822 goto roundAndPack;
1823 }
1824 aSig |= 0x20000000;
1825 zSig = ( aSig + bSig )<<1;
1826 --zExp;
bb98fe42 1827 if ( (int32_t) zSig < 0 ) {
158142c2
FB
1828 zSig = aSig + bSig;
1829 ++zExp;
1830 }
1831 roundAndPack:
1832 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1833
1834}
1835
1836/*----------------------------------------------------------------------------
1837| Returns the result of subtracting the absolute values of the single-
1838| precision floating-point values `a' and `b'. If `zSign' is 1, the
1839| difference is negated before being returned. `zSign' is ignored if the
1840| result is a NaN. The subtraction is performed according to the IEC/IEEE
1841| Standard for Binary Floating-Point Arithmetic.
1842*----------------------------------------------------------------------------*/
1843
1844static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1845{
94a49d86 1846 int_fast16_t aExp, bExp, zExp;
bb98fe42 1847 uint32_t aSig, bSig, zSig;
94a49d86 1848 int_fast16_t expDiff;
158142c2
FB
1849
1850 aSig = extractFloat32Frac( a );
1851 aExp = extractFloat32Exp( a );
1852 bSig = extractFloat32Frac( b );
1853 bExp = extractFloat32Exp( b );
1854 expDiff = aExp - bExp;
1855 aSig <<= 7;
1856 bSig <<= 7;
1857 if ( 0 < expDiff ) goto aExpBigger;
1858 if ( expDiff < 0 ) goto bExpBigger;
1859 if ( aExp == 0xFF ) {
1860 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1861 float_raise( float_flag_invalid STATUS_VAR);
1862 return float32_default_nan;
1863 }
1864 if ( aExp == 0 ) {
1865 aExp = 1;
1866 bExp = 1;
1867 }
1868 if ( bSig < aSig ) goto aBigger;
1869 if ( aSig < bSig ) goto bBigger;
1870 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1871 bExpBigger:
1872 if ( bExp == 0xFF ) {
1873 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1874 return packFloat32( zSign ^ 1, 0xFF, 0 );
1875 }
1876 if ( aExp == 0 ) {
1877 ++expDiff;
1878 }
1879 else {
1880 aSig |= 0x40000000;
1881 }
1882 shift32RightJamming( aSig, - expDiff, &aSig );
1883 bSig |= 0x40000000;
1884 bBigger:
1885 zSig = bSig - aSig;
1886 zExp = bExp;
1887 zSign ^= 1;
1888 goto normalizeRoundAndPack;
1889 aExpBigger:
1890 if ( aExp == 0xFF ) {
1891 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1892 return a;
1893 }
1894 if ( bExp == 0 ) {
1895 --expDiff;
1896 }
1897 else {
1898 bSig |= 0x40000000;
1899 }
1900 shift32RightJamming( bSig, expDiff, &bSig );
1901 aSig |= 0x40000000;
1902 aBigger:
1903 zSig = aSig - bSig;
1904 zExp = aExp;
1905 normalizeRoundAndPack:
1906 --zExp;
1907 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1908
1909}
1910
1911/*----------------------------------------------------------------------------
1912| Returns the result of adding the single-precision floating-point values `a'
1913| and `b'. The operation is performed according to the IEC/IEEE Standard for
1914| Binary Floating-Point Arithmetic.
1915*----------------------------------------------------------------------------*/
1916
1917float32 float32_add( float32 a, float32 b STATUS_PARAM )
1918{
1919 flag aSign, bSign;
37d18660
PM
1920 a = float32_squash_input_denormal(a STATUS_VAR);
1921 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1922
1923 aSign = extractFloat32Sign( a );
1924 bSign = extractFloat32Sign( b );
1925 if ( aSign == bSign ) {
1926 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1927 }
1928 else {
1929 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1930 }
1931
1932}
1933
1934/*----------------------------------------------------------------------------
1935| Returns the result of subtracting the single-precision floating-point values
1936| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1937| for Binary Floating-Point Arithmetic.
1938*----------------------------------------------------------------------------*/
1939
1940float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1941{
1942 flag aSign, bSign;
37d18660
PM
1943 a = float32_squash_input_denormal(a STATUS_VAR);
1944 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
1945
1946 aSign = extractFloat32Sign( a );
1947 bSign = extractFloat32Sign( b );
1948 if ( aSign == bSign ) {
1949 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1950 }
1951 else {
1952 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1953 }
1954
1955}
1956
1957/*----------------------------------------------------------------------------
1958| Returns the result of multiplying the single-precision floating-point values
1959| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1960| for Binary Floating-Point Arithmetic.
1961*----------------------------------------------------------------------------*/
1962
1963float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1964{
1965 flag aSign, bSign, zSign;
94a49d86 1966 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
1967 uint32_t aSig, bSig;
1968 uint64_t zSig64;
1969 uint32_t zSig;
158142c2 1970
37d18660
PM
1971 a = float32_squash_input_denormal(a STATUS_VAR);
1972 b = float32_squash_input_denormal(b STATUS_VAR);
1973
158142c2
FB
1974 aSig = extractFloat32Frac( a );
1975 aExp = extractFloat32Exp( a );
1976 aSign = extractFloat32Sign( a );
1977 bSig = extractFloat32Frac( b );
1978 bExp = extractFloat32Exp( b );
1979 bSign = extractFloat32Sign( b );
1980 zSign = aSign ^ bSign;
1981 if ( aExp == 0xFF ) {
1982 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1983 return propagateFloat32NaN( a, b STATUS_VAR );
1984 }
1985 if ( ( bExp | bSig ) == 0 ) {
1986 float_raise( float_flag_invalid STATUS_VAR);
1987 return float32_default_nan;
1988 }
1989 return packFloat32( zSign, 0xFF, 0 );
1990 }
1991 if ( bExp == 0xFF ) {
1992 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1993 if ( ( aExp | aSig ) == 0 ) {
1994 float_raise( float_flag_invalid STATUS_VAR);
1995 return float32_default_nan;
1996 }
1997 return packFloat32( zSign, 0xFF, 0 );
1998 }
1999 if ( aExp == 0 ) {
2000 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2001 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2002 }
2003 if ( bExp == 0 ) {
2004 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2005 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2006 }
2007 zExp = aExp + bExp - 0x7F;
2008 aSig = ( aSig | 0x00800000 )<<7;
2009 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 2010 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 2011 zSig = zSig64;
bb98fe42 2012 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
2013 zSig <<= 1;
2014 --zExp;
2015 }
2016 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2017
2018}
2019
2020/*----------------------------------------------------------------------------
2021| Returns the result of dividing the single-precision floating-point value `a'
2022| by the corresponding value `b'. The operation is performed according to the
2023| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2024*----------------------------------------------------------------------------*/
2025
2026float32 float32_div( float32 a, float32 b STATUS_PARAM )
2027{
2028 flag aSign, bSign, zSign;
94a49d86 2029 int_fast16_t aExp, bExp, zExp;
bb98fe42 2030 uint32_t aSig, bSig, zSig;
37d18660
PM
2031 a = float32_squash_input_denormal(a STATUS_VAR);
2032 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2033
2034 aSig = extractFloat32Frac( a );
2035 aExp = extractFloat32Exp( a );
2036 aSign = extractFloat32Sign( a );
2037 bSig = extractFloat32Frac( b );
2038 bExp = extractFloat32Exp( b );
2039 bSign = extractFloat32Sign( b );
2040 zSign = aSign ^ bSign;
2041 if ( aExp == 0xFF ) {
2042 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2043 if ( bExp == 0xFF ) {
2044 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2045 float_raise( float_flag_invalid STATUS_VAR);
2046 return float32_default_nan;
2047 }
2048 return packFloat32( zSign, 0xFF, 0 );
2049 }
2050 if ( bExp == 0xFF ) {
2051 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2052 return packFloat32( zSign, 0, 0 );
2053 }
2054 if ( bExp == 0 ) {
2055 if ( bSig == 0 ) {
2056 if ( ( aExp | aSig ) == 0 ) {
2057 float_raise( float_flag_invalid STATUS_VAR);
2058 return float32_default_nan;
2059 }
2060 float_raise( float_flag_divbyzero STATUS_VAR);
2061 return packFloat32( zSign, 0xFF, 0 );
2062 }
2063 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2064 }
2065 if ( aExp == 0 ) {
2066 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2067 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2068 }
2069 zExp = aExp - bExp + 0x7D;
2070 aSig = ( aSig | 0x00800000 )<<7;
2071 bSig = ( bSig | 0x00800000 )<<8;
2072 if ( bSig <= ( aSig + aSig ) ) {
2073 aSig >>= 1;
2074 ++zExp;
2075 }
bb98fe42 2076 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2077 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2078 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2
FB
2079 }
2080 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2081
2082}
2083
2084/*----------------------------------------------------------------------------
2085| Returns the remainder of the single-precision floating-point value `a'
2086| with respect to the corresponding value `b'. The operation is performed
2087| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2088*----------------------------------------------------------------------------*/
2089
2090float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2091{
/* Computes the remainder of `a' with respect to `b'.  Special operands are
 * handled first; the general case reduces the significand of `a' modulo
 * that of `b', then picks whichever of the last two candidate remainders
 * is nearer to zero, using the parity of the quotient `q' to break ties. */
ed086f3d 2092 flag aSign, zSign;
94a49d86 2093 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
2094 uint32_t aSig, bSig;
2095 uint32_t q;
2096 uint64_t aSig64, bSig64, q64;
2097 uint32_t alternateASig;
2098 int32_t sigMean;
37d18660
PM
2099 a = float32_squash_input_denormal(a STATUS_VAR);
2100 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2101
2102 aSig = extractFloat32Frac( a );
2103 aExp = extractFloat32Exp( a );
2104 aSign = extractFloat32Sign( a );
2105 bSig = extractFloat32Frac( b );
2106 bExp = extractFloat32Exp( b );
158142c2
FB
/* `a' is infinity or NaN: NaNs propagate, an infinite `a' is invalid. */
2107 if ( aExp == 0xFF ) {
2108 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2109 return propagateFloat32NaN( a, b STATUS_VAR );
2110 }
2111 float_raise( float_flag_invalid STATUS_VAR);
2112 return float32_default_nan;
2113 }
/* `b' is infinity or NaN: with an infinite `b', `a' is returned as is. */
2114 if ( bExp == 0xFF ) {
2115 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2116 return a;
2117 }
/* `b' is zero (invalid) or subnormal (normalize it). */
2118 if ( bExp == 0 ) {
2119 if ( bSig == 0 ) {
2120 float_raise( float_flag_invalid STATUS_VAR);
2121 return float32_default_nan;
2122 }
2123 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2124 }
/* `a' is zero (returned unchanged) or subnormal (normalize it). */
2125 if ( aExp == 0 ) {
2126 if ( aSig == 0 ) return a;
2127 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2128 }
2129 expDiff = aExp - bExp;
/* Make the implicit leading bit of both significands explicit. */
2130 aSig |= 0x00800000;
2131 bSig |= 0x00800000;
/* Small exponent gap: one 64-by-32-bit division yields the quotient. */
2132 if ( expDiff < 32 ) {
2133 aSig <<= 8;
2134 bSig <<= 8;
2135 if ( expDiff < 0 ) {
/* The exponent of `a' is at least two below that of `b': `a' is
 * returned unchanged. */
2136 if ( expDiff < -1 ) return a;
2137 aSig >>= 1;
2138 }
2139 q = ( bSig <= aSig );
2140 if ( q ) aSig -= bSig;
2141 if ( 0 < expDiff ) {
bb98fe42 2142 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2143 q >>= 32 - expDiff;
2144 bSig >>= 2;
2145 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2146 }
2147 else {
2148 aSig >>= 2;
2149 bSig >>= 2;
2150 }
2151 }
/* Large exponent gap: reduce in 62-bit steps using quotient estimates
 * that are deliberately lowered by 2 (clamped at 0). */
2152 else {
2153 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2154 aSig64 = ( (uint64_t) aSig )<<40;
2155 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2156 expDiff -= 64;
2157 while ( 0 < expDiff ) {
2158 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2159 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
/* Unsigned negation: relies on modulo-2^64 wraparound. */
2160 aSig64 = - ( ( bSig * q64 )<<38 );
2161 expDiff -= 62;
2162 }
2163 expDiff += 64;
2164 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2165 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2166 q = q64>>( 64 - expDiff );
2167 bSig <<= 6;
2168 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2169 }
/* Subtract `bSig' until the remainder goes negative, remembering the last
 * value before each subtraction in `alternateASig'. */
2170 do {
2171 alternateASig = aSig;
2172 ++q;
2173 aSig -= bSig;
bb98fe42 2174 } while ( 0 <= (int32_t) aSig );
158142c2
FB
/* `sigMean' is the sum of the two candidates; its sign selects between
 * them, and on an exact tie (sum == 0) an odd `q' selects
 * `alternateASig'. */
2175 sigMean = aSig + alternateASig;
2176 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2177 aSig = alternateASig;
2178 }
/* A negative remainder flips the result sign relative to `a'. */
bb98fe42 2179 zSign = ( (int32_t) aSig < 0 );
158142c2
FB
2180 if ( zSign ) aSig = - aSig;
2181 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2182
2183}
2184
369be8f6
PM
2185/*----------------------------------------------------------------------------
2186| Returns the result of multiplying the single-precision floating-point values
2187| `a' and `b' then adding 'c', with no intermediate rounding step after the
2188| multiplication. The operation is performed according to the IEC/IEEE
2189| Standard for Binary Floating-Point Arithmetic 754-2008.
2190| The flags argument allows the caller to select negation of the
2191| addend, the intermediate product, or the final result. (The difference
2192| between this and having the caller do a separate negation is that negating
2193| externally will flip the sign bit on NaNs.)
2194*----------------------------------------------------------------------------*/
2195
2196float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2197{
/* Outline: (1) squash input denormals and unpack all three operands;
 * (2) deal with NaN, infinity and zero operands; (3) form the 64-bit
 * product significand; (4) align it with c and add or subtract;
 * (5) narrow to 32 bits and round once via roundAndPackFloat32(). */
2198 flag aSign, bSign, cSign, zSign;
94a49d86 2199 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2200 uint32_t aSig, bSig, cSig;
2201 flag pInf, pZero, pSign;
2202 uint64_t pSig64, cSig64, zSig64;
2203 uint32_t pSig;
2204 int shiftcount;
2205 flag signflip, infzero;
2206
2207 a = float32_squash_input_denormal(a STATUS_VAR);
2208 b = float32_squash_input_denormal(b STATUS_VAR);
2209 c = float32_squash_input_denormal(c STATUS_VAR);
2210 aSig = extractFloat32Frac(a);
2211 aExp = extractFloat32Exp(a);
2212 aSign = extractFloat32Sign(a);
2213 bSig = extractFloat32Frac(b);
2214 bExp = extractFloat32Exp(b);
2215 bSign = extractFloat32Sign(b);
2216 cSig = extractFloat32Frac(c);
2217 cExp = extractFloat32Exp(c);
2218 cSign = extractFloat32Sign(c);
2219
/* infzero: one multiplicand is an exact zero and the other an infinity. */
2220 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2221 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2222
2223 /* It is implementation-defined whether the cases of (0,inf,qnan)
2224 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2225 * they return if they do), so we have to hand this information
2226 * off to the target-specific pick-a-NaN routine.
2227 */
2228 if (((aExp == 0xff) && aSig) ||
2229 ((bExp == 0xff) && bSig) ||
2230 ((cExp == 0xff) && cSig)) {
2231 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2232 }
2233
/* No NaN operands remain: zero times infinity is simply invalid. */
2234 if (infzero) {
2235 float_raise(float_flag_invalid STATUS_VAR);
2236 return float32_default_nan;
2237 }
2238
2239 if (flags & float_muladd_negate_c) {
2240 cSign ^= 1;
2241 }
2242
/* signflip is XORed into the sign of every non-NaN result below. */
2243 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2244
2245 /* Work out the sign and type of the product */
2246 pSign = aSign ^ bSign;
2247 if (flags & float_muladd_negate_product) {
2248 pSign ^= 1;
2249 }
2250 pInf = (aExp == 0xff) || (bExp == 0xff);
2251 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2252
/* c is an infinity here, since NaNs were dispatched above. */
2253 if (cExp == 0xff) {
2254 if (pInf && (pSign ^ cSign)) {
2255 /* addition of opposite-signed infinities => InvalidOperation */
2256 float_raise(float_flag_invalid STATUS_VAR);
2257 return float32_default_nan;
2258 }
2259 /* Otherwise generate an infinity of the same sign */
2260 return packFloat32(cSign ^ signflip, 0xff, 0);
2261 }
2262
/* Infinite product with finite c: the result is the product's infinity. */
2263 if (pInf) {
2264 return packFloat32(pSign ^ signflip, 0xff, 0);
2265 }
2266
2267 if (pZero) {
2268 if (cExp == 0) {
2269 if (cSig == 0) {
2270 /* Adding two exact zeroes */
2271 if (pSign == cSign) {
2272 zSign = pSign;
2273 } else if (STATUS(float_rounding_mode) == float_round_down) {
2274 zSign = 1;
2275 } else {
2276 zSign = 0;
2277 }
2278 return packFloat32(zSign ^ signflip, 0, 0);
2279 }
2280 /* Exact zero plus a denorm */
2281 if (STATUS(flush_to_zero)) {
2282 float_raise(float_flag_output_denormal STATUS_VAR);
2283 return packFloat32(cSign ^ signflip, 0, 0);
2284 }
2285 }
2286 /* Zero plus something non-zero : just return the something */
a6e7c184 2287 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2288 }
2289
/* Both multiplicands are non-zero and finite; normalize subnormals. */
2290 if (aExp == 0) {
2291 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2292 }
2293 if (bExp == 0) {
2294 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2295 }
2296
2297 /* Calculate the actual result a * b + c */
2298
2299 /* Multiply first; this is easy. */
2300 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2301 * because we want the true exponent, not the "one-less-than"
2302 * flavour that roundAndPackFloat32() takes.
2303 */
2304 pExp = aExp + bExp - 0x7e;
2305 aSig = (aSig | 0x00800000) << 7;
2306 bSig = (bSig | 0x00800000) << 8;
2307 pSig64 = (uint64_t)aSig * bSig;
/* If bit 62 is clear, shift left one place and adjust the exponent. */
2308 if ((int64_t)(pSig64 << 1) >= 0) {
2309 pSig64 <<= 1;
2310 pExp--;
2311 }
2312
2313 zSign = pSign ^ signflip;
2314
2315 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2316 * position 62.
2317 */
2318 if (cExp == 0) {
2319 if (!cSig) {
2320 /* Throw out the special case of c being an exact zero now */
2321 shift64RightJamming(pSig64, 32, &pSig64);
2322 pSig = pSig64;
2323 return roundAndPackFloat32(zSign, pExp - 1,
2324 pSig STATUS_VAR);
2325 }
2326 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2327 }
2328
/* Give c the same layout as the product: fraction moved up by 39 bits and
 * the leading bit set at position 62. */
2329 cSig64 = (uint64_t)cSig << (62 - 23);
2330 cSig64 |= LIT64(0x4000000000000000);
2331 expDiff = pExp - cExp;
2332
2333 if (pSign == cSign) {
2334 /* Addition */
2335 if (expDiff > 0) {
2336 /* scale c to match p */
2337 shift64RightJamming(cSig64, expDiff, &cSig64);
2338 zExp = pExp;
2339 } else if (expDiff < 0) {
2340 /* scale p to match c */
2341 shift64RightJamming(pSig64, -expDiff, &pSig64);
2342 zExp = cExp;
2343 } else {
2344 /* no scaling needed */
2345 zExp = cExp;
2346 }
2347 /* Add significands and make sure explicit bit ends up in posn 62 */
2348 zSig64 = pSig64 + cSig64;
2349 if ((int64_t)zSig64 < 0) {
2350 shift64RightJamming(zSig64, 1, &zSig64);
2351 } else {
2352 zExp--;
2353 }
2354 } else {
2355 /* Subtraction */
/* The smaller-exponent operand is shifted; when c ends up as the minuend
 * the result sign is flipped. */
2356 if (expDiff > 0) {
2357 shift64RightJamming(cSig64, expDiff, &cSig64);
2358 zSig64 = pSig64 - cSig64;
2359 zExp = pExp;
2360 } else if (expDiff < 0) {
2361 shift64RightJamming(pSig64, -expDiff, &pSig64);
2362 zSig64 = cSig64 - pSig64;
2363 zExp = cExp;
2364 zSign ^= 1;
2365 } else {
2366 zExp = pExp;
2367 if (cSig64 < pSig64) {
2368 zSig64 = pSig64 - cSig64;
2369 } else if (pSig64 < cSig64) {
2370 zSig64 = cSig64 - pSig64;
2371 zSign ^= 1;
2372 } else {
2373 /* Exact zero */
2374 zSign = signflip;
2375 if (STATUS(float_rounding_mode) == float_round_down) {
2376 zSign ^= 1;
2377 }
2378 return packFloat32(zSign, 0, 0);
2379 }
2380 }
2381 --zExp;
2382 /* Normalize to put the explicit bit back into bit 62. */
2383 shiftcount = countLeadingZeros64(zSig64) - 1;
2384 zSig64 <<= shiftcount;
2385 zExp -= shiftcount;
2386 }
/* Narrow to the 32-bit significand that roundAndPackFloat32() takes.
 * NOTE(review): shift64RightJamming is defined elsewhere; by its name it
 * presumably folds the shifted-out bits into the LSB - confirm. */
2387 shift64RightJamming(zSig64, 32, &zSig64);
2388 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2389}
2390
2391
158142c2
FB
2392/*----------------------------------------------------------------------------
2393| Returns the square root of the single-precision floating-point value `a'.
2394| The operation is performed according to the IEC/IEEE Standard for Binary
2395| Floating-Point Arithmetic.
2396*----------------------------------------------------------------------------*/
2397
2398float32 float32_sqrt( float32 a STATUS_PARAM )
2399{
2400 flag aSign;
94a49d86 2401 int_fast16_t aExp, zExp;
bb98fe42
AF
2402 uint32_t aSig, zSig;
2403 uint64_t rem, term;
37d18660 2404 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2405
2406 aSig = extractFloat32Frac( a );
2407 aExp = extractFloat32Exp( a );
2408 aSign = extractFloat32Sign( a );
2409 if ( aExp == 0xFF ) {
f090c9d4 2410 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
158142c2
FB
2411 if ( ! aSign ) return a;
2412 float_raise( float_flag_invalid STATUS_VAR);
2413 return float32_default_nan;
2414 }
2415 if ( aSign ) {
2416 if ( ( aExp | aSig ) == 0 ) return a;
2417 float_raise( float_flag_invalid STATUS_VAR);
2418 return float32_default_nan;
2419 }
2420 if ( aExp == 0 ) {
f090c9d4 2421 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2422 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2423 }
2424 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2425 aSig = ( aSig | 0x00800000 )<<8;
2426 zSig = estimateSqrt32( aExp, aSig ) + 2;
2427 if ( ( zSig & 0x7F ) <= 5 ) {
2428 if ( zSig < 2 ) {
2429 zSig = 0x7FFFFFFF;
2430 goto roundAndPack;
2431 }
2432 aSig >>= aExp & 1;
bb98fe42
AF
2433 term = ( (uint64_t) zSig ) * zSig;
2434 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2435 while ( (int64_t) rem < 0 ) {
158142c2 2436 --zSig;
bb98fe42 2437 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2438 }
2439 zSig |= ( rem != 0 );
2440 }
2441 shift32RightJamming( zSig, 1, &zSig );
2442 roundAndPack:
2443 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2444
2445}
2446
8229c991
AJ
2447/*----------------------------------------------------------------------------
2448| Returns the binary exponential of the single-precision floating-point value
2449| `a'. The operation is performed according to the IEC/IEEE Standard for
2450| Binary Floating-Point Arithmetic.
2451|
2452| Uses the following identities:
2453|
2454| 1. -------------------------------------------------------------------------
2455|      x        x*ln(2)
2456|     2     =  e
2457|
2458| 2. -------------------------------------------------------------------------
2459|                      2     3     4     5           n
2460|      x        x     x     x     x     x           x
2461|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2462|               1!    2!    3!    4!    5!          n!
2463*----------------------------------------------------------------------------*/
2464
/* Taylor-series coefficients used by float32_exp2(): entry i holds 1/(i+1)!
 * as a double-precision bit pattern and multiplies the (i+1)-th power of the
 * argument. */
2465static const float64 float32_exp2_coefficients[15] =
2466{
d5138cf4
PM
2467 const_float64( 0x3ff0000000000000ll ), /* 1/1! */
2468 const_float64( 0x3fe0000000000000ll ), /* 1/2! */
2469 const_float64( 0x3fc5555555555555ll ), /* 1/3! */
2470 const_float64( 0x3fa5555555555555ll ), /* 1/4! */
2471 const_float64( 0x3f81111111111111ll ), /* 1/5! */
2472 const_float64( 0x3f56c16c16c16c17ll ), /* 1/6! */
2473 const_float64( 0x3f2a01a01a01a01all ), /* 1/7! */
2474 const_float64( 0x3efa01a01a01a01all ), /* 1/8! */
2475 const_float64( 0x3ec71de3a556c734ll ), /* 1/9! */
2476 const_float64( 0x3e927e4fb7789f5cll ), /* 1/10! */
2477 const_float64( 0x3e5ae64567f544e4ll ), /* 1/11! */
2478 const_float64( 0x3e21eed8eff8d898ll ), /* 1/12! */
2479 const_float64( 0x3de6124613a86d09ll ), /* 1/13! */
2480 const_float64( 0x3da93974a8c07c9dll ), /* 1/14! */
2481 const_float64( 0x3d6ae7f3e733b81fll ), /* 1/15! */
8229c991
AJ
2482};
2483
2484float32 float32_exp2( float32 a STATUS_PARAM )
2485{
2486 flag aSign;
94a49d86 2487 int_fast16_t aExp;
bb98fe42 2488 uint32_t aSig;
8229c991
AJ
2489 float64 r, x, xn;
2490 int i;
37d18660 2491 a = float32_squash_input_denormal(a STATUS_VAR);
8229c991
AJ
2492
2493 aSig = extractFloat32Frac( a );
2494 aExp = extractFloat32Exp( a );
2495 aSign = extractFloat32Sign( a );
2496
2497 if ( aExp == 0xFF) {
2498 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2499 return (aSign) ? float32_zero : a;
2500 }
2501 if (aExp == 0) {
2502 if (aSig == 0) return float32_one;
2503 }
2504
2505 float_raise( float_flag_inexact STATUS_VAR);
2506
2507 /* ******************************* */
2508 /* using float64 for approximation */
2509 /* ******************************* */
2510 x = float32_to_float64(a STATUS_VAR);
2511 x = float64_mul(x, float64_ln2 STATUS_VAR);
2512
2513 xn = x;
2514 r = float64_one;
2515 for (i = 0 ; i < 15 ; i++) {
2516 float64 f;
2517
2518 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2519 r = float64_add(r, f STATUS_VAR);
2520
2521 xn = float64_mul(xn, x STATUS_VAR);
2522 }
2523
2524 return float64_to_float32(r, status);
2525}
2526
374dfc33
AJ
2527/*----------------------------------------------------------------------------
2528| Returns the binary log of the single-precision floating-point value `a'.
2529| The operation is performed according to the IEC/IEEE Standard for Binary
2530| Floating-Point Arithmetic.
2531*----------------------------------------------------------------------------*/
2532float32 float32_log2( float32 a STATUS_PARAM )
2533{
2534 flag aSign, zSign;
94a49d86 2535 int_fast16_t aExp;
bb98fe42 2536 uint32_t aSig, zSig, i;
374dfc33 2537
37d18660 2538 a = float32_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
2539 aSig = extractFloat32Frac( a );
2540 aExp = extractFloat32Exp( a );
2541 aSign = extractFloat32Sign( a );
2542
2543 if ( aExp == 0 ) {
2544 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2545 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2546 }
2547 if ( aSign ) {
2548 float_raise( float_flag_invalid STATUS_VAR);
2549 return float32_default_nan;
2550 }
2551 if ( aExp == 0xFF ) {
2552 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2553 return a;
2554 }
2555
2556 aExp -= 0x7F;
2557 aSig |= 0x00800000;
2558 zSign = aExp < 0;
2559 zSig = aExp << 23;
2560
2561 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2562 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2563 if ( aSig & 0x01000000 ) {
2564 aSig >>= 1;
2565 zSig |= i;
2566 }
2567 }
2568
2569 if ( zSign )
2570 zSig = -zSig;
2571
2572 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2573}
2574
158142c2
FB
2575/*----------------------------------------------------------------------------
2576| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2577| the corresponding value `b', and 0 otherwise. The invalid exception is
2578| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2579| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2580*----------------------------------------------------------------------------*/
2581
b689362d 2582int float32_eq( float32 a, float32 b STATUS_PARAM )
158142c2 2583{
b689362d 2584 uint32_t av, bv;
37d18660
PM
2585 a = float32_squash_input_denormal(a STATUS_VAR);
2586 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2587
2588 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2589 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2590 ) {
b689362d 2591 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
2592 return 0;
2593 }
b689362d
AJ
2594 av = float32_val(a);
2595 bv = float32_val(b);
2596 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2597}
2598
2599/*----------------------------------------------------------------------------
2600| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2601| or equal to the corresponding value `b', and 0 otherwise. The invalid
2602| exception is raised if either operand is a NaN. The comparison is performed
2603| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2604*----------------------------------------------------------------------------*/
2605
750afe93 2606int float32_le( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2607{
2608 flag aSign, bSign;
bb98fe42 2609 uint32_t av, bv;
37d18660
PM
2610 a = float32_squash_input_denormal(a STATUS_VAR);
2611 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2612
2613 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2614 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2615 ) {
2616 float_raise( float_flag_invalid STATUS_VAR);
2617 return 0;
2618 }
2619 aSign = extractFloat32Sign( a );
2620 bSign = extractFloat32Sign( b );
f090c9d4
PB
2621 av = float32_val(a);
2622 bv = float32_val(b);
bb98fe42 2623 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2624 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2625
2626}
2627
2628/*----------------------------------------------------------------------------
2629| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2630| the corresponding value `b', and 0 otherwise. The invalid exception is
2631| raised if either operand is a NaN. The comparison is performed according
2632| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2633*----------------------------------------------------------------------------*/
2634
750afe93 2635int float32_lt( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2636{
2637 flag aSign, bSign;
bb98fe42 2638 uint32_t av, bv;
37d18660
PM
2639 a = float32_squash_input_denormal(a STATUS_VAR);
2640 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2641
2642 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2643 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2644 ) {
2645 float_raise( float_flag_invalid STATUS_VAR);
2646 return 0;
2647 }
2648 aSign = extractFloat32Sign( a );
2649 bSign = extractFloat32Sign( b );
f090c9d4
PB
2650 av = float32_val(a);
2651 bv = float32_val(b);
bb98fe42 2652 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2653 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2654
2655}
2656
67b7861d
AJ
2657/*----------------------------------------------------------------------------
2658| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2659| be compared, and 0 otherwise. The invalid exception is raised if either
2660| operand is a NaN. The comparison is performed according to the IEC/IEEE
2661| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2662*----------------------------------------------------------------------------*/
2663
2664int float32_unordered( float32 a, float32 b STATUS_PARAM )
2665{
2666 a = float32_squash_input_denormal(a STATUS_VAR);
2667 b = float32_squash_input_denormal(b STATUS_VAR);
2668
2669 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2670 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2671 ) {
2672 float_raise( float_flag_invalid STATUS_VAR);
2673 return 1;
2674 }
2675 return 0;
2676}
b689362d 2677
158142c2
FB
2678/*----------------------------------------------------------------------------
2679| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2680| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2681| exception. The comparison is performed according to the IEC/IEEE Standard
2682| for Binary Floating-Point Arithmetic.
158142c2
FB
2683*----------------------------------------------------------------------------*/
2684
b689362d 2685int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
158142c2 2686{
37d18660
PM
2687 a = float32_squash_input_denormal(a STATUS_VAR);
2688 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2689
2690 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2691 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2692 ) {
b689362d
AJ
2693 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2694 float_raise( float_flag_invalid STATUS_VAR);
2695 }
158142c2
FB
2696 return 0;
2697 }
b689362d
AJ
2698 return ( float32_val(a) == float32_val(b) ) ||
2699 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2700}
2701
2702/*----------------------------------------------------------------------------
2703| Returns 1 if the single-precision floating-point value `a' is less than or
2704| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2705| cause an exception. Otherwise, the comparison is performed according to the
2706| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2707*----------------------------------------------------------------------------*/
2708
750afe93 2709int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2710{
2711 flag aSign, bSign;
bb98fe42 2712 uint32_t av, bv;
37d18660
PM
2713 a = float32_squash_input_denormal(a STATUS_VAR);
2714 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2715
2716 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2717 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2718 ) {
2719 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2720 float_raise( float_flag_invalid STATUS_VAR);
2721 }
2722 return 0;
2723 }
2724 aSign = extractFloat32Sign( a );
2725 bSign = extractFloat32Sign( b );
f090c9d4
PB
2726 av = float32_val(a);
2727 bv = float32_val(b);
bb98fe42 2728 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2729 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2730
2731}
2732
2733/*----------------------------------------------------------------------------
2734| Returns 1 if the single-precision floating-point value `a' is less than
2735| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2736| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2737| Standard for Binary Floating-Point Arithmetic.
2738*----------------------------------------------------------------------------*/
2739
750afe93 2740int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2741{
2742 flag aSign, bSign;
bb98fe42 2743 uint32_t av, bv;
37d18660
PM
2744 a = float32_squash_input_denormal(a STATUS_VAR);
2745 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2746
2747 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2748 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2749 ) {
2750 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2751 float_raise( float_flag_invalid STATUS_VAR);
2752 }
2753 return 0;
2754 }
2755 aSign = extractFloat32Sign( a );
2756 bSign = extractFloat32Sign( b );
f090c9d4
PB
2757 av = float32_val(a);
2758 bv = float32_val(b);
bb98fe42 2759 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2760 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2761
2762}
2763
67b7861d
AJ
2764/*----------------------------------------------------------------------------
2765| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2766| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2767| comparison is performed according to the IEC/IEEE Standard for Binary
2768| Floating-Point Arithmetic.
2769*----------------------------------------------------------------------------*/
2770
2771int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2772{
2773 a = float32_squash_input_denormal(a STATUS_VAR);
2774 b = float32_squash_input_denormal(b STATUS_VAR);
2775
2776 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2777 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2778 ) {
2779 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2780 float_raise( float_flag_invalid STATUS_VAR);
2781 }
2782 return 1;
2783 }
2784 return 0;
2785}
2786
158142c2
FB
2787/*----------------------------------------------------------------------------
2788| Returns the result of converting the double-precision floating-point value
2789| `a' to the 32-bit two's complement integer format. The conversion is
2790| performed according to the IEC/IEEE Standard for Binary Floating-Point
2791| Arithmetic---which means in particular that the conversion is rounded
2792| according to the current rounding mode. If `a' is a NaN, the largest
2793| positive integer is returned. Otherwise, if the conversion overflows, the
2794| largest integer with the same sign as `a' is returned.
2795*----------------------------------------------------------------------------*/
2796
2797int32 float64_to_int32( float64 a STATUS_PARAM )
2798{
2799 flag aSign;
94a49d86 2800 int_fast16_t aExp, shiftCount;
bb98fe42 2801 uint64_t aSig;
37d18660 2802 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2803
2804 aSig = extractFloat64Frac( a );
2805 aExp = extractFloat64Exp( a );
2806 aSign = extractFloat64Sign( a );
2807 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2808 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2809 shiftCount = 0x42C - aExp;
2810 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2811 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2812
2813}
2814
2815/*----------------------------------------------------------------------------
2816| Returns the result of converting the double-precision floating-point value
2817| `a' to the 32-bit two's complement integer format. The conversion is
2818| performed according to the IEC/IEEE Standard for Binary Floating-Point
2819| Arithmetic, except that the conversion is always rounded toward zero.
2820| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2821| the conversion overflows, the largest integer with the same sign as `a' is
2822| returned.
2823*----------------------------------------------------------------------------*/
2824
2825int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2826{
2827 flag aSign;
94a49d86 2828 int_fast16_t aExp, shiftCount;
bb98fe42 2829 uint64_t aSig, savedASig;
b3a6a2e0 2830 int32_t z;
37d18660 2831 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2832
2833 aSig = extractFloat64Frac( a );
2834 aExp = extractFloat64Exp( a );
2835 aSign = extractFloat64Sign( a );
2836 if ( 0x41E < aExp ) {
2837 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2838 goto invalid;
2839 }
2840 else if ( aExp < 0x3FF ) {
2841 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2842 return 0;
2843 }
2844 aSig |= LIT64( 0x0010000000000000 );
2845 shiftCount = 0x433 - aExp;
2846 savedASig = aSig;
2847 aSig >>= shiftCount;
2848 z = aSig;
2849 if ( aSign ) z = - z;
2850 if ( ( z < 0 ) ^ aSign ) {
2851 invalid:
2852 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2853 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
2854 }
2855 if ( ( aSig<<shiftCount ) != savedASig ) {
2856 STATUS(float_exception_flags) |= float_flag_inexact;
2857 }
2858 return z;
2859
2860}
2861
cbcef455
PM
2862/*----------------------------------------------------------------------------
2863| Returns the result of converting the double-precision floating-point value
2864| `a' to the 16-bit two's complement integer format. The conversion is
2865| performed according to the IEC/IEEE Standard for Binary Floating-Point
2866| Arithmetic, except that the conversion is always rounded toward zero.
2867| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2868| the conversion overflows, the largest integer with the same sign as `a' is
2869| returned.
2870*----------------------------------------------------------------------------*/
2871
94a49d86 2872int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
2873{
2874 flag aSign;
94a49d86 2875 int_fast16_t aExp, shiftCount;
bb98fe42 2876 uint64_t aSig, savedASig;
cbcef455
PM
2877 int32 z;
2878
2879 aSig = extractFloat64Frac( a );
2880 aExp = extractFloat64Exp( a );
2881 aSign = extractFloat64Sign( a );
2882 if ( 0x40E < aExp ) {
2883 if ( ( aExp == 0x7FF ) && aSig ) {
2884 aSign = 0;
2885 }
2886 goto invalid;
2887 }
2888 else if ( aExp < 0x3FF ) {
2889 if ( aExp || aSig ) {
2890 STATUS(float_exception_flags) |= float_flag_inexact;
2891 }
2892 return 0;
2893 }
2894 aSig |= LIT64( 0x0010000000000000 );
2895 shiftCount = 0x433 - aExp;
2896 savedASig = aSig;
2897 aSig >>= shiftCount;
2898 z = aSig;
2899 if ( aSign ) {
2900 z = - z;
2901 }
2902 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2903 invalid:
2904 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 2905 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
2906 }
2907 if ( ( aSig<<shiftCount ) != savedASig ) {
2908 STATUS(float_exception_flags) |= float_flag_inexact;
2909 }
2910 return z;
2911}
2912
158142c2
FB
2913/*----------------------------------------------------------------------------
2914| Returns the result of converting the double-precision floating-point value
2915| `a' to the 64-bit two's complement integer format. The conversion is
2916| performed according to the IEC/IEEE Standard for Binary Floating-Point
2917| Arithmetic---which means in particular that the conversion is rounded
2918| according to the current rounding mode. If `a' is a NaN, the largest
2919| positive integer is returned. Otherwise, if the conversion overflows, the
2920| largest integer with the same sign as `a' is returned.
2921*----------------------------------------------------------------------------*/
2922
2923int64 float64_to_int64( float64 a STATUS_PARAM )
2924{
2925 flag aSign;
94a49d86 2926 int_fast16_t aExp, shiftCount;
bb98fe42 2927 uint64_t aSig, aSigExtra;
37d18660 2928 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2929
2930 aSig = extractFloat64Frac( a );
2931 aExp = extractFloat64Exp( a );
2932 aSign = extractFloat64Sign( a );
2933 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2934 shiftCount = 0x433 - aExp;
2935 if ( shiftCount <= 0 ) {
2936 if ( 0x43E < aExp ) {
2937 float_raise( float_flag_invalid STATUS_VAR);
2938 if ( ! aSign
2939 || ( ( aExp == 0x7FF )
2940 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2941 ) {
2942 return LIT64( 0x7FFFFFFFFFFFFFFF );
2943 }
bb98fe42 2944 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2945 }
2946 aSigExtra = 0;
2947 aSig <<= - shiftCount;
2948 }
2949 else {
2950 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2951 }
2952 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2953
2954}
2955
2956/*----------------------------------------------------------------------------
2957| Returns the result of converting the double-precision floating-point value
2958| `a' to the 64-bit two's complement integer format. The conversion is
2959| performed according to the IEC/IEEE Standard for Binary Floating-Point
2960| Arithmetic, except that the conversion is always rounded toward zero.
2961| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2962| the conversion overflows, the largest integer with the same sign as `a' is
2963| returned.
2964*----------------------------------------------------------------------------*/
2965
2966int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2967{
2968 flag aSign;
94a49d86 2969 int_fast16_t aExp, shiftCount;
bb98fe42 2970 uint64_t aSig;
158142c2 2971 int64 z;
37d18660 2972 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2973
2974 aSig = extractFloat64Frac( a );
2975 aExp = extractFloat64Exp( a );
2976 aSign = extractFloat64Sign( a );
2977 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2978 shiftCount = aExp - 0x433;
2979 if ( 0 <= shiftCount ) {
2980 if ( 0x43E <= aExp ) {
f090c9d4 2981 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
158142c2
FB
2982 float_raise( float_flag_invalid STATUS_VAR);
2983 if ( ! aSign
2984 || ( ( aExp == 0x7FF )
2985 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2986 ) {
2987 return LIT64( 0x7FFFFFFFFFFFFFFF );
2988 }
2989 }
bb98fe42 2990 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2991 }
2992 z = aSig<<shiftCount;
2993 }
2994 else {
2995 if ( aExp < 0x3FE ) {
2996 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2997 return 0;
2998 }
2999 z = aSig>>( - shiftCount );
bb98fe42 3000 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
3001 STATUS(float_exception_flags) |= float_flag_inexact;
3002 }
3003 }
3004 if ( aSign ) z = - z;
3005 return z;
3006
3007}
3008
3009/*----------------------------------------------------------------------------
3010| Returns the result of converting the double-precision floating-point value
3011| `a' to the single-precision floating-point format. The conversion is
3012| performed according to the IEC/IEEE Standard for Binary Floating-Point
3013| Arithmetic.
3014*----------------------------------------------------------------------------*/
3015
3016float32 float64_to_float32( float64 a STATUS_PARAM )
3017{
3018 flag aSign;
94a49d86 3019 int_fast16_t aExp;
bb98fe42
AF
3020 uint64_t aSig;
3021 uint32_t zSig;
37d18660 3022 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3023
3024 aSig = extractFloat64Frac( a );
3025 aExp = extractFloat64Exp( a );
3026 aSign = extractFloat64Sign( a );
3027 if ( aExp == 0x7FF ) {
bcd4d9af 3028 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3029 return packFloat32( aSign, 0xFF, 0 );
3030 }
3031 shift64RightJamming( aSig, 22, &aSig );
3032 zSig = aSig;
3033 if ( aExp || zSig ) {
3034 zSig |= 0x40000000;
3035 aExp -= 0x381;
3036 }
3037 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3038
3039}
3040
60011498
PB
3041
3042/*----------------------------------------------------------------------------
3043| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3044| half-precision floating-point value, returning the result. After being
3045| shifted into the proper positions, the three fields are simply added
3046| together to form the result. This means that any integer portion of `zSig'
3047| will be added into the exponent. Since a properly normalized significand
3048| will have an integer portion equal to 1, the `zExp' input should be 1 less
3049| than the desired result exponent whenever `zSig' is a complete, normalized
3050| significand.
3051*----------------------------------------------------------------------------*/
94a49d86 3052static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
60011498 3053{
bb4d4bb3 3054 return make_float16(
bb98fe42 3055 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3056}
3057
3058/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3059 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3
PM
3060
3061float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
60011498
PB
3062{
3063 flag aSign;
94a49d86 3064 int_fast16_t aExp;
bb98fe42 3065 uint32_t aSig;
60011498 3066
bb4d4bb3
PM
3067 aSign = extractFloat16Sign(a);
3068 aExp = extractFloat16Exp(a);
3069 aSig = extractFloat16Frac(a);
60011498
PB
3070
3071 if (aExp == 0x1f && ieee) {
3072 if (aSig) {
f591e1be 3073 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3074 }
4be8eeac 3075 return packFloat32(aSign, 0xff, 0);
60011498
PB
3076 }
3077 if (aExp == 0) {
3078 int8 shiftCount;
3079
3080 if (aSig == 0) {
3081 return packFloat32(aSign, 0, 0);
3082 }
3083
3084 shiftCount = countLeadingZeros32( aSig ) - 21;
3085 aSig = aSig << shiftCount;
3086 aExp = -shiftCount;
3087 }
3088 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3089}
3090
bb4d4bb3 3091float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
60011498
PB
3092{
3093 flag aSign;
94a49d86 3094 int_fast16_t aExp;
bb98fe42
AF
3095 uint32_t aSig;
3096 uint32_t mask;
3097 uint32_t increment;
60011498 3098 int8 roundingMode;
38970efa
PM
3099 int maxexp = ieee ? 15 : 16;
3100 bool rounding_bumps_exp;
3101 bool is_tiny = false;
3102
37d18660 3103 a = float32_squash_input_denormal(a STATUS_VAR);
60011498
PB
3104
3105 aSig = extractFloat32Frac( a );
3106 aExp = extractFloat32Exp( a );
3107 aSign = extractFloat32Sign( a );
3108 if ( aExp == 0xFF ) {
3109 if (aSig) {
600e30d2 3110 /* Input is a NaN */
600e30d2 3111 if (!ieee) {
38970efa 3112 float_raise(float_flag_invalid STATUS_VAR);
600e30d2
PM
3113 return packFloat16(aSign, 0, 0);
3114 }
38970efa
PM
3115 return commonNaNToFloat16(
3116 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3117 }
600e30d2
PM
3118 /* Infinity */
3119 if (!ieee) {
3120 float_raise(float_flag_invalid STATUS_VAR);
3121 return packFloat16(aSign, 0x1f, 0x3ff);
3122 }
3123 return packFloat16(aSign, 0x1f, 0);
60011498 3124 }
600e30d2 3125 if (aExp == 0 && aSig == 0) {
60011498
PB
3126 return packFloat16(aSign, 0, 0);
3127 }
38970efa
PM
3128 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3129 * even if the input is denormal; however this is harmless because
3130 * the largest possible single-precision denormal is still smaller
3131 * than the smallest representable half-precision denormal, and so we
3132 * will end up ignoring aSig and returning via the "always return zero"
3133 * codepath.
3134 */
60011498
PB
3135 aSig |= 0x00800000;
3136 aExp -= 0x7f;
38970efa
PM
3137 /* Calculate the mask of bits of the mantissa which are not
3138 * representable in half-precision and will be lost.
3139 */
60011498 3140 if (aExp < -14) {
38970efa 3141 /* Will be denormal in halfprec */
600e30d2
PM
3142 mask = 0x00ffffff;
3143 if (aExp >= -24) {
3144 mask >>= 25 + aExp;
60011498
PB
3145 }
3146 } else {
38970efa 3147 /* Normal number in halfprec */
60011498
PB
3148 mask = 0x00001fff;
3149 }
60011498 3150
38970efa
PM
3151 roundingMode = STATUS(float_rounding_mode);
3152 switch (roundingMode) {
3153 case float_round_nearest_even:
3154 increment = (mask + 1) >> 1;
3155 if ((aSig & mask) == increment) {
3156 increment = aSig & (increment << 1);
3157 }
3158 break;
3159 case float_round_up:
3160 increment = aSign ? 0 : mask;
3161 break;
3162 case float_round_down:
3163 increment = aSign ? mask : 0;
3164 break;
3165 default: /* round_to_zero */
3166 increment = 0;
3167 break;
3168 }
3169
3170 rounding_bumps_exp = (aSig + increment >= 0x01000000);
3171
3172 if (aExp > maxexp || (aExp == maxexp && rounding_bumps_exp)) {
3173 if (ieee) {
3174 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
60011498 3175 return packFloat16(aSign, 0x1f, 0);
38970efa
PM
3176 } else {
3177 float_raise(float_flag_invalid STATUS_VAR);
60011498
PB
3178 return packFloat16(aSign, 0x1f, 0x3ff);
3179 }
3180 }
38970efa
PM
3181
3182 if (aExp < -14) {
3183 /* Note that flush-to-zero does not affect half-precision results */
3184 is_tiny =
3185 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3186 || (aExp < -15)
3187 || (!rounding_bumps_exp);
3188 }
3189 if (aSig & mask) {
3190 float_raise(float_flag_inexact STATUS_VAR);
3191 if (is_tiny) {
3192 float_raise(float_flag_underflow STATUS_VAR);
3193 }
3194 }
3195
3196 aSig += increment;
3197 if (rounding_bumps_exp) {
3198 aSig >>= 1;
3199 aExp++;
3200 }
3201
60011498
PB
3202 if (aExp < -24) {
3203 return packFloat16(aSign, 0, 0);
3204 }
3205 if (aExp < -14) {
3206 aSig >>= -14 - aExp;
3207 aExp = -14;
3208 }
3209 return packFloat16(aSign, aExp + 14, aSig >> 13);
3210}
3211
158142c2
FB
3212/*----------------------------------------------------------------------------
3213| Returns the result of converting the double-precision floating-point value
3214| `a' to the extended double-precision floating-point format. The conversion
3215| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3216| Arithmetic.
3217*----------------------------------------------------------------------------*/
3218
3219floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3220{
3221 flag aSign;
94a49d86 3222 int_fast16_t aExp;
bb98fe42 3223 uint64_t aSig;
158142c2 3224
37d18660 3225 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3226 aSig = extractFloat64Frac( a );
3227 aExp = extractFloat64Exp( a );
3228 aSign = extractFloat64Sign( a );
3229 if ( aExp == 0x7FF ) {
bcd4d9af 3230 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3231 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3232 }
3233 if ( aExp == 0 ) {
3234 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3235 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3236 }
3237 return
3238 packFloatx80(
3239 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3240
3241}
3242
158142c2
FB
3243/*----------------------------------------------------------------------------
3244| Returns the result of converting the double-precision floating-point value
3245| `a' to the quadruple-precision floating-point format. The conversion is
3246| performed according to the IEC/IEEE Standard for Binary Floating-Point
3247| Arithmetic.
3248*----------------------------------------------------------------------------*/
3249
3250float128 float64_to_float128( float64 a STATUS_PARAM )
3251{
3252 flag aSign;
94a49d86 3253 int_fast16_t aExp;
bb98fe42 3254 uint64_t aSig, zSig0, zSig1;
158142c2 3255
37d18660 3256 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3257 aSig = extractFloat64Frac( a );
3258 aExp = extractFloat64Exp( a );
3259 aSign = extractFloat64Sign( a );
3260 if ( aExp == 0x7FF ) {
bcd4d9af 3261 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3262 return packFloat128( aSign, 0x7FFF, 0, 0 );
3263 }
3264 if ( aExp == 0 ) {
3265 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3266 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3267 --aExp;
3268 }
3269 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3270 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3271
3272}
3273
158142c2
FB
3274/*----------------------------------------------------------------------------
3275| Rounds the double-precision floating-point value `a' to an integer, and
3276| returns the result as a double-precision floating-point value. The
3277| operation is performed according to the IEC/IEEE Standard for Binary
3278| Floating-Point Arithmetic.
3279*----------------------------------------------------------------------------*/
3280
3281float64 float64_round_to_int( float64 a STATUS_PARAM )
3282{
3283 flag aSign;
94a49d86 3284 int_fast16_t aExp;
bb98fe42 3285 uint64_t lastBitMask, roundBitsMask;
158142c2 3286 int8 roundingMode;
bb98fe42 3287 uint64_t z;
37d18660 3288 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3289
3290 aExp = extractFloat64Exp( a );
3291 if ( 0x433 <= aExp ) {
3292 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3293 return propagateFloat64NaN( a, a STATUS_VAR );
3294 }
3295 return a;
3296 }
3297 if ( aExp < 0x3FF ) {
bb98fe42 3298 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
158142c2
FB
3299 STATUS(float_exception_flags) |= float_flag_inexact;
3300 aSign = extractFloat64Sign( a );
3301 switch ( STATUS(float_rounding_mode) ) {
3302 case float_round_nearest_even:
3303 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3304 return packFloat64( aSign, 0x3FF, 0 );
3305 }
3306 break;
3307 case float_round_down:
f090c9d4 3308 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3309 case float_round_up:
f090c9d4
PB
3310 return make_float64(
3311 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3312 }
3313 return packFloat64( aSign, 0, 0 );
3314 }
3315 lastBitMask = 1;
3316 lastBitMask <<= 0x433 - aExp;
3317 roundBitsMask = lastBitMask - 1;
f090c9d4 3318 z = float64_val(a);
158142c2
FB
3319 roundingMode = STATUS(float_rounding_mode);
3320 if ( roundingMode == float_round_nearest_even ) {
3321 z += lastBitMask>>1;
3322 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3323 }
3324 else if ( roundingMode != float_round_to_zero ) {
f090c9d4 3325 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
158142c2
FB
3326 z += roundBitsMask;
3327 }
3328 }
3329 z &= ~ roundBitsMask;
f090c9d4
PB
3330 if ( z != float64_val(a) )
3331 STATUS(float_exception_flags) |= float_flag_inexact;
3332 return make_float64(z);
158142c2
FB
3333
3334}
3335
e6e5906b
PB
3336float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3337{
3338 int oldmode;
3339 float64 res;
3340 oldmode = STATUS(float_rounding_mode);
3341 STATUS(float_rounding_mode) = float_round_to_zero;
3342 res = float64_round_to_int(a STATUS_VAR);
3343 STATUS(float_rounding_mode) = oldmode;
3344 return res;
3345}
3346
158142c2
FB
3347/*----------------------------------------------------------------------------
3348| Returns the result of adding the absolute values of the double-precision
3349| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3350| before being returned. `zSign' is ignored if the result is a NaN.
3351| The addition is performed according to the IEC/IEEE Standard for Binary
3352| Floating-Point Arithmetic.
3353*----------------------------------------------------------------------------*/
3354
3355static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3356{
94a49d86 3357 int_fast16_t aExp, bExp, zExp;
bb98fe42 3358 uint64_t aSig, bSig, zSig;
94a49d86 3359 int_fast16_t expDiff;
158142c2
FB
3360
3361 aSig = extractFloat64Frac( a );
3362 aExp = extractFloat64Exp( a );
3363 bSig = extractFloat64Frac( b );
3364 bExp = extractFloat64Exp( b );
3365 expDiff = aExp - bExp;
3366 aSig <<= 9;
3367 bSig <<= 9;
3368 if ( 0 < expDiff ) {
3369 if ( aExp == 0x7FF ) {
3370 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3371 return a;
3372 }
3373 if ( bExp == 0 ) {
3374 --expDiff;
3375 }
3376 else {
3377 bSig |= LIT64( 0x2000000000000000 );
3378 }
3379 shift64RightJamming( bSig, expDiff, &bSig );
3380 zExp = aExp;
3381 }
3382 else if ( expDiff < 0 ) {
3383 if ( bExp == 0x7FF ) {
3384 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3385 return packFloat64( zSign, 0x7FF, 0 );
3386 }
3387 if ( aExp == 0 ) {
3388 ++expDiff;
3389 }
3390 else {
3391 aSig |= LIT64( 0x2000000000000000 );
3392 }
3393 shift64RightJamming( aSig, - expDiff, &aSig );
3394 zExp = bExp;
3395 }
3396 else {
3397 if ( aExp == 0x7FF ) {
3398 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3399 return a;
3400 }
fe76d976 3401 if ( aExp == 0 ) {
e6afc87f
PM
3402 if (STATUS(flush_to_zero)) {
3403 if (aSig | bSig) {
3404 float_raise(float_flag_output_denormal STATUS_VAR);
3405 }
3406 return packFloat64(zSign, 0, 0);
3407 }
fe76d976
PB
3408 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3409 }
158142c2
FB
3410 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3411 zExp = aExp;
3412 goto roundAndPack;
3413 }
3414 aSig |= LIT64( 0x2000000000000000 );
3415 zSig = ( aSig + bSig )<<1;
3416 --zExp;
bb98fe42 3417 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3418 zSig = aSig + bSig;
3419 ++zExp;
3420 }
3421 roundAndPack:
3422 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3423
3424}
3425
3426/*----------------------------------------------------------------------------
3427| Returns the result of subtracting the absolute values of the double-
3428| precision floating-point values `a' and `b'. If `zSign' is 1, the
3429| difference is negated before being returned. `zSign' is ignored if the
3430| result is a NaN. The subtraction is performed according to the IEC/IEEE
3431| Standard for Binary Floating-Point Arithmetic.
3432*----------------------------------------------------------------------------*/
3433
/* Magnitude subtraction |a| - |b|.  The result carries sign zSign when |a|
 * is the larger magnitude and the opposite sign when |b| is larger.  NaN
 * operands go through propagateFloat64NaN(); infinity minus infinity raises
 * the invalid exception and yields float64_default_nan.
 */
3434static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3435{
94a49d86 3436 int_fast16_t aExp, bExp, zExp;
bb98fe42 3437 uint64_t aSig, bSig, zSig;
94a49d86 3438 int_fast16_t expDiff;
158142c2
FB
3439
3440 aSig = extractFloat64Frac( a );
3441 aExp = extractFloat64Exp( a );
3442 bSig = extractFloat64Frac( b );
3443 bExp = extractFloat64Exp( b );
3444 expDiff = aExp - bExp;
/* Fractions are moved up 10 bits; the implicit leading one, where present,
 * is OR-ed in at bit 62 on the paths below.
 */
3445 aSig <<= 10;
3446 bSig <<= 10;
3447 if ( 0 < expDiff ) goto aExpBigger;
3448 if ( expDiff < 0 ) goto bExpBigger;
/* From here on the two exponent fields are equal. */
3449 if ( aExp == 0x7FF ) {
3450 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3451 float_raise( float_flag_invalid STATUS_VAR);
3452 return float64_default_nan;
3453 }
/* Both operands have a zero exponent field: use exponent 1 and add no
 * implicit bit.
 */
3454 if ( aExp == 0 ) {
3455 aExp = 1;
3456 bExp = 1;
3457 }
3458 if ( bSig < aSig ) goto aBigger;
3459 if ( aSig < bSig ) goto bBigger;
/* Equal magnitudes: a zero is returned, with the sign bit set only when
 * the rounding mode is float_round_down.
 */
3460 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3461 bExpBigger:
/* b is infinite while a has a smaller exponent: infinity with the sign
 * inverted.
 */
3462 if ( bExp == 0x7FF ) {
3463 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3464 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3465 }
/* expDiff is negative here; a zero exponent field on a means no implicit
 * bit and a shift distance that is one smaller.
 */
3466 if ( aExp == 0 ) {
3467 ++expDiff;
3468 }
3469 else {
3470 aSig |= LIT64( 0x4000000000000000 );
3471 }
3472 shift64RightJamming( aSig, - expDiff, &aSig );
3473 bSig |= LIT64( 0x4000000000000000 );
3474 bBigger:
/* |b| is larger: subtract the other way round and invert the sign. */
3475 zSig = bSig - aSig;
3476 zExp = bExp;
3477 zSign ^= 1;
3478 goto normalizeRoundAndPack;
3479 aExpBigger:
3480 if ( aExp == 0x7FF ) {
3481 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3482 return a;
3483 }
/* Mirror image of the bExpBigger alignment, with b shifted right. */
3484 if ( bExp == 0 ) {
3485 --expDiff;
3486 }
3487 else {
3488 bSig |= LIT64( 0x4000000000000000 );
3489 }
3490 shift64RightJamming( bSig, expDiff, &bSig );
3491 aSig |= LIT64( 0x4000000000000000 );
3492 aBigger:
3493 zSig = aSig - bSig;
3494 zExp = aExp;
3495 normalizeRoundAndPack:
/* Both the aBigger and bBigger paths end here. */
3496 --zExp;
3497 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3498
3499}
3500
3501/*----------------------------------------------------------------------------
3502| Returns the result of adding the double-precision floating-point values `a'
3503| and `b'. The operation is performed according to the IEC/IEEE Standard for
3504| Binary Floating-Point Arithmetic.
3505*----------------------------------------------------------------------------*/
3506
3507float64 float64_add( float64 a, float64 b STATUS_PARAM )
3508{
3509 flag aSign, bSign;
37d18660
PM
3510 a = float64_squash_input_denormal(a STATUS_VAR);
3511 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3512
3513 aSign = extractFloat64Sign( a );
3514 bSign = extractFloat64Sign( b );
3515 if ( aSign == bSign ) {
3516 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3517 }
3518 else {
3519 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3520 }
3521
3522}
3523
3524/*----------------------------------------------------------------------------
3525| Returns the result of subtracting the double-precision floating-point values
3526| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3527| for Binary Floating-Point Arithmetic.
3528*----------------------------------------------------------------------------*/
3529
3530float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3531{
3532 flag aSign, bSign;
37d18660
PM
3533 a = float64_squash_input_denormal(a STATUS_VAR);
3534 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3535
3536 aSign = extractFloat64Sign( a );
3537 bSign = extractFloat64Sign( b );
3538 if ( aSign == bSign ) {
3539 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3540 }
3541 else {
3542 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3543 }
3544
3545}
3546
3547/*----------------------------------------------------------------------------
3548| Returns the result of multiplying the double-precision floating-point values
3549| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3550| for Binary Floating-Point Arithmetic.
3551*----------------------------------------------------------------------------*/
3552
3553float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3554{
3555 flag aSign, bSign, zSign;
94a49d86 3556 int_fast16_t aExp, bExp, zExp;
bb98fe42 3557 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3558
37d18660
PM
3559 a = float64_squash_input_denormal(a STATUS_VAR);
3560 b = float64_squash_input_denormal(b STATUS_VAR);
3561
158142c2
FB
3562 aSig = extractFloat64Frac( a );
3563 aExp = extractFloat64Exp( a );
3564 aSign = extractFloat64Sign( a );
3565 bSig = extractFloat64Frac( b );
3566 bExp = extractFloat64Exp( b );
3567 bSign = extractFloat64Sign( b );
3568 zSign = aSign ^ bSign;
3569 if ( aExp == 0x7FF ) {
3570 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3571 return propagateFloat64NaN( a, b STATUS_VAR );
3572 }
3573 if ( ( bExp | bSig ) == 0 ) {
3574 float_raise( float_flag_invalid STATUS_VAR);
3575 return float64_default_nan;
3576 }
3577 return packFloat64( zSign, 0x7FF, 0 );
3578 }
3579 if ( bExp == 0x7FF ) {
3580 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3581 if ( ( aExp | aSig ) == 0 ) {
3582 float_raise( float_flag_invalid STATUS_VAR);
3583 return float64_default_nan;
3584 }
3585 return packFloat64( zSign, 0x7FF, 0 );
3586 }
3587 if ( aExp == 0 ) {
3588 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3589 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3590 }
3591 if ( bExp == 0 ) {
3592 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3593 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3594 }
3595 zExp = aExp + bExp - 0x3FF;
3596 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3597 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3598 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3599 zSig0 |= ( zSig1 != 0 );
bb98fe42 3600 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
3601 zSig0 <<= 1;
3602 --zExp;
3603 }
3604 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3605
3606}
3607
3608/*----------------------------------------------------------------------------
3609| Returns the result of dividing the double-precision floating-point value `a'
3610| by the corresponding value `b'. The operation is performed according to
3611| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3612*----------------------------------------------------------------------------*/
3613
/* Quotient a / b.  Handled before the general path: NaN operands
 * (propagated), infinity / infinity and zero / zero (invalid, default NaN),
 * infinity / finite (infinity), finite / infinity (zero), non-zero / zero
 * (divide-by-zero flag, infinity) and zero / non-zero (zero).  The result
 * sign is the XOR of the operand signs.
 */
3614float64 float64_div( float64 a, float64 b STATUS_PARAM )
3615{
3616 flag aSign, bSign, zSign;
94a49d86 3617 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
3618 uint64_t aSig, bSig, zSig;
3619 uint64_t rem0, rem1;
3620 uint64_t term0, term1;
37d18660
PM
3621 a = float64_squash_input_denormal(a STATUS_VAR);
3622 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3623
3624 aSig = extractFloat64Frac( a );
3625 aExp = extractFloat64Exp( a );
3626 aSign = extractFloat64Sign( a );
3627 bSig = extractFloat64Frac( b );
3628 bExp = extractFloat64Exp( b );
3629 bSign = extractFloat64Sign( b );
3630 zSign = aSign ^ bSign;
/* a is a NaN or an infinity. */
3631 if ( aExp == 0x7FF ) {
3632 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3633 if ( bExp == 0x7FF ) {
3634 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3635 float_raise( float_flag_invalid STATUS_VAR);
3636 return float64_default_nan;
3637 }
3638 return packFloat64( zSign, 0x7FF, 0 );
3639 }
/* b is a NaN or an infinity while a is finite. */
3640 if ( bExp == 0x7FF ) {
3641 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3642 return packFloat64( zSign, 0, 0 );
3643 }
/* b is zero or subnormal. */
3644 if ( bExp == 0 ) {
3645 if ( bSig == 0 ) {
3646 if ( ( aExp | aSig ) == 0 ) {
3647 float_raise( float_flag_invalid STATUS_VAR);
3648 return float64_default_nan;
3649 }
3650 float_raise( float_flag_divbyzero STATUS_VAR);
3651 return packFloat64( zSign, 0x7FF, 0 );
3652 }
3653 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3654 }
3655 if ( aExp == 0 ) {
3656 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3657 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3658 }
3659 zExp = aExp - bExp + 0x3FD;
/* Attach the implicit ones; a's leading bit lands at bit 62, b's at bit 63. */
3660 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3661 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
/* When 2*aSig is not below bSig, halve aSig and raise the exponent. */
3662 if ( bSig <= ( aSig + aSig ) ) {
3663 aSig >>= 1;
3664 ++zExp;
3665 }
3666 zSig = estimateDiv128To64( aSig, 0, bSig );
/* Only when the low 9 bits of the estimate are 0, 1 or 2 is the exact
 * remainder aSig:0 - bSig*zSig computed.  The estimate is decremented
 * until the remainder is non-negative, and a non-zero low remainder word
 * sets the sticky bit.
 */
3667 if ( ( zSig & 0x1FF ) <= 2 ) {
3668 mul64To128( bSig, zSig, &term0, &term1 );
3669 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3670 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3671 --zSig;
3672 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3673 }
3674 zSig |= ( rem1 != 0 );
3675 }
3676 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3677
3678}
3679
3680/*----------------------------------------------------------------------------
3681| Returns the remainder of the double-precision floating-point value `a'
3682| with respect to the corresponding value `b'. The operation is performed
3683| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3684*----------------------------------------------------------------------------*/
3685
/* Remainder of a with respect to b.  Handled before the general path: NaN
 * operands (propagated), infinite a or zero b (invalid, default NaN),
 * infinite b or zero a (a returned unchanged).
 */
3686float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3687{
ed086f3d 3688 flag aSign, zSign;
94a49d86 3689 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
3690 uint64_t aSig, bSig;
3691 uint64_t q, alternateASig;
3692 int64_t sigMean;
158142c2 3693
37d18660
PM
3694 a = float64_squash_input_denormal(a STATUS_VAR);
3695 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3696 aSig = extractFloat64Frac( a );
3697 aExp = extractFloat64Exp( a );
3698 aSign = extractFloat64Sign( a );
3699 bSig = extractFloat64Frac( b );
3700 bExp = extractFloat64Exp( b );
158142c2
FB
/* a is a NaN or an infinity. */
3701 if ( aExp == 0x7FF ) {
3702 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3703 return propagateFloat64NaN( a, b STATUS_VAR );
3704 }
3705 float_raise( float_flag_invalid STATUS_VAR);
3706 return float64_default_nan;
3707 }
/* b is a NaN or an infinity while a is finite. */
3708 if ( bExp == 0x7FF ) {
3709 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3710 return a;
3711 }
3712 if ( bExp == 0 ) {
3713 if ( bSig == 0 ) {
3714 float_raise( float_flag_invalid STATUS_VAR);
3715 return float64_default_nan;
3716 }
3717 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3718 }
3719 if ( aExp == 0 ) {
3720 if ( aSig == 0 ) return a;
3721 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3722 }
3723 expDiff = aExp - bExp;
/* Attach the implicit ones and left-align both significands at bit 63. */
3724 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3725 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
/* a's exponent two or more below b's: a is returned unchanged.  Exactly
 * one below: aSig is halved so both use b's scale.
 */
3726 if ( expDiff < 0 ) {
3727 if ( expDiff < -1 ) return a;
3728 aSig >>= 1;
3729 }
/* First quotient bit. */
3730 q = ( bSig <= aSig );
3731 if ( q ) aSig -= bSig;
3732 expDiff -= 64;
/* Each pass consumes 62 of the exponent difference, using the quotient
 * estimate reduced by 2 (floored at 0).
 */
3733 while ( 0 < expDiff ) {
3734 q = estimateDiv128To64( aSig, 0, bSig );
3735 q = ( 2 < q ) ? q - 2 : 0;
3736 aSig = - ( ( bSig>>2 ) * q );
3737 expDiff -= 62;
3738 }
3739 expDiff += 64;
/* Last partial step over the remaining expDiff; on both branches bSig ends
 * up shifted right by 2.
 */
3740 if ( 0 < expDiff ) {
3741 q = estimateDiv128To64( aSig, 0, bSig );
3742 q = ( 2 < q ) ? q - 2 : 0;
3743 q >>= 64 - expDiff;
3744 bSig >>= 2;
3745 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3746 }
3747 else {
3748 aSig >>= 2;
3749 bSig >>= 2;
3750 }
/* Subtract bSig until aSig, read as signed, goes negative; alternateASig
 * keeps the value from just before the last subtraction.
 */
3751 do {
3752 alternateASig = aSig;
3753 ++q;
3754 aSig -= bSig;
bb98fe42 3755 } while ( 0 <= (int64_t) aSig );
158142c2
FB
/* Pick between the negative value and the one before it: the earlier value
 * is taken when their sum is negative, or is zero with q odd.
 */
3756 sigMean = aSig + alternateASig;
3757 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3758 aSig = alternateASig;
3759 }
/* A negative choice is negated and flips the sign taken from a. */
bb98fe42 3760 zSign = ( (int64_t) aSig < 0 );
158142c2
FB
3761 if ( zSign ) aSig = - aSig;
3762 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3763
3764}
3765
369be8f6
PM
3766/*----------------------------------------------------------------------------
3767| Returns the result of multiplying the double-precision floating-point values
3768| `a' and `b' then adding 'c', with no intermediate rounding step after the
3769| multiplication. The operation is performed according to the IEC/IEEE
3770| Standard for Binary Floating-Point Arithmetic 754-2008.
3771| The flags argument allows the caller to select negation of the
3772| addend, the intermediate product, or the final result. (The difference
3773| between this and having the caller do a separate negation is that negating
3774| externally will flip the sign bit on NaNs.)
3775*----------------------------------------------------------------------------*/
3776
/* Fused multiply-add a * b + c.  The `flags' bits tested here are
 * float_muladd_negate_c (invert the addend's sign),
 * float_muladd_negate_product (invert the product's sign) and
 * float_muladd_negate_result (invert the sign of the value returned).
 */
3777float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3778{
3779 flag aSign, bSign, cSign, zSign;
94a49d86 3780 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
3781 uint64_t aSig, bSig, cSig;
3782 flag pInf, pZero, pSign;
3783 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3784 int shiftcount;
3785 flag signflip, infzero;
3786
3787 a = float64_squash_input_denormal(a STATUS_VAR);
3788 b = float64_squash_input_denormal(b STATUS_VAR);
3789 c = float64_squash_input_denormal(c STATUS_VAR);
3790 aSig = extractFloat64Frac(a);
3791 aExp = extractFloat64Exp(a);
3792 aSign = extractFloat64Sign(a);
3793 bSig = extractFloat64Frac(b);
3794 bExp = extractFloat64Exp(b);
3795 bSign = extractFloat64Sign(b);
3796 cSig = extractFloat64Frac(c);
3797 cExp = extractFloat64Exp(c);
3798 cSign = extractFloat64Sign(c);
3799
/* infzero: one multiplicand is a zero and the other an infinity. */
3800 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3801 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3802
3803 /* It is implementation-defined whether the cases of (0,inf,qnan)
3804 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3805 * they return if they do), so we have to hand this information
3806 * off to the target-specific pick-a-NaN routine.
3807 */
3808 if (((aExp == 0x7ff) && aSig) ||
3809 ((bExp == 0x7ff) && bSig) ||
3810 ((cExp == 0x7ff) && cSig)) {
3811 return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3812 }
3813
3814 if (infzero) {
3815 float_raise(float_flag_invalid STATUS_VAR);
3816 return float64_default_nan;
3817 }
3818
3819 if (flags & float_muladd_negate_c) {
3820 cSign ^= 1;
3821 }
3822
3823 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3824
3825 /* Work out the sign and type of the product */
3826 pSign = aSign ^ bSign;
3827 if (flags & float_muladd_negate_product) {
3828 pSign ^= 1;
3829 }
3830 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3831 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3832
/* NaNs were dealt with above, so cExp == 0x7ff means c is an infinity. */
3833 if (cExp == 0x7ff) {
3834 if (pInf && (pSign ^ cSign)) {
3835 /* addition of opposite-signed infinities => InvalidOperation */
3836 float_raise(float_flag_invalid STATUS_VAR);
3837 return float64_default_nan;
3838 }
3839 /* Otherwise generate an infinity of the same sign */
3840 return packFloat64(cSign ^ signflip, 0x7ff, 0);
3841 }
3842
3843 if (pInf) {
3844 return packFloat64(pSign ^ signflip, 0x7ff, 0);
3845 }
3846
3847 if (pZero) {
3848 if (cExp == 0) {
3849 if (cSig == 0) {
3850 /* Adding two exact zeroes */
3851 if (pSign == cSign) {
3852 zSign = pSign;
3853 } else if (STATUS(float_rounding_mode) == float_round_down) {
3854 zSign = 1;
3855 } else {
3856 zSign = 0;
3857 }
3858 return packFloat64(zSign ^ signflip, 0, 0);
3859 }
3860 /* Exact zero plus a denorm */
3861 if (STATUS(flush_to_zero)) {
3862 float_raise(float_flag_output_denormal STATUS_VAR);
3863 return packFloat64(cSign ^ signflip, 0, 0);
3864 }
3865 }
3866 /* Zero plus something non-zero : just return the something */
a6e7c184 3867 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
3868 }
3869
/* Zero multiplicands returned above, so a zero exponent field here means a
 * subnormal.
 */
3870 if (aExp == 0) {
3871 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3872 }
3873 if (bExp == 0) {
3874 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3875 }
3876
3877 /* Calculate the actual result a * b + c */
3878
3879 /* Multiply first; this is easy. */
3880 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3881 * because we want the true exponent, not the "one-less-than"
3882 * flavour that roundAndPackFloat64() takes.
3883 */
3884 pExp = aExp + bExp - 0x3fe;
3885 aSig = (aSig | LIT64(0x0010000000000000))<<10;
3886 bSig = (bSig | LIT64(0x0010000000000000))<<11;
3887 mul64To128(aSig, bSig, &pSig0, &pSig1);
/* If bit 62 of the high word is clear, shift the 128-bit product left by
 * one and lower the exponent.
 */
3888 if ((int64_t)(pSig0 << 1) >= 0) {
3889 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3890 pExp--;
3891 }
3892
3893 zSign = pSign ^ signflip;
3894
3895 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3896 * bit in position 126.
3897 */
3898 if (cExp == 0) {
3899 if (!cSig) {
3900 /* Throw out the special case of c being an exact zero now */
/* The 64-bit right shift with jamming leaves the product's high word in
 * pSig1, which is the 64-bit significand passed to roundAndPackFloat64().
 */
3901 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3902 return roundAndPackFloat64(zSign, pExp - 1,
3903 pSig1 STATUS_VAR);
3904 }
3905 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3906 }
3907
3908 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3909 * significand of the addend, with the explicit bit in position 126.
3910 */
3911 cSig0 = cSig << (126 - 64 - 52);
3912 cSig1 = 0;
3913 cSig0 |= LIT64(0x4000000000000000);
3914 expDiff = pExp - cExp;
3915
3916 if (pSign == cSign) {
3917 /* Addition */
3918 if (expDiff > 0) {
3919 /* scale c to match p */
3920 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3921 zExp = pExp;
3922 } else if (expDiff < 0) {
3923 /* scale p to match c */
3924 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3925 zExp = cExp;
3926 } else {
3927 /* no scaling needed */
3928 zExp = cExp;
3929 }
3930 /* Add significands and make sure explicit bit ends up in posn 126 */
3931 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3932 if ((int64_t)zSig0 < 0) {
3933 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3934 } else {
3935 zExp--;
3936 }
/* Reduce to 64 bits with jamming; the result is left in zSig1. */
3937 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3938 return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3939 } else {
3940 /* Subtraction */
/* The operand with the smaller exponent is shifted right and subtracted
 * from the other; whenever the addend is the minuend the result sign is
 * inverted.
 */
3941 if (expDiff > 0) {
3942 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3943 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3944 zExp = pExp;
3945 } else if (expDiff < 0) {
3946 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3947 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3948 zExp = cExp;
3949 zSign ^= 1;
3950 } else {
3951 zExp = pExp;
3952 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3953 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3954 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3955 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3956 zSign ^= 1;
3957 } else {
3958 /* Exact zero */
3959 zSign = signflip;
3960 if (STATUS(float_rounding_mode) == float_round_down) {
3961 zSign ^= 1;
3962 }
3963 return packFloat64(zSign, 0, 0);
3964 }
3965 }
3966 --zExp;
3967 /* Do the equivalent of normalizeRoundAndPackFloat64() but
3968 * starting with the significand in a pair of uint64_t.
3969 */
3970 if (zSig0) {
/* Shift left so the leading one of zSig0 sits at bit 62; anything still in
 * zSig1 is folded into the sticky bit.
 */
3971 shiftcount = countLeadingZeros64(zSig0) - 1;
3972 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
3973 if (zSig1) {
3974 zSig0 |= 1;
3975 }
3976 zExp -= shiftcount;
3977 } else {
e3d142d0
PM
/* High word is zero: the significand is taken from zSig1.  With no
 * leading zeros it is shifted right by one, keeping the dropped bit as
 * sticky; otherwise it is shifted left so the leading one sits at bit 62.
 */
3978 shiftcount = countLeadingZeros64(zSig1);
3979 if (shiftcount == 0) {
3980 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
3981 zExp -= 63;
3982 } else {
3983 shiftcount--;
3984 zSig0 = zSig1 << shiftcount;
3985 zExp -= (shiftcount + 64);
3986 }
369be8f6
PM
3987 }
3988 return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
3989 }
3990}
3991
158142c2
FB
3992/*----------------------------------------------------------------------------
3993| Returns the square root of the double-precision floating-point value `a'.
3994| The operation is performed according to the IEC/IEEE Standard for Binary
3995| Floating-Point Arithmetic.
3996*----------------------------------------------------------------------------*/
3997
/* Square root of a.  Handled before the general path: NaN (propagated),
 * +infinity (returned unchanged), -infinity and negative non-zero values
 * (invalid, default NaN), -0 (returned unchanged) and +0 (float64_zero).
 * The general path always packs a positive result.
 */
3998float64 float64_sqrt( float64 a STATUS_PARAM )
3999{
4000 flag aSign;
94a49d86 4001 int_fast16_t aExp, zExp;
bb98fe42
AF
4002 uint64_t aSig, zSig, doubleZSig;
4003 uint64_t rem0, rem1, term0, term1;
37d18660 4004 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
4005
4006 aSig = extractFloat64Frac( a );
4007 aExp = extractFloat64Exp( a );
4008 aSign = extractFloat64Sign( a );
4009 if ( aExp == 0x7FF ) {
4010 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4011 if ( ! aSign ) return a;
4012 float_raise( float_flag_invalid STATUS_VAR);
4013 return float64_default_nan;
4014 }
4015 if ( aSign ) {
4016 if ( ( aExp | aSig ) == 0 ) return a;
4017 float_raise( float_flag_invalid STATUS_VAR);
4018 return float64_default_nan;
4019 }
4020 if ( aExp == 0 ) {
f090c9d4 4021 if ( aSig == 0 ) return float64_zero;
158142c2
FB
4022 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4023 }
/* Halve the unbiased exponent and re-bias it. */
4024 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4025 aSig |= LIT64( 0x0010000000000000 );
/* 32-bit first estimate from the top bits of the significand. */
4026 zSig = estimateSqrt32( aExp, aSig>>21 );
/* The left shift is 9 for an even exponent field and 8 for an odd one. */
4027 aSig <<= 9 - ( aExp & 1 );
/* Refine the estimate to 64 bits with one division step. */
4028 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
/* Only when the low 9 bits of the estimate are 5 or less is the exact
 * remainder aSig:0 - zSig*zSig computed.  zSig is decremented until the
 * remainder is non-negative, and a non-zero remainder sets the sticky bit.
 */
4029 if ( ( zSig & 0x1FF ) <= 5 ) {
4030 doubleZSig = zSig<<1;
4031 mul64To128( zSig, zSig, &term0, &term1 );
4032 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4033 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4034 --zSig;
4035 doubleZSig -= 2;
4036 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4037 }
4038 zSig |= ( ( rem0 | rem1 ) != 0 );
4039 }
4040 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4041
4042}
4043
374dfc33
AJ
4044/*----------------------------------------------------------------------------
4045| Returns the binary log of the double-precision floating-point value `a'.
4046| The operation is performed according to the IEC/IEEE Standard for Binary
4047| Floating-Point Arithmetic.
4048*----------------------------------------------------------------------------*/
4049float64 float64_log2( float64 a STATUS_PARAM )
4050{
4051 flag aSign, zSign;
94a49d86 4052 int_fast16_t aExp;
bb98fe42 4053 uint64_t aSig, aSig0, aSig1, zSig, i;
37d18660 4054 a = float64_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
4055
4056 aSig = extractFloat64Frac( a );
4057 aExp = extractFloat64Exp( a );
4058 aSign = extractFloat64Sign( a );
4059
4060 if ( aExp == 0 ) {
4061 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4062 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4063 }
4064 if ( aSign ) {
4065 float_raise( float_flag_invalid STATUS_VAR);
4066 return float64_default_nan;
4067 }
4068 if ( aExp == 0x7FF ) {
4069 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4070 return a;
4071 }
4072
4073 aExp -= 0x3FF;
4074 aSig |= LIT64( 0x0010000000000000 );
4075 zSign = aExp < 0;
bb98fe42 4076 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4077 for (i = 1LL << 51; i > 0; i >>= 1) {
4078 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4079 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4080 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4081 aSig >>= 1;
4082 zSig |= i;
4083 }
4084 }
4085
4086 if ( zSign )
4087 zSig = -zSig;
4088 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4089}
4090
158142c2
FB
4091/*----------------------------------------------------------------------------
4092| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4093| corresponding value `b', and 0 otherwise. The invalid exception is raised
4094| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4095| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4096*----------------------------------------------------------------------------*/
4097
b689362d 4098int float64_eq( float64 a, float64 b STATUS_PARAM )
158142c2 4099{
bb98fe42 4100 uint64_t av, bv;
37d18660
PM
4101 a = float64_squash_input_denormal(a STATUS_VAR);
4102 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4103
4104 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4105 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4106 ) {
b689362d 4107 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
4108 return 0;
4109 }
f090c9d4 4110 av = float64_val(a);
a1b91bb4 4111 bv = float64_val(b);
bb98fe42 4112 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4113
4114}
4115
4116/*----------------------------------------------------------------------------
4117| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4118| equal to the corresponding value `b', and 0 otherwise. The invalid
4119| exception is raised if either operand is a NaN. The comparison is performed
4120| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4121*----------------------------------------------------------------------------*/
4122
750afe93 4123int float64_le( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4124{
4125 flag aSign, bSign;
bb98fe42 4126 uint64_t av, bv;
37d18660
PM
4127 a = float64_squash_input_denormal(a STATUS_VAR);
4128 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4129
4130 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4131 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4132 ) {
4133 float_raise( float_flag_invalid STATUS_VAR);
4134 return 0;
4135 }
4136 aSign = extractFloat64Sign( a );
4137 bSign = extractFloat64Sign( b );
f090c9d4 4138 av = float64_val(a);
a1b91bb4 4139 bv = float64_val(b);
bb98fe42 4140 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4141 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4142
4143}
4144
4145/*----------------------------------------------------------------------------
4146| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4147| the corresponding value `b', and 0 otherwise. The invalid exception is
4148| raised if either operand is a NaN. The comparison is performed according
4149| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4150*----------------------------------------------------------------------------*/
4151
750afe93 4152int float64_lt( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4153{
4154 flag aSign, bSign;
bb98fe42 4155 uint64_t av, bv;
158142c2 4156
37d18660
PM
4157 a = float64_squash_input_denormal(a STATUS_VAR);
4158 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4159 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4160 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4161 ) {
4162 float_raise( float_flag_invalid STATUS_VAR);
4163 return 0;
4164 }
4165 aSign = extractFloat64Sign( a );
4166 bSign = extractFloat64Sign( b );
f090c9d4 4167 av = float64_val(a);
a1b91bb4 4168 bv = float64_val(b);
bb98fe42 4169 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4170 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4171
4172}
4173
67b7861d
AJ
4174/*----------------------------------------------------------------------------
4175| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4176| be compared, and 0 otherwise. The invalid exception is raised if either
4177| operand is a NaN. The comparison is performed according to the IEC/IEEE
4178| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4179*----------------------------------------------------------------------------*/
4180
4181int float64_unordered( float64 a, float64 b STATUS_PARAM )
4182{
4183 a = float64_squash_input_denormal(a STATUS_VAR);
4184 b = float64_squash_input_denormal(b STATUS_VAR);
4185
4186 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4187 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4188 ) {
4189 float_raise( float_flag_invalid STATUS_VAR);
4190 return 1;
4191 }
4192 return 0;
4193}
4194
158142c2
FB
4195/*----------------------------------------------------------------------------
4196| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4197| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4198| exception.The comparison is performed according to the IEC/IEEE Standard
4199| for Binary Floating-Point Arithmetic.
158142c2
FB
4200*----------------------------------------------------------------------------*/
4201
b689362d 4202int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
158142c2 4203{
bb98fe42 4204 uint64_t av, bv;
37d18660
PM
4205 a = float64_squash_input_denormal(a STATUS_VAR);
4206 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4207
4208 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4209 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4210 ) {
b689362d
AJ
4211 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4212 float_raise( float_flag_invalid STATUS_VAR);
4213 }
158142c2
FB
4214 return 0;
4215 }
f090c9d4 4216 av = float64_val(a);
a1b91bb4 4217 bv = float64_val(b);
bb98fe42 4218 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4219
4220}
4221
4222/*----------------------------------------------------------------------------
4223| Returns 1 if the double-precision floating-point value `a' is less than or
4224| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4225| cause an exception. Otherwise, the comparison is performed according to the
4226| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4227*----------------------------------------------------------------------------*/
4228
750afe93 4229int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4230{
4231 flag aSign, bSign;
bb98fe42 4232 uint64_t av, bv;
37d18660
PM
4233 a = float64_squash_input_denormal(a STATUS_VAR);
4234 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4235
4236 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4237 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4238 ) {
4239 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4240 float_raise( float_flag_invalid STATUS_VAR);
4241 }
4242 return 0;
4243 }
4244 aSign = extractFloat64Sign( a );
4245 bSign = extractFloat64Sign( b );
f090c9d4 4246 av = float64_val(a);
a1b91bb4 4247 bv = float64_val(b);
bb98fe42 4248 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4249 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4250
4251}
4252
4253/*----------------------------------------------------------------------------
4254| Returns 1 if the double-precision floating-point value `a' is less than
4255| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4256| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4257| Standard for Binary Floating-Point Arithmetic.
4258*----------------------------------------------------------------------------*/
4259
750afe93 4260int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4261{
4262 flag aSign, bSign;
bb98fe42 4263 uint64_t av, bv;
37d18660
PM
4264 a = float64_squash_input_denormal(a STATUS_VAR);
4265 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4266
4267 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4268 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4269 ) {
4270 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4271 float_raise( float_flag_invalid STATUS_VAR);
4272 }
4273 return 0;
4274 }
4275 aSign = extractFloat64Sign( a );
4276 bSign = extractFloat64Sign( b );
f090c9d4 4277 av = float64_val(a);
a1b91bb4 4278 bv = float64_val(b);
bb98fe42 4279 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4280 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4281
4282}
4283
67b7861d
AJ
4284/*----------------------------------------------------------------------------
4285| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4286| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4287| comparison is performed according to the IEC/IEEE Standard for Binary
4288| Floating-Point Arithmetic.
4289*----------------------------------------------------------------------------*/
4290
4291int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4292{
4293 a = float64_squash_input_denormal(a STATUS_VAR);
4294 b = float64_squash_input_denormal(b STATUS_VAR);
4295
4296 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4297 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4298 ) {
4299 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4300 float_raise( float_flag_invalid STATUS_VAR);
4301 }
4302 return 1;
4303 }
4304 return 0;
4305}
4306
158142c2
FB
4307/*----------------------------------------------------------------------------
4308| Returns the result of converting the extended double-precision floating-
4309| point value `a' to the 32-bit two's complement integer format. The
4310| conversion is performed according to the IEC/IEEE Standard for Binary
4311| Floating-Point Arithmetic---which means in particular that the conversion
4312| is rounded according to the current rounding mode. If `a' is a NaN, the
4313| largest positive integer is returned. Otherwise, if the conversion
4314| overflows, the largest integer with the same sign as `a' is returned.
4315*----------------------------------------------------------------------------*/
4316
4317int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4318{
4319 flag aSign;
4320 int32 aExp, shiftCount;
bb98fe42 4321 uint64_t aSig;
158142c2
FB
4322
4323 aSig = extractFloatx80Frac( a );
4324 aExp = extractFloatx80Exp( a );
4325 aSign = extractFloatx80Sign( a );
bb98fe42 4326 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4327 shiftCount = 0x4037 - aExp;
4328 if ( shiftCount <= 0 ) shiftCount = 1;
4329 shift64RightJamming( aSig, shiftCount, &aSig );
4330 return roundAndPackInt32( aSign, aSig STATUS_VAR );
4331
4332}
4333
4334/*----------------------------------------------------------------------------
4335| Returns the result of converting the extended double-precision floating-
4336| point value `a' to the 32-bit two's complement integer format. The
4337| conversion is performed according to the IEC/IEEE Standard for Binary
4338| Floating-Point Arithmetic, except that the conversion is always rounded
4339| toward zero. If `a' is a NaN, the largest positive integer is returned.
4340| Otherwise, if the conversion overflows, the largest integer with the same
4341| sign as `a' is returned.
4342*----------------------------------------------------------------------------*/
4343
4344int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4345{
4346 flag aSign;
4347 int32 aExp, shiftCount;
bb98fe42 4348 uint64_t aSig, savedASig;
b3a6a2e0 4349 int32_t z;
158142c2
FB
4350
4351 aSig = extractFloatx80Frac( a );
4352 aExp = extractFloatx80Exp( a );
4353 aSign = extractFloatx80Sign( a );
4354 if ( 0x401E < aExp ) {
bb98fe42 4355 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4356 goto invalid;
4357 }
4358 else if ( aExp < 0x3FFF ) {
4359 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4360 return 0;
4361 }
4362 shiftCount = 0x403E - aExp;
4363 savedASig = aSig;
4364 aSig >>= shiftCount;
4365 z = aSig;
4366 if ( aSign ) z = - z;
4367 if ( ( z < 0 ) ^ aSign ) {
4368 invalid:
4369 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 4370 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4371 }
4372 if ( ( aSig<<shiftCount ) != savedASig ) {
4373 STATUS(float_exception_flags) |= float_flag_inexact;
4374 }
4375 return z;
4376
4377}
4378
4379/*----------------------------------------------------------------------------
4380| Returns the result of converting the extended double-precision floating-
4381| point value `a' to the 64-bit two's complement integer format. The
4382| conversion is performed according to the IEC/IEEE Standard for Binary
4383| Floating-Point Arithmetic---which means in particular that the conversion
4384| is rounded according to the current rounding mode. If `a' is a NaN,
4385| the largest positive integer is returned. Otherwise, if the conversion
4386| overflows, the largest integer with the same sign as `a' is returned.
4387*----------------------------------------------------------------------------*/
4388
4389int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4390{
4391 flag aSign;
4392 int32 aExp, shiftCount;
bb98fe42 4393 uint64_t aSig, aSigExtra;
158142c2
FB
4394
4395 aSig = extractFloatx80Frac( a );
4396 aExp = extractFloatx80Exp( a );
4397 aSign = extractFloatx80Sign( a );
4398 shiftCount = 0x403E - aExp;
4399 if ( shiftCount <= 0 ) {
4400 if ( shiftCount ) {
4401 float_raise( float_flag_invalid STATUS_VAR);
4402 if ( ! aSign
4403 || ( ( aExp == 0x7FFF )
4404 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4405 ) {
4406 return LIT64( 0x7FFFFFFFFFFFFFFF );
4407 }
bb98fe42 4408 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4409 }
4410 aSigExtra = 0;
4411 }
4412 else {
4413 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4414 }
4415 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4416
4417}
4418
4419/*----------------------------------------------------------------------------
4420| Returns the result of converting the extended double-precision floating-
4421| point value `a' to the 64-bit two's complement integer format. The
4422| conversion is performed according to the IEC/IEEE Standard for Binary
4423| Floating-Point Arithmetic, except that the conversion is always rounded
4424| toward zero. If `a' is a NaN, the largest positive integer is returned.
4425| Otherwise, if the conversion overflows, the largest integer with the same
4426| sign as `a' is returned.
4427*----------------------------------------------------------------------------*/
4428
4429int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4430{
4431 flag aSign;
4432 int32 aExp, shiftCount;
bb98fe42 4433 uint64_t aSig;
158142c2
FB
4434 int64 z;
4435
4436 aSig = extractFloatx80Frac( a );
4437 aExp = extractFloatx80Exp( a );
4438 aSign = extractFloatx80Sign( a );
4439 shiftCount = aExp - 0x403E;
4440 if ( 0 <= shiftCount ) {
4441 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4442 if ( ( a.high != 0xC03E ) || aSig ) {
4443 float_raise( float_flag_invalid STATUS_VAR);
4444 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4445 return LIT64( 0x7FFFFFFFFFFFFFFF );
4446 }
4447 }
bb98fe42 4448 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4449 }
4450 else if ( aExp < 0x3FFF ) {
4451 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4452 return 0;
4453 }
4454 z = aSig>>( - shiftCount );
bb98fe42 4455 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
4456 STATUS(float_exception_flags) |= float_flag_inexact;
4457 }
4458 if ( aSign ) z = - z;
4459 return z;
4460
4461}
4462
4463/*----------------------------------------------------------------------------
4464| Returns the result of converting the extended double-precision floating-
4465| point value `a' to the single-precision floating-point format. The
4466| conversion is performed according to the IEC/IEEE Standard for Binary
4467| Floating-Point Arithmetic.
4468*----------------------------------------------------------------------------*/
4469
4470float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4471{
4472 flag aSign;
4473 int32 aExp;
bb98fe42 4474 uint64_t aSig;
158142c2
FB
4475
4476 aSig = extractFloatx80Frac( a );
4477 aExp = extractFloatx80Exp( a );
4478 aSign = extractFloatx80Sign( a );
4479 if ( aExp == 0x7FFF ) {
bb98fe42 4480 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4481 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4482 }
4483 return packFloat32( aSign, 0xFF, 0 );
4484 }
4485 shift64RightJamming( aSig, 33, &aSig );
4486 if ( aExp || aSig ) aExp -= 0x3F81;
4487 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4488
4489}
4490
4491/*----------------------------------------------------------------------------
4492| Returns the result of converting the extended double-precision floating-
4493| point value `a' to the double-precision floating-point format. The
4494| conversion is performed according to the IEC/IEEE Standard for Binary
4495| Floating-Point Arithmetic.
4496*----------------------------------------------------------------------------*/
4497
4498float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4499{
4500 flag aSign;
4501 int32 aExp;
bb98fe42 4502 uint64_t aSig, zSig;
158142c2
FB
4503
4504 aSig = extractFloatx80Frac( a );
4505 aExp = extractFloatx80Exp( a );
4506 aSign = extractFloatx80Sign( a );
4507 if ( aExp == 0x7FFF ) {
bb98fe42 4508 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4509 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4510 }
4511 return packFloat64( aSign, 0x7FF, 0 );
4512 }
4513 shift64RightJamming( aSig, 1, &zSig );
4514 if ( aExp || aSig ) aExp -= 0x3C01;
4515 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4516
4517}
4518
158142c2
FB
4519/*----------------------------------------------------------------------------
4520| Returns the result of converting the extended double-precision floating-
4521| point value `a' to the quadruple-precision floating-point format. The
4522| conversion is performed according to the IEC/IEEE Standard for Binary
4523| Floating-Point Arithmetic.
4524*----------------------------------------------------------------------------*/
4525
4526float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4527{
4528 flag aSign;
94a49d86 4529 int_fast16_t aExp;
bb98fe42 4530 uint64_t aSig, zSig0, zSig1;
158142c2
FB
4531
4532 aSig = extractFloatx80Frac( a );
4533 aExp = extractFloatx80Exp( a );
4534 aSign = extractFloatx80Sign( a );
bb98fe42 4535 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4536 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4537 }
4538 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4539 return packFloat128( aSign, aExp, zSig0, zSig1 );
4540
4541}
4542
158142c2
FB
4543/*----------------------------------------------------------------------------
4544| Rounds the extended double-precision floating-point value `a' to an integer,
4545| and returns the result as an extended quadruple-precision floating-point
4546| value. The operation is performed according to the IEC/IEEE Standard for
4547| Binary Floating-Point Arithmetic.
4548*----------------------------------------------------------------------------*/
4549
4550floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4551{
4552 flag aSign;
4553 int32 aExp;
bb98fe42 4554 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4555 int8 roundingMode;
4556 floatx80 z;
4557
4558 aExp = extractFloatx80Exp( a );
4559 if ( 0x403E <= aExp ) {
bb98fe42 4560 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
158142c2
FB
4561 return propagateFloatx80NaN( a, a STATUS_VAR );
4562 }
4563 return a;
4564 }
4565 if ( aExp < 0x3FFF ) {
4566 if ( ( aExp == 0 )
bb98fe42 4567 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4568 return a;
4569 }
4570 STATUS(float_exception_flags) |= float_flag_inexact;
4571 aSign = extractFloatx80Sign( a );
4572 switch ( STATUS(float_rounding_mode) ) {
4573 case float_round_nearest_even:
bb98fe42 4574 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4575 ) {
4576 return
4577 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4578 }
4579 break;
4580 case float_round_down:
4581 return
4582 aSign ?
4583 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4584 : packFloatx80( 0, 0, 0 );
4585 case float_round_up:
4586 return
4587 aSign ? packFloatx80( 1, 0, 0 )
4588 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4589 }
4590 return packFloatx80( aSign, 0, 0 );
4591 }
4592 lastBitMask = 1;
4593 lastBitMask <<= 0x403E - aExp;
4594 roundBitsMask = lastBitMask - 1;
4595 z = a;
4596 roundingMode = STATUS(float_rounding_mode);
4597 if ( roundingMode == float_round_nearest_even ) {
4598 z.low += lastBitMask>>1;
4599 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4600 }
4601 else if ( roundingMode != float_round_to_zero ) {
4602 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4603 z.low += roundBitsMask;
4604 }
4605 }
4606 z.low &= ~ roundBitsMask;
4607 if ( z.low == 0 ) {
4608 ++z.high;
4609 z.low = LIT64( 0x8000000000000000 );
4610 }
4611 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4612 return z;
4613
4614}
4615
4616/*----------------------------------------------------------------------------
4617| Returns the result of adding the absolute values of the extended double-
4618| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4619| negated before being returned. `zSign' is ignored if the result is a NaN.
4620| The addition is performed according to the IEC/IEEE Standard for Binary
4621| Floating-Point Arithmetic.
4622*----------------------------------------------------------------------------*/
4623
4624static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4625{
4626 int32 aExp, bExp, zExp;
bb98fe42 4627 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4628 int32 expDiff;
4629
4630 aSig = extractFloatx80Frac( a );
4631 aExp = extractFloatx80Exp( a );
4632 bSig = extractFloatx80Frac( b );
4633 bExp = extractFloatx80Exp( b );
4634 expDiff = aExp - bExp;
4635 if ( 0 < expDiff ) {
4636 if ( aExp == 0x7FFF ) {
bb98fe42 4637 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4638 return a;
4639 }
4640 if ( bExp == 0 ) --expDiff;
4641 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4642 zExp = aExp;
4643 }
4644 else if ( expDiff < 0 ) {
4645 if ( bExp == 0x7FFF ) {
bb98fe42 4646 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4647 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4648 }
4649 if ( aExp == 0 ) ++expDiff;
4650 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4651 zExp = bExp;
4652 }
4653 else {
4654 if ( aExp == 0x7FFF ) {
bb98fe42 4655 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4656 return propagateFloatx80NaN( a, b STATUS_VAR );
4657 }
4658 return a;
4659 }
4660 zSig1 = 0;
4661 zSig0 = aSig + bSig;
4662 if ( aExp == 0 ) {
4663 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4664 goto roundAndPack;
4665 }
4666 zExp = aExp;
4667 goto shiftRight1;
4668 }
4669 zSig0 = aSig + bSig;
bb98fe42 4670 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
4671 shiftRight1:
4672 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4673 zSig0 |= LIT64( 0x8000000000000000 );
4674 ++zExp;
4675 roundAndPack:
4676 return
4677 roundAndPackFloatx80(
4678 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4679
4680}
4681
4682/*----------------------------------------------------------------------------
4683| Returns the result of subtracting the absolute values of the extended
4684| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4685| difference is negated before being returned. `zSign' is ignored if the
4686| result is a NaN. The subtraction is performed according to the IEC/IEEE
4687| Standard for Binary Floating-Point Arithmetic.
4688*----------------------------------------------------------------------------*/
4689
4690static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4691{
4692 int32 aExp, bExp, zExp;
bb98fe42 4693 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4694 int32 expDiff;
4695 floatx80 z;
4696
4697 aSig = extractFloatx80Frac( a );
4698 aExp = extractFloatx80Exp( a );
4699 bSig = extractFloatx80Frac( b );
4700 bExp = extractFloatx80Exp( b );
4701 expDiff = aExp - bExp;
4702 if ( 0 < expDiff ) goto aExpBigger;
4703 if ( expDiff < 0 ) goto bExpBigger;
4704 if ( aExp == 0x7FFF ) {
bb98fe42 4705 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4706 return propagateFloatx80NaN( a, b STATUS_VAR );
4707 }
4708 float_raise( float_flag_invalid STATUS_VAR);
4709 z.low = floatx80_default_nan_low;
4710 z.high = floatx80_default_nan_high;
4711 return z;
4712 }
4713 if ( aExp == 0 ) {
4714 aExp = 1;
4715 bExp = 1;
4716 }
4717 zSig1 = 0;
4718 if ( bSig < aSig ) goto aBigger;
4719 if ( aSig < bSig ) goto bBigger;
4720 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4721 bExpBigger:
4722 if ( bExp == 0x7FFF ) {
bb98fe42 4723 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4724 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4725 }
4726 if ( aExp == 0 ) ++expDiff;
4727 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4728 bBigger:
4729 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4730 zExp = bExp;
4731 zSign ^= 1;
4732 goto normalizeRoundAndPack;
4733 aExpBigger:
4734 if ( aExp == 0x7FFF ) {
bb98fe42 4735 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4736 return a;
4737 }
4738 if ( bExp == 0 ) --expDiff;
4739 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4740 aBigger:
4741 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4742 zExp = aExp;
4743 normalizeRoundAndPack:
4744 return
4745 normalizeRoundAndPackFloatx80(
4746 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4747
4748}
4749
4750/*----------------------------------------------------------------------------
4751| Returns the result of adding the extended double-precision floating-point
4752| values `a' and `b'. The operation is performed according to the IEC/IEEE
4753| Standard for Binary Floating-Point Arithmetic.
4754*----------------------------------------------------------------------------*/
4755
4756floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4757{
4758 flag aSign, bSign;
4759
4760 aSign = extractFloatx80Sign( a );
4761 bSign = extractFloatx80Sign( b );
4762 if ( aSign == bSign ) {
4763 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4764 }
4765 else {
4766 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4767 }
4768
4769}
4770
4771/*----------------------------------------------------------------------------
4772| Returns the result of subtracting the extended double-precision floating-
4773| point values `a' and `b'. The operation is performed according to the
4774| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4775*----------------------------------------------------------------------------*/
4776
4777floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4778{
4779 flag aSign, bSign;
4780
4781 aSign = extractFloatx80Sign( a );
4782 bSign = extractFloatx80Sign( b );
4783 if ( aSign == bSign ) {
4784 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4785 }
4786 else {
4787 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4788 }
4789
4790}
4791
4792/*----------------------------------------------------------------------------
4793| Returns the result of multiplying the extended double-precision floating-
4794| point values `a' and `b'. The operation is performed according to the
4795| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4796*----------------------------------------------------------------------------*/
4797
4798floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4799{
4800 flag aSign, bSign, zSign;
4801 int32 aExp, bExp, zExp;
bb98fe42 4802 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4803 floatx80 z;
4804
4805 aSig = extractFloatx80Frac( a );
4806 aExp = extractFloatx80Exp( a );
4807 aSign = extractFloatx80Sign( a );
4808 bSig = extractFloatx80Frac( b );
4809 bExp = extractFloatx80Exp( b );
4810 bSign = extractFloatx80Sign( b );
4811 zSign = aSign ^ bSign;
4812 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4813 if ( (uint64_t) ( aSig<<1 )
4814 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4815 return propagateFloatx80NaN( a, b STATUS_VAR );
4816 }
4817 if ( ( bExp | bSig ) == 0 ) goto invalid;
4818 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4819 }
4820 if ( bExp == 0x7FFF ) {
bb98fe42 4821 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4822 if ( ( aExp | aSig ) == 0 ) {
4823 invalid:
4824 float_raise( float_flag_invalid STATUS_VAR);
4825 z.low = floatx80_default_nan_low;
4826 z.high = floatx80_default_nan_high;
4827 return z;
4828 }
4829 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4830 }
4831 if ( aExp == 0 ) {
4832 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4833 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4834 }
4835 if ( bExp == 0 ) {
4836 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4837 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4838 }
4839 zExp = aExp + bExp - 0x3FFE;
4840 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 4841 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
4842 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4843 --zExp;
4844 }
4845 return
4846 roundAndPackFloatx80(
4847 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4848
4849}
4850
4851/*----------------------------------------------------------------------------
4852| Returns the result of dividing the extended double-precision floating-point
4853| value `a' by the corresponding value `b'. The operation is performed
4854| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4855*----------------------------------------------------------------------------*/
4856
4857floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4858{
4859 flag aSign, bSign, zSign;
4860 int32 aExp, bExp, zExp;
bb98fe42
AF
4861 uint64_t aSig, bSig, zSig0, zSig1;
4862 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
4863 floatx80 z;
4864
4865 aSig = extractFloatx80Frac( a );
4866 aExp = extractFloatx80Exp( a );
4867 aSign = extractFloatx80Sign( a );
4868 bSig = extractFloatx80Frac( b );
4869 bExp = extractFloatx80Exp( b );
4870 bSign = extractFloatx80Sign( b );
4871 zSign = aSign ^ bSign;
4872 if ( aExp == 0x7FFF ) {
bb98fe42 4873 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2 4874 if ( bExp == 0x7FFF ) {
bb98fe42 4875 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4876 goto invalid;
4877 }
4878 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4879 }
4880 if ( bExp == 0x7FFF ) {
bb98fe42 4881 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4882 return packFloatx80( zSign, 0, 0 );
4883 }
4884 if ( bExp == 0 ) {
4885 if ( bSig == 0 ) {
4886 if ( ( aExp | aSig ) == 0 ) {
4887 invalid:
4888 float_raise( float_flag_invalid STATUS_VAR);
4889 z.low = floatx80_default_nan_low;
4890 z.high = floatx80_default_nan_high;
4891 return z;
4892 }
4893 float_raise( float_flag_divbyzero STATUS_VAR);
4894 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4895 }
4896 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4897 }
4898 if ( aExp == 0 ) {
4899 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4900 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4901 }
4902 zExp = aExp - bExp + 0x3FFE;
4903 rem1 = 0;
4904 if ( bSig <= aSig ) {
4905 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4906 ++zExp;
4907 }
4908 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4909 mul64To128( bSig, zSig0, &term0, &term1 );
4910 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 4911 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4912 --zSig0;
4913 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4914 }
4915 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 4916 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
4917 mul64To128( bSig, zSig1, &term1, &term2 );
4918 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 4919 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
4920 --zSig1;
4921 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4922 }
4923 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4924 }
4925 return
4926 roundAndPackFloatx80(
4927 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4928
4929}
4930
4931/*----------------------------------------------------------------------------
4932| Returns the remainder of the extended double-precision floating-point value
4933| `a' with respect to the corresponding value `b'. The operation is performed
4934| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4935*----------------------------------------------------------------------------*/
4936
4937floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4938{
ed086f3d 4939 flag aSign, zSign;
158142c2 4940 int32 aExp, bExp, expDiff;
bb98fe42
AF
4941 uint64_t aSig0, aSig1, bSig;
4942 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
4943 floatx80 z;
4944
4945 aSig0 = extractFloatx80Frac( a );
4946 aExp = extractFloatx80Exp( a );
4947 aSign = extractFloatx80Sign( a );
4948 bSig = extractFloatx80Frac( b );
4949 bExp = extractFloatx80Exp( b );
158142c2 4950 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4951 if ( (uint64_t) ( aSig0<<1 )
4952 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
4953 return propagateFloatx80NaN( a, b STATUS_VAR );
4954 }
4955 goto invalid;
4956 }
4957 if ( bExp == 0x7FFF ) {
bb98fe42 4958 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4959 return a;
4960 }
4961 if ( bExp == 0 ) {
4962 if ( bSig == 0 ) {
4963 invalid:
4964 float_raise( float_flag_invalid STATUS_VAR);
4965 z.low = floatx80_default_nan_low;
4966 z.high = floatx80_default_nan_high;
4967 return z;
4968 }
4969 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4970 }
4971 if ( aExp == 0 ) {
bb98fe42 4972 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
4973 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4974 }
4975 bSig |= LIT64( 0x8000000000000000 );
4976 zSign = aSign;
4977 expDiff = aExp - bExp;
4978 aSig1 = 0;
4979 if ( expDiff < 0 ) {
4980 if ( expDiff < -1 ) return a;
4981 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4982 expDiff = 0;
4983 }
4984 q = ( bSig <= aSig0 );
4985 if ( q ) aSig0 -= bSig;
4986 expDiff -= 64;
4987 while ( 0 < expDiff ) {
4988 q = estimateDiv128To64( aSig0, aSig1, bSig );
4989 q = ( 2 < q ) ? q - 2 : 0;
4990 mul64To128( bSig, q, &term0, &term1 );
4991 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4992 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4993 expDiff -= 62;
4994 }
4995 expDiff += 64;
4996 if ( 0 < expDiff ) {
4997 q = estimateDiv128To64( aSig0, aSig1, bSig );
4998 q = ( 2 < q ) ? q - 2 : 0;
4999 q >>= 64 - expDiff;
5000 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5001 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5002 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5003 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5004 ++q;
5005 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5006 }
5007 }
5008 else {
5009 term1 = 0;
5010 term0 = bSig;
5011 }
5012 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5013 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5014 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5015 && ( q & 1 ) )
5016 ) {
5017 aSig0 = alternateASig0;
5018 aSig1 = alternateASig1;
5019 zSign = ! zSign;
5020 }
5021 return
5022 normalizeRoundAndPackFloatx80(
5023 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5024
5025}
5026
5027/*----------------------------------------------------------------------------
5028| Returns the square root of the extended double-precision floating-point
5029| value `a'. The operation is performed according to the IEC/IEEE Standard
5030| for Binary Floating-Point Arithmetic.
5031*----------------------------------------------------------------------------*/
5032
5033floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5034{
5035 flag aSign;
5036 int32 aExp, zExp;
bb98fe42
AF
5037 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5038 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5039 floatx80 z;
5040
5041 aSig0 = extractFloatx80Frac( a );
5042 aExp = extractFloatx80Exp( a );
5043 aSign = extractFloatx80Sign( a );
5044 if ( aExp == 0x7FFF ) {
bb98fe42 5045 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
158142c2
FB
5046 if ( ! aSign ) return a;
5047 goto invalid;
5048 }
5049 if ( aSign ) {
5050 if ( ( aExp | aSig0 ) == 0 ) return a;
5051 invalid:
5052 float_raise( float_flag_invalid STATUS_VAR);
5053 z.low = floatx80_default_nan_low;
5054 z.high = floatx80_default_nan_high;
5055 return z;
5056 }
5057 if ( aExp == 0 ) {
5058 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5059 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5060 }
5061 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5062 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5063 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5064 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5065 doubleZSig0 = zSig0<<1;
5066 mul64To128( zSig0, zSig0, &term0, &term1 );
5067 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5068 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5069 --zSig0;
5070 doubleZSig0 -= 2;
5071 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5072 }
5073 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5074 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5075 if ( zSig1 == 0 ) zSig1 = 1;
5076 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5077 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5078 mul64To128( zSig1, zSig1, &term2, &term3 );
5079 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5080 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5081 --zSig1;
5082 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5083 term3 |= 1;
5084 term2 |= doubleZSig0;
5085 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5086 }
5087 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5088 }
5089 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5090 zSig0 |= doubleZSig0;
5091 return
5092 roundAndPackFloatx80(
5093 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5094
5095}
5096
5097/*----------------------------------------------------------------------------
b689362d
AJ
5098| Returns 1 if the extended double-precision floating-point value `a' is equal
5099| to the corresponding value `b', and 0 otherwise. The invalid exception is
5100| raised if either operand is a NaN. Otherwise, the comparison is performed
5101| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5102*----------------------------------------------------------------------------*/
5103
b689362d 5104int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5105{
5106
5107 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5108 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5109 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5110 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5111 ) {
b689362d 5112 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
5113 return 0;
5114 }
5115 return
5116 ( a.low == b.low )
5117 && ( ( a.high == b.high )
5118 || ( ( a.low == 0 )
bb98fe42 5119 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5120 );
5121
5122}
5123
5124/*----------------------------------------------------------------------------
5125| Returns 1 if the extended double-precision floating-point value `a' is
5126| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5127| invalid exception is raised if either operand is a NaN. The comparison is
5128| performed according to the IEC/IEEE Standard for Binary Floating-Point
5129| Arithmetic.
158142c2
FB
5130*----------------------------------------------------------------------------*/
5131
750afe93 5132int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5133{
5134 flag aSign, bSign;
5135
5136 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5137 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5138 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5139 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5140 ) {
5141 float_raise( float_flag_invalid STATUS_VAR);
5142 return 0;
5143 }
5144 aSign = extractFloatx80Sign( a );
5145 bSign = extractFloatx80Sign( b );
5146 if ( aSign != bSign ) {
5147 return
5148 aSign
bb98fe42 5149 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5150 == 0 );
5151 }
5152 return
5153 aSign ? le128( b.high, b.low, a.high, a.low )
5154 : le128( a.high, a.low, b.high, b.low );
5155
5156}
5157
5158/*----------------------------------------------------------------------------
5159| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5160| less than the corresponding value `b', and 0 otherwise. The invalid
5161| exception is raised if either operand is a NaN. The comparison is performed
5162| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5163*----------------------------------------------------------------------------*/
5164
750afe93 5165int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5166{
5167 flag aSign, bSign;
5168
5169 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5170 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5171 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5172 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5173 ) {
5174 float_raise( float_flag_invalid STATUS_VAR);
5175 return 0;
5176 }
5177 aSign = extractFloatx80Sign( a );
5178 bSign = extractFloatx80Sign( b );
5179 if ( aSign != bSign ) {
5180 return
5181 aSign
bb98fe42 5182 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5183 != 0 );
5184 }
5185 return
5186 aSign ? lt128( b.high, b.low, a.high, a.low )
5187 : lt128( a.high, a.low, b.high, b.low );
5188
5189}
5190
67b7861d
AJ
5191/*----------------------------------------------------------------------------
5192| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5193| cannot be compared, and 0 otherwise. The invalid exception is raised if
5194| either operand is a NaN. The comparison is performed according to the
5195| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
5196*----------------------------------------------------------------------------*/
5197int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5198{
5199 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5200 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5201 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5202 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5203 ) {
5204 float_raise( float_flag_invalid STATUS_VAR);
5205 return 1;
5206 }
5207 return 0;
5208}
5209
158142c2 5210/*----------------------------------------------------------------------------
b689362d 5211| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5212| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5213| cause an exception. The comparison is performed according to the IEC/IEEE
5214| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5215*----------------------------------------------------------------------------*/
5216
b689362d 5217int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5218{
5219
5220 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5221 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5222 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5223 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5224 ) {
b689362d
AJ
5225 if ( floatx80_is_signaling_nan( a )
5226 || floatx80_is_signaling_nan( b ) ) {
5227 float_raise( float_flag_invalid STATUS_VAR);
5228 }
158142c2
FB
5229 return 0;
5230 }
5231 return
5232 ( a.low == b.low )
5233 && ( ( a.high == b.high )
5234 || ( ( a.low == 0 )
bb98fe42 5235 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5236 );
5237
5238}
5239
5240/*----------------------------------------------------------------------------
5241| Returns 1 if the extended double-precision floating-point value `a' is less
5242| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5243| do not cause an exception. Otherwise, the comparison is performed according
5244| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5245*----------------------------------------------------------------------------*/
5246
750afe93 5247int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5248{
5249 flag aSign, bSign;
5250
5251 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5252 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5253 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5254 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5255 ) {
5256 if ( floatx80_is_signaling_nan( a )
5257 || floatx80_is_signaling_nan( b ) ) {
5258 float_raise( float_flag_invalid STATUS_VAR);
5259 }
5260 return 0;
5261 }
5262 aSign = extractFloatx80Sign( a );
5263 bSign = extractFloatx80Sign( b );
5264 if ( aSign != bSign ) {
5265 return
5266 aSign
bb98fe42 5267 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5268 == 0 );
5269 }
5270 return
5271 aSign ? le128( b.high, b.low, a.high, a.low )
5272 : le128( a.high, a.low, b.high, b.low );
5273
5274}
5275
5276/*----------------------------------------------------------------------------
5277| Returns 1 if the extended double-precision floating-point value `a' is less
5278| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5279| an exception. Otherwise, the comparison is performed according to the
5280| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5281*----------------------------------------------------------------------------*/
5282
750afe93 5283int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5284{
5285 flag aSign, bSign;
5286
5287 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5288 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5289 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5290 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5291 ) {
5292 if ( floatx80_is_signaling_nan( a )
5293 || floatx80_is_signaling_nan( b ) ) {
5294 float_raise( float_flag_invalid STATUS_VAR);
5295 }
5296 return 0;
5297 }
5298 aSign = extractFloatx80Sign( a );
5299 bSign = extractFloatx80Sign( b );
5300 if ( aSign != bSign ) {
5301 return
5302 aSign
bb98fe42 5303 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5304 != 0 );
5305 }
5306 return
5307 aSign ? lt128( b.high, b.low, a.high, a.low )
5308 : lt128( a.high, a.low, b.high, b.low );
5309
5310}
5311
67b7861d
AJ
5312/*----------------------------------------------------------------------------
5313| Returns 1 if the extended double-precision floating-point values `a' and `b'
5314| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5315| The comparison is performed according to the IEC/IEEE Standard for Binary
5316| Floating-Point Arithmetic.
5317*----------------------------------------------------------------------------*/
5318int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5319{
5320 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5321 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5322 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5323 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5324 ) {
5325 if ( floatx80_is_signaling_nan( a )
5326 || floatx80_is_signaling_nan( b ) ) {
5327 float_raise( float_flag_invalid STATUS_VAR);
5328 }
5329 return 1;
5330 }
5331 return 0;
5332}
5333
158142c2
FB
5334/*----------------------------------------------------------------------------
5335| Returns the result of converting the quadruple-precision floating-point
5336| value `a' to the 32-bit two's complement integer format. The conversion
5337| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5338| Arithmetic---which means in particular that the conversion is rounded
5339| according to the current rounding mode. If `a' is a NaN, the largest
5340| positive integer is returned. Otherwise, if the conversion overflows, the
5341| largest integer with the same sign as `a' is returned.
5342*----------------------------------------------------------------------------*/
5343
5344int32 float128_to_int32( float128 a STATUS_PARAM )
5345{
5346 flag aSign;
5347 int32 aExp, shiftCount;
bb98fe42 5348 uint64_t aSig0, aSig1;
158142c2
FB
5349
5350 aSig1 = extractFloat128Frac1( a );
5351 aSig0 = extractFloat128Frac0( a );
5352 aExp = extractFloat128Exp( a );
5353 aSign = extractFloat128Sign( a );
5354 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5355 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5356 aSig0 |= ( aSig1 != 0 );
5357 shiftCount = 0x4028 - aExp;
5358 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5359 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5360
5361}
5362
5363/*----------------------------------------------------------------------------
5364| Returns the result of converting the quadruple-precision floating-point
5365| value `a' to the 32-bit two's complement integer format. The conversion
5366| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5367| Arithmetic, except that the conversion is always rounded toward zero. If
5368| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5369| conversion overflows, the largest integer with the same sign as `a' is
5370| returned.
5371*----------------------------------------------------------------------------*/
5372
5373int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5374{
5375 flag aSign;
5376 int32 aExp, shiftCount;
bb98fe42 5377 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5378 int32_t z;
158142c2
FB
5379
5380 aSig1 = extractFloat128Frac1( a );
5381 aSig0 = extractFloat128Frac0( a );
5382 aExp = extractFloat128Exp( a );
5383 aSign = extractFloat128Sign( a );
5384 aSig0 |= ( aSig1 != 0 );
5385 if ( 0x401E < aExp ) {
5386 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5387 goto invalid;
5388 }
5389 else if ( aExp < 0x3FFF ) {
5390 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5391 return 0;
5392 }
5393 aSig0 |= LIT64( 0x0001000000000000 );
5394 shiftCount = 0x402F - aExp;
5395 savedASig = aSig0;
5396 aSig0 >>= shiftCount;
5397 z = aSig0;
5398 if ( aSign ) z = - z;
5399 if ( ( z < 0 ) ^ aSign ) {
5400 invalid:
5401 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 5402 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5403 }
5404 if ( ( aSig0<<shiftCount ) != savedASig ) {
5405 STATUS(float_exception_flags) |= float_flag_inexact;
5406 }
5407 return z;
5408
5409}
5410
5411/*----------------------------------------------------------------------------
5412| Returns the result of converting the quadruple-precision floating-point
5413| value `a' to the 64-bit two's complement integer format. The conversion
5414| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5415| Arithmetic---which means in particular that the conversion is rounded
5416| according to the current rounding mode. If `a' is a NaN, the largest
5417| positive integer is returned. Otherwise, if the conversion overflows, the
5418| largest integer with the same sign as `a' is returned.
5419*----------------------------------------------------------------------------*/
5420
5421int64 float128_to_int64( float128 a STATUS_PARAM )
5422{
5423 flag aSign;
5424 int32 aExp, shiftCount;
bb98fe42 5425 uint64_t aSig0, aSig1;
158142c2
FB
5426
5427 aSig1 = extractFloat128Frac1( a );
5428 aSig0 = extractFloat128Frac0( a );
5429 aExp = extractFloat128Exp( a );
5430 aSign = extractFloat128Sign( a );
5431 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5432 shiftCount = 0x402F - aExp;
5433 if ( shiftCount <= 0 ) {
5434 if ( 0x403E < aExp ) {
5435 float_raise( float_flag_invalid STATUS_VAR);
5436 if ( ! aSign
5437 || ( ( aExp == 0x7FFF )
5438 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5439 )
5440 ) {
5441 return LIT64( 0x7FFFFFFFFFFFFFFF );
5442 }
bb98fe42 5443 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5444 }
5445 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5446 }
5447 else {
5448 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5449 }
5450 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5451
5452}
5453
5454/*----------------------------------------------------------------------------
5455| Returns the result of converting the quadruple-precision floating-point
5456| value `a' to the 64-bit two's complement integer format. The conversion
5457| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5458| Arithmetic, except that the conversion is always rounded toward zero.
5459| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5460| the conversion overflows, the largest integer with the same sign as `a' is
5461| returned.
5462*----------------------------------------------------------------------------*/
5463
5464int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5465{
5466 flag aSign;
5467 int32 aExp, shiftCount;
bb98fe42 5468 uint64_t aSig0, aSig1;
158142c2
FB
5469 int64 z;
5470
5471 aSig1 = extractFloat128Frac1( a );
5472 aSig0 = extractFloat128Frac0( a );
5473 aExp = extractFloat128Exp( a );
5474 aSign = extractFloat128Sign( a );
5475 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5476 shiftCount = aExp - 0x402F;
5477 if ( 0 < shiftCount ) {
5478 if ( 0x403E <= aExp ) {
5479 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5480 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5481 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5482 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5483 }
5484 else {
5485 float_raise( float_flag_invalid STATUS_VAR);
5486 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5487 return LIT64( 0x7FFFFFFFFFFFFFFF );
5488 }
5489 }
bb98fe42 5490 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5491 }
5492 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 5493 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
158142c2
FB
5494 STATUS(float_exception_flags) |= float_flag_inexact;
5495 }
5496 }
5497 else {
5498 if ( aExp < 0x3FFF ) {
5499 if ( aExp | aSig0 | aSig1 ) {
5500 STATUS(float_exception_flags) |= float_flag_inexact;
5501 }
5502 return 0;
5503 }
5504 z = aSig0>>( - shiftCount );
5505 if ( aSig1
bb98fe42 5506 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
158142c2
FB
5507 STATUS(float_exception_flags) |= float_flag_inexact;
5508 }
5509 }
5510 if ( aSign ) z = - z;
5511 return z;
5512
5513}
5514
5515/*----------------------------------------------------------------------------
5516| Returns the result of converting the quadruple-precision floating-point
5517| value `a' to the single-precision floating-point format. The conversion
5518| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5519| Arithmetic.
5520*----------------------------------------------------------------------------*/
5521
5522float32 float128_to_float32( float128 a STATUS_PARAM )
5523{
5524 flag aSign;
5525 int32 aExp;
bb98fe42
AF
5526 uint64_t aSig0, aSig1;
5527 uint32_t zSig;
158142c2
FB
5528
5529 aSig1 = extractFloat128Frac1( a );
5530 aSig0 = extractFloat128Frac0( a );
5531 aExp = extractFloat128Exp( a );
5532 aSign = extractFloat128Sign( a );
5533 if ( aExp == 0x7FFF ) {
5534 if ( aSig0 | aSig1 ) {
bcd4d9af 5535 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5536 }
5537 return packFloat32( aSign, 0xFF, 0 );
5538 }
5539 aSig0 |= ( aSig1 != 0 );
5540 shift64RightJamming( aSig0, 18, &aSig0 );
5541 zSig = aSig0;
5542 if ( aExp || zSig ) {
5543 zSig |= 0x40000000;
5544 aExp -= 0x3F81;
5545 }
5546 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5547
5548}
5549
5550/*----------------------------------------------------------------------------
5551| Returns the result of converting the quadruple-precision floating-point
5552| value `a' to the double-precision floating-point format. The conversion
5553| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5554| Arithmetic.
5555*----------------------------------------------------------------------------*/
5556
5557float64 float128_to_float64( float128 a STATUS_PARAM )
5558{
5559 flag aSign;
5560 int32 aExp;
bb98fe42 5561 uint64_t aSig0, aSig1;
158142c2
FB
5562
5563 aSig1 = extractFloat128Frac1( a );
5564 aSig0 = extractFloat128Frac0( a );
5565 aExp = extractFloat128Exp( a );
5566 aSign = extractFloat128Sign( a );
5567 if ( aExp == 0x7FFF ) {
5568 if ( aSig0 | aSig1 ) {
bcd4d9af 5569 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5570 }
5571 return packFloat64( aSign, 0x7FF, 0 );
5572 }
5573 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5574 aSig0 |= ( aSig1 != 0 );
5575 if ( aExp || aSig0 ) {
5576 aSig0 |= LIT64( 0x4000000000000000 );
5577 aExp -= 0x3C01;
5578 }
5579 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5580
5581}
5582
158142c2
FB
5583/*----------------------------------------------------------------------------
5584| Returns the result of converting the quadruple-precision floating-point
5585| value `a' to the extended double-precision floating-point format. The
5586| conversion is performed according to the IEC/IEEE Standard for Binary
5587| Floating-Point Arithmetic.
5588*----------------------------------------------------------------------------*/
5589
5590floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5591{
5592 flag aSign;
5593 int32 aExp;
bb98fe42 5594 uint64_t aSig0, aSig1;
158142c2
FB
5595
5596 aSig1 = extractFloat128Frac1( a );
5597 aSig0 = extractFloat128Frac0( a );
5598 aExp = extractFloat128Exp( a );
5599 aSign = extractFloat128Sign( a );
5600 if ( aExp == 0x7FFF ) {
5601 if ( aSig0 | aSig1 ) {
bcd4d9af 5602 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5603 }
5604 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5605 }
5606 if ( aExp == 0 ) {
5607 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5608 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5609 }
5610 else {
5611 aSig0 |= LIT64( 0x0001000000000000 );
5612 }
5613 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5614 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5615
5616}
5617
158142c2
FB
5618/*----------------------------------------------------------------------------
5619| Rounds the quadruple-precision floating-point value `a' to an integer, and
5620| returns the result as a quadruple-precision floating-point value. The
5621| operation is performed according to the IEC/IEEE Standard for Binary
5622| Floating-Point Arithmetic.
5623*----------------------------------------------------------------------------*/
5624
5625float128 float128_round_to_int( float128 a STATUS_PARAM )
5626{
5627 flag aSign;
5628 int32 aExp;
bb98fe42 5629 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5630 int8 roundingMode;
5631 float128 z;
5632
5633 aExp = extractFloat128Exp( a );
5634 if ( 0x402F <= aExp ) {
5635 if ( 0x406F <= aExp ) {
5636 if ( ( aExp == 0x7FFF )
5637 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5638 ) {
5639 return propagateFloat128NaN( a, a STATUS_VAR );
5640 }
5641 return a;
5642 }
5643 lastBitMask = 1;
5644 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5645 roundBitsMask = lastBitMask - 1;
5646 z = a;
5647 roundingMode = STATUS(float_rounding_mode);
5648 if ( roundingMode == float_round_nearest_even ) {
5649 if ( lastBitMask ) {
5650 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5651 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5652 }
5653 else {
bb98fe42 5654 if ( (int64_t) z.low < 0 ) {
158142c2 5655 ++z.high;
bb98fe42 5656 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
5657 }
5658 }
5659 }
5660 else if ( roundingMode != float_round_to_zero ) {
5661 if ( extractFloat128Sign( z )
5662 ^ ( roundingMode == float_round_up ) ) {
5663 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5664 }
5665 }
5666 z.low &= ~ roundBitsMask;
5667 }
5668 else {
5669 if ( aExp < 0x3FFF ) {
bb98fe42 5670 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
158142c2
FB
5671 STATUS(float_exception_flags) |= float_flag_inexact;
5672 aSign = extractFloat128Sign( a );
5673 switch ( STATUS(float_rounding_mode) ) {
5674 case float_round_nearest_even:
5675 if ( ( aExp == 0x3FFE )
5676 && ( extractFloat128Frac0( a )
5677 | extractFloat128Frac1( a ) )
5678 ) {
5679 return packFloat128( aSign, 0x3FFF, 0, 0 );
5680 }
5681 break;
5682 case float_round_down:
5683 return
5684 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5685 : packFloat128( 0, 0, 0, 0 );
5686 case float_round_up:
5687 return
5688 aSign ? packFloat128( 1, 0, 0, 0 )
5689 : packFloat128( 0, 0x3FFF, 0, 0 );
5690 }
5691 return packFloat128( aSign, 0, 0, 0 );
5692 }
5693 lastBitMask = 1;
5694 lastBitMask <<= 0x402F - aExp;
5695 roundBitsMask = lastBitMask - 1;
5696 z.low = 0;
5697 z.high = a.high;
5698 roundingMode = STATUS(float_rounding_mode);
5699 if ( roundingMode == float_round_nearest_even ) {
5700 z.high += lastBitMask>>1;
5701 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5702 z.high &= ~ lastBitMask;
5703 }
5704 }
5705 else if ( roundingMode != float_round_to_zero ) {
5706 if ( extractFloat128Sign( z )
5707 ^ ( roundingMode == float_round_up ) ) {
5708 z.high |= ( a.low != 0 );
5709 z.high += roundBitsMask;
5710 }
5711 }
5712 z.high &= ~ roundBitsMask;
5713 }
5714 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5715 STATUS(float_exception_flags) |= float_flag_inexact;
5716 }
5717 return z;
5718
5719}
5720
5721/*----------------------------------------------------------------------------
5722| Returns the result of adding the absolute values of the quadruple-precision
5723| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5724| before being returned. `zSign' is ignored if the result is a NaN.
5725| The addition is performed according to the IEC/IEEE Standard for Binary
5726| Floating-Point Arithmetic.
5727*----------------------------------------------------------------------------*/
5728
5729static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5730{
5731 int32 aExp, bExp, zExp;
bb98fe42 5732 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
158142c2
FB
5733 int32 expDiff;
5734
5735 aSig1 = extractFloat128Frac1( a );
5736 aSig0 = extractFloat128Frac0( a );
5737 aExp = extractFloat128Exp( a );
5738 bSig1 = extractFloat128Frac1( b );
5739 bSig0 = extractFloat128Frac0( b );
5740 bExp = extractFloat128Exp( b );
5741 expDiff = aExp - bExp;
5742 if ( 0 < expDiff ) {
5743 if ( aExp == 0x7FFF ) {
5744 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5745 return a;
5746 }
5747 if ( bExp == 0 ) {
5748 --expDiff;
5749 }
5750 else {
5751 bSig0 |= LIT64( 0x0001000000000000 );
5752 }
5753 shift128ExtraRightJamming(
5754 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5755 zExp = aExp;
5756 }
5757 else if ( expDiff < 0 ) {
5758 if ( bExp == 0x7FFF ) {
5759 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5760 return packFloat128( zSign, 0x7FFF, 0, 0 );
5761 }
5762 if ( aExp == 0 ) {
5763 ++expDiff;
5764 }
5765 else {
5766 aSig0 |= LIT64( 0x0001000000000000 );
5767 }
5768 shift128ExtraRightJamming(
5769 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5770 zExp = bExp;
5771 }
5772 else {
5773 if ( aExp == 0x7FFF ) {
5774 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5775 return propagateFloat128NaN( a, b STATUS_VAR );
5776 }
5777 return a;
5778 }
5779 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 5780 if ( aExp == 0 ) {
e6afc87f
PM
5781 if (STATUS(flush_to_zero)) {
5782 if (zSig0 | zSig1) {
5783 float_raise(float_flag_output_denormal STATUS_VAR);
5784 }
5785 return packFloat128(zSign, 0, 0, 0);
5786 }
fe76d976
PB
5787 return packFloat128( zSign, 0, zSig0, zSig1 );
5788 }
158142c2
FB
5789 zSig2 = 0;
5790 zSig0 |= LIT64( 0x0002000000000000 );
5791 zExp = aExp;
5792 goto shiftRight1;
5793 }
5794 aSig0 |= LIT64( 0x0001000000000000 );
5795 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5796 --zExp;
5797 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5798 ++zExp;
5799 shiftRight1:
5800 shift128ExtraRightJamming(
5801 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5802 roundAndPack:
5803 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5804
5805}
5806
5807/*----------------------------------------------------------------------------
5808| Returns the result of subtracting the absolute values of the quadruple-
5809| precision floating-point values `a' and `b'. If `zSign' is 1, the
5810| difference is negated before being returned. `zSign' is ignored if the
5811| result is a NaN. The subtraction is performed according to the IEC/IEEE
5812| Standard for Binary Floating-Point Arithmetic.
5813*----------------------------------------------------------------------------*/
5814
5815static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5816{
5817 int32 aExp, bExp, zExp;
bb98fe42 5818 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
158142c2
FB
5819 int32 expDiff;
5820 float128 z;
5821
5822 aSig1 = extractFloat128Frac1( a );
5823 aSig0 = extractFloat128Frac0( a );
5824 aExp = extractFloat128Exp( a );
5825 bSig1 = extractFloat128Frac1( b );
5826 bSig0 = extractFloat128Frac0( b );
5827 bExp = extractFloat128Exp( b );
5828 expDiff = aExp - bExp;
5829 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5830 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5831 if ( 0 < expDiff ) goto aExpBigger;
5832 if ( expDiff < 0 ) goto bExpBigger;
5833 if ( aExp == 0x7FFF ) {
5834 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5835 return propagateFloat128NaN( a, b STATUS_VAR );
5836 }
5837 float_raise( float_flag_invalid STATUS_VAR);
5838 z.low = float128_default_nan_low;
5839 z.high = float128_default_nan_high;
5840 return z;
5841 }
5842 if ( aExp == 0 ) {
5843 aExp = 1;
5844 bExp = 1;
5845 }
5846 if ( bSig0 < aSig0 ) goto aBigger;
5847 if ( aSig0 < bSig0 ) goto bBigger;
5848 if ( bSig1 < aSig1 ) goto aBigger;
5849 if ( aSig1 < bSig1 ) goto bBigger;
5850 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5851 bExpBigger:
5852 if ( bExp == 0x7FFF ) {
5853 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5854 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5855 }
5856 if ( aExp == 0 ) {
5857 ++expDiff;
5858 }
5859 else {
5860 aSig0 |= LIT64( 0x4000000000000000 );
5861 }
5862 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5863 bSig0 |= LIT64( 0x4000000000000000 );
5864 bBigger:
5865 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5866 zExp = bExp;
5867 zSign ^= 1;
5868 goto normalizeRoundAndPack;
5869 aExpBigger:
5870 if ( aExp == 0x7FFF ) {
5871 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5872 return a;
5873 }
5874 if ( bExp == 0 ) {
5875 --expDiff;
5876 }
5877 else {
5878 bSig0 |= LIT64( 0x4000000000000000 );
5879 }
5880 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5881 aSig0 |= LIT64( 0x4000000000000000 );
5882 aBigger:
5883 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5884 zExp = aExp;
5885 normalizeRoundAndPack:
5886 --zExp;
5887 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5888
5889}
5890
5891/*----------------------------------------------------------------------------
5892| Returns the result of adding the quadruple-precision floating-point values
5893| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5894| for Binary Floating-Point Arithmetic.
5895*----------------------------------------------------------------------------*/
5896
5897float128 float128_add( float128 a, float128 b STATUS_PARAM )
5898{
5899 flag aSign, bSign;
5900
5901 aSign = extractFloat128Sign( a );
5902 bSign = extractFloat128Sign( b );
5903 if ( aSign == bSign ) {
5904 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5905 }
5906 else {
5907 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5908 }
5909
5910}
5911
5912/*----------------------------------------------------------------------------
5913| Returns the result of subtracting the quadruple-precision floating-point
5914| values `a' and `b'. The operation is performed according to the IEC/IEEE
5915| Standard for Binary Floating-Point Arithmetic.
5916*----------------------------------------------------------------------------*/
5917
5918float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5919{
5920 flag aSign, bSign;
5921
5922 aSign = extractFloat128Sign( a );
5923 bSign = extractFloat128Sign( b );
5924 if ( aSign == bSign ) {
5925 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5926 }
5927 else {
5928 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5929 }
5930
5931}
5932
5933/*----------------------------------------------------------------------------
5934| Returns the result of multiplying the quadruple-precision floating-point
5935| values `a' and `b'. The operation is performed according to the IEC/IEEE
5936| Standard for Binary Floating-Point Arithmetic.
5937*----------------------------------------------------------------------------*/
5938
5939float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5940{
5941 flag aSign, bSign, zSign;
5942 int32 aExp, bExp, zExp;
bb98fe42 5943 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
5944 float128 z;
5945
5946 aSig1 = extractFloat128Frac1( a );
5947 aSig0 = extractFloat128Frac0( a );
5948 aExp = extractFloat128Exp( a );
5949 aSign = extractFloat128Sign( a );
5950 bSig1 = extractFloat128Frac1( b );
5951 bSig0 = extractFloat128Frac0( b );
5952 bExp = extractFloat128Exp( b );
5953 bSign = extractFloat128Sign( b );
5954 zSign = aSign ^ bSign;
5955 if ( aExp == 0x7FFF ) {
5956 if ( ( aSig0 | aSig1 )
5957 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5958 return propagateFloat128NaN( a, b STATUS_VAR );
5959 }
5960 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5961 return packFloat128( zSign, 0x7FFF, 0, 0 );
5962 }
5963 if ( bExp == 0x7FFF ) {
5964 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5965 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5966 invalid:
5967 float_raise( float_flag_invalid STATUS_VAR);
5968 z.low = float128_default_nan_low;
5969 z.high = float128_default_nan_high;
5970 return z;
5971 }
5972 return packFloat128( zSign, 0x7FFF, 0, 0 );
5973 }
5974 if ( aExp == 0 ) {
5975 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5976 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5977 }
5978 if ( bExp == 0 ) {
5979 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5980 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5981 }
5982 zExp = aExp + bExp - 0x4000;
5983 aSig0 |= LIT64( 0x0001000000000000 );
5984 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5985 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5986 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5987 zSig2 |= ( zSig3 != 0 );
5988 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5989 shift128ExtraRightJamming(
5990 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5991 ++zExp;
5992 }
5993 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5994
5995}
5996
5997/*----------------------------------------------------------------------------
5998| Returns the result of dividing the quadruple-precision floating-point value
5999| `a' by the corresponding value `b'. The operation is performed according to
6000| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6001*----------------------------------------------------------------------------*/
6002
6003float128 float128_div( float128 a, float128 b STATUS_PARAM )
6004{
6005 flag aSign, bSign, zSign;
6006 int32 aExp, bExp, zExp;
bb98fe42
AF
6007 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6008 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6009 float128 z;
6010
6011 aSig1 = extractFloat128Frac1( a );
6012 aSig0 = extractFloat128Frac0( a );
6013 aExp = extractFloat128Exp( a );
6014 aSign = extractFloat128Sign( a );
6015 bSig1 = extractFloat128Frac1( b );
6016 bSig0 = extractFloat128Frac0( b );
6017 bExp = extractFloat128Exp( b );
6018 bSign = extractFloat128Sign( b );
6019 zSign = aSign ^ bSign;
6020 if ( aExp == 0x7FFF ) {
6021 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6022 if ( bExp == 0x7FFF ) {
6023 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6024 goto invalid;
6025 }
6026 return packFloat128( zSign, 0x7FFF, 0, 0 );
6027 }
6028 if ( bExp == 0x7FFF ) {
6029 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6030 return packFloat128( zSign, 0, 0, 0 );
6031 }
6032 if ( bExp == 0 ) {
6033 if ( ( bSig0 | bSig1 ) == 0 ) {
6034 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6035 invalid:
6036 float_raise( float_flag_invalid STATUS_VAR);
6037 z.low = float128_default_nan_low;
6038 z.high = float128_default_nan_high;
6039 return z;
6040 }
6041 float_raise( float_flag_divbyzero STATUS_VAR);
6042 return packFloat128( zSign, 0x7FFF, 0, 0 );
6043 }
6044 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6045 }
6046 if ( aExp == 0 ) {
6047 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6048 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6049 }
6050 zExp = aExp - bExp + 0x3FFD;
6051 shortShift128Left(
6052 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6053 shortShift128Left(
6054 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6055 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6056 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6057 ++zExp;
6058 }
6059 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6060 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6061 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6062 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6063 --zSig0;
6064 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6065 }
6066 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6067 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6068 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6069 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6070 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6071 --zSig1;
6072 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6073 }
6074 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6075 }
6076 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6077 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6078
6079}
6080
6081/*----------------------------------------------------------------------------
6082| Returns the remainder of the quadruple-precision floating-point value `a'
6083| with respect to the corresponding value `b'. The operation is performed
6084| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6085*----------------------------------------------------------------------------*/
6086
6087float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6088{
ed086f3d 6089 flag aSign, zSign;
158142c2 6090 int32 aExp, bExp, expDiff;
bb98fe42
AF
6091 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6092 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6093 int64_t sigMean0;
158142c2
FB
6094 float128 z;
6095
6096 aSig1 = extractFloat128Frac1( a );
6097 aSig0 = extractFloat128Frac0( a );
6098 aExp = extractFloat128Exp( a );
6099 aSign = extractFloat128Sign( a );
6100 bSig1 = extractFloat128Frac1( b );
6101 bSig0 = extractFloat128Frac0( b );
6102 bExp = extractFloat128Exp( b );
158142c2
FB
6103 if ( aExp == 0x7FFF ) {
6104 if ( ( aSig0 | aSig1 )
6105 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6106 return propagateFloat128NaN( a, b STATUS_VAR );
6107 }
6108 goto invalid;
6109 }
6110 if ( bExp == 0x7FFF ) {
6111 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6112 return a;
6113 }
6114 if ( bExp == 0 ) {
6115 if ( ( bSig0 | bSig1 ) == 0 ) {
6116 invalid:
6117 float_raise( float_flag_invalid STATUS_VAR);
6118 z.low = float128_default_nan_low;
6119 z.high = float128_default_nan_high;
6120 return z;
6121 }
6122 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6123 }
6124 if ( aExp == 0 ) {
6125 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6126 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6127 }
6128 expDiff = aExp - bExp;
6129 if ( expDiff < -1 ) return a;
6130 shortShift128Left(
6131 aSig0 | LIT64( 0x0001000000000000 ),
6132 aSig1,
6133 15 - ( expDiff < 0 ),
6134 &aSig0,
6135 &aSig1
6136 );
6137 shortShift128Left(
6138 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6139 q = le128( bSig0, bSig1, aSig0, aSig1 );
6140 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6141 expDiff -= 64;
6142 while ( 0 < expDiff ) {
6143 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6144 q = ( 4 < q ) ? q - 4 : 0;
6145 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6146 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6147 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6148 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6149 expDiff -= 61;
6150 }
6151 if ( -64 < expDiff ) {
6152 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6153 q = ( 4 < q ) ? q - 4 : 0;
6154 q >>= - expDiff;
6155 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6156 expDiff += 52;
6157 if ( expDiff < 0 ) {
6158 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6159 }
6160 else {
6161 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6162 }
6163 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6164 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6165 }
6166 else {
6167 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6168 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6169 }
6170 do {
6171 alternateASig0 = aSig0;
6172 alternateASig1 = aSig1;
6173 ++q;
6174 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6175 } while ( 0 <= (int64_t) aSig0 );
158142c2 6176 add128(
bb98fe42 6177 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6178 if ( ( sigMean0 < 0 )
6179 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6180 aSig0 = alternateASig0;
6181 aSig1 = alternateASig1;
6182 }
bb98fe42 6183 zSign = ( (int64_t) aSig0 < 0 );
158142c2
FB
6184 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6185 return
6186 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6187
6188}
6189
6190/*----------------------------------------------------------------------------
6191| Returns the square root of the quadruple-precision floating-point value `a'.
6192| The operation is performed according to the IEC/IEEE Standard for Binary
6193| Floating-Point Arithmetic.
6194*----------------------------------------------------------------------------*/
6195
6196float128 float128_sqrt( float128 a STATUS_PARAM )
6197{
6198 flag aSign;
6199 int32 aExp, zExp;
bb98fe42
AF
6200 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6201 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6202 float128 z;
6203
6204 aSig1 = extractFloat128Frac1( a );
6205 aSig0 = extractFloat128Frac0( a );
6206 aExp = extractFloat128Exp( a );
6207 aSign = extractFloat128Sign( a );
6208 if ( aExp == 0x7FFF ) {
6209 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6210 if ( ! aSign ) return a;
6211 goto invalid;
6212 }
6213 if ( aSign ) {
6214 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6215 invalid:
6216 float_raise( float_flag_invalid STATUS_VAR);
6217 z.low = float128_default_nan_low;
6218 z.high = float128_default_nan_high;
6219 return z;
6220 }
6221 if ( aExp == 0 ) {
6222 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6223 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6224 }
6225 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6226 aSig0 |= LIT64( 0x0001000000000000 );
6227 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6228 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6229 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6230 doubleZSig0 = zSig0<<1;
6231 mul64To128( zSig0, zSig0, &term0, &term1 );
6232 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6233 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6234 --zSig0;
6235 doubleZSig0 -= 2;
6236 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6237 }
6238 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6239 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6240 if ( zSig1 == 0 ) zSig1 = 1;
6241 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6242 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6243 mul64To128( zSig1, zSig1, &term2, &term3 );
6244 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6245 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6246 --zSig1;
6247 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6248 term3 |= 1;
6249 term2 |= doubleZSig0;
6250 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6251 }
6252 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6253 }
6254 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6255 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6256
6257}
6258
6259/*----------------------------------------------------------------------------
6260| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6261| the corresponding value `b', and 0 otherwise. The invalid exception is
6262| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6263| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6264*----------------------------------------------------------------------------*/
6265
b689362d 6266int float128_eq( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6267{
6268
6269 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6270 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6271 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6272 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6273 ) {
b689362d 6274 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
6275 return 0;
6276 }
6277 return
6278 ( a.low == b.low )
6279 && ( ( a.high == b.high )
6280 || ( ( a.low == 0 )
bb98fe42 6281 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6282 );
6283
6284}
6285
6286/*----------------------------------------------------------------------------
6287| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6288| or equal to the corresponding value `b', and 0 otherwise. The invalid
6289| exception is raised if either operand is a NaN. The comparison is performed
6290| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6291*----------------------------------------------------------------------------*/
6292
750afe93 6293int float128_le( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6294{
6295 flag aSign, bSign;
6296
6297 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6298 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6299 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6300 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6301 ) {
6302 float_raise( float_flag_invalid STATUS_VAR);
6303 return 0;
6304 }
6305 aSign = extractFloat128Sign( a );
6306 bSign = extractFloat128Sign( b );
6307 if ( aSign != bSign ) {
6308 return
6309 aSign
bb98fe42 6310 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6311 == 0 );
6312 }
6313 return
6314 aSign ? le128( b.high, b.low, a.high, a.low )
6315 : le128( a.high, a.low, b.high, b.low );
6316
6317}
6318
6319/*----------------------------------------------------------------------------
6320| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6321| the corresponding value `b', and 0 otherwise. The invalid exception is
6322| raised if either operand is a NaN. The comparison is performed according
6323| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6324*----------------------------------------------------------------------------*/
6325
750afe93 6326int float128_lt( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6327{
6328 flag aSign, bSign;
6329
6330 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6331 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6332 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6333 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6334 ) {
6335 float_raise( float_flag_invalid STATUS_VAR);
6336 return 0;
6337 }
6338 aSign = extractFloat128Sign( a );
6339 bSign = extractFloat128Sign( b );
6340 if ( aSign != bSign ) {
6341 return
6342 aSign
bb98fe42 6343 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6344 != 0 );
6345 }
6346 return
6347 aSign ? lt128( b.high, b.low, a.high, a.low )
6348 : lt128( a.high, a.low, b.high, b.low );
6349
6350}
6351
67b7861d
AJ
6352/*----------------------------------------------------------------------------
6353| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6354| be compared, and 0 otherwise. The invalid exception is raised if either
6355| operand is a NaN. The comparison is performed according to the IEC/IEEE
6356| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6357*----------------------------------------------------------------------------*/
6358
6359int float128_unordered( float128 a, float128 b STATUS_PARAM )
6360{
6361 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6362 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6363 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6364 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6365 ) {
6366 float_raise( float_flag_invalid STATUS_VAR);
6367 return 1;
6368 }
6369 return 0;
6370}
6371
158142c2
FB
6372/*----------------------------------------------------------------------------
6373| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6374| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6375| exception. The comparison is performed according to the IEC/IEEE Standard
6376| for Binary Floating-Point Arithmetic.
158142c2
FB
6377*----------------------------------------------------------------------------*/
6378
b689362d 6379int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6380{
6381
6382 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6383 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6384 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6385 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6386 ) {
b689362d
AJ
6387 if ( float128_is_signaling_nan( a )
6388 || float128_is_signaling_nan( b ) ) {
6389 float_raise( float_flag_invalid STATUS_VAR);
6390 }
158142c2
FB
6391 return 0;
6392 }
6393 return
6394 ( a.low == b.low )
6395 && ( ( a.high == b.high )
6396 || ( ( a.low == 0 )
bb98fe42 6397 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6398 );
6399
6400}
6401
6402/*----------------------------------------------------------------------------
6403| Returns 1 if the quadruple-precision floating-point value `a' is less than
6404| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6405| cause an exception. Otherwise, the comparison is performed according to the
6406| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6407*----------------------------------------------------------------------------*/
6408
750afe93 6409int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6410{
6411 flag aSign, bSign;
6412
6413 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6414 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6415 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6416 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6417 ) {
6418 if ( float128_is_signaling_nan( a )
6419 || float128_is_signaling_nan( b ) ) {
6420 float_raise( float_flag_invalid STATUS_VAR);
6421 }
6422 return 0;
6423 }
6424 aSign = extractFloat128Sign( a );
6425 bSign = extractFloat128Sign( b );
6426 if ( aSign != bSign ) {
6427 return
6428 aSign
bb98fe42 6429 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6430 == 0 );
6431 }
6432 return
6433 aSign ? le128( b.high, b.low, a.high, a.low )
6434 : le128( a.high, a.low, b.high, b.low );
6435
6436}
6437
6438/*----------------------------------------------------------------------------
6439| Returns 1 if the quadruple-precision floating-point value `a' is less than
6440| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6441| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6442| Standard for Binary Floating-Point Arithmetic.
6443*----------------------------------------------------------------------------*/
6444
750afe93 6445int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6446{
6447 flag aSign, bSign;
6448
6449 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6450 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6451 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6452 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6453 ) {
6454 if ( float128_is_signaling_nan( a )
6455 || float128_is_signaling_nan( b ) ) {
6456 float_raise( float_flag_invalid STATUS_VAR);
6457 }
6458 return 0;
6459 }
6460 aSign = extractFloat128Sign( a );
6461 bSign = extractFloat128Sign( b );
6462 if ( aSign != bSign ) {
6463 return
6464 aSign
bb98fe42 6465 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6466 != 0 );
6467 }
6468 return
6469 aSign ? lt128( b.high, b.low, a.high, a.low )
6470 : lt128( a.high, a.low, b.high, b.low );
6471
6472}
6473
67b7861d
AJ
6474/*----------------------------------------------------------------------------
6475| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6476| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6477| comparison is performed according to the IEC/IEEE Standard for Binary
6478| Floating-Point Arithmetic.
6479*----------------------------------------------------------------------------*/
6480
6481int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6482{
6483 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6484 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6485 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6486 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6487 ) {
6488 if ( float128_is_signaling_nan( a )
6489 || float128_is_signaling_nan( b ) ) {
6490 float_raise( float_flag_invalid STATUS_VAR);
6491 }
6492 return 1;
6493 }
6494 return 0;
6495}
6496
1d6bda35 6497/* misc functions */
c4850f9e 6498float32 uint32_to_float32(uint32_t a STATUS_PARAM)
1d6bda35
FB
6499{
6500 return int64_to_float32(a STATUS_VAR);
6501}
6502
c4850f9e 6503float64 uint32_to_float64(uint32_t a STATUS_PARAM)
1d6bda35
FB
6504{
6505 return int64_to_float64(a STATUS_VAR);
6506}
6507
9f8d2a09 6508uint32 float32_to_uint32( float32 a STATUS_PARAM )
1d6bda35
FB
6509{
6510 int64_t v;
9f8d2a09 6511 uint32 res;
1d6bda35
FB
6512
6513 v = float32_to_int64(a STATUS_VAR);
6514 if (v < 0) {
6515 res = 0;
6516 float_raise( float_flag_invalid STATUS_VAR);
6517 } else if (v > 0xffffffff) {
6518 res = 0xffffffff;
6519 float_raise( float_flag_invalid STATUS_VAR);
6520 } else {
6521 res = v;
6522 }
6523 return res;
6524}
6525
9f8d2a09 6526uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
1d6bda35
FB
6527{
6528 int64_t v;
9f8d2a09 6529 uint32 res;
1d6bda35
FB
6530
6531 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6532 if (v < 0) {
6533 res = 0;
6534 float_raise( float_flag_invalid STATUS_VAR);
6535 } else if (v > 0xffffffff) {
6536 res = 0xffffffff;
6537 float_raise( float_flag_invalid STATUS_VAR);
6538 } else {
6539 res = v;
6540 }
6541 return res;
6542}
6543
f581bf54
WN
6544int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6545{
6546 int32_t v;
6547 int_fast16_t res;
6548 int old_exc_flags = get_float_exception_flags(status);
6549
6550 v = float32_to_int32(a STATUS_VAR);
6551 if (v < -0x8000) {
6552 res = -0x8000;
6553 } else if (v > 0x7fff) {
6554 res = 0x7fff;
6555 } else {
6556 return v;
6557 }
6558
6559 set_float_exception_flags(old_exc_flags, status);
6560 float_raise(float_flag_invalid STATUS_VAR);
6561 return res;
6562}
6563
6564uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6565{
6566 int32_t v;
6567 uint_fast16_t res;
6568 int old_exc_flags = get_float_exception_flags(status);
6569
6570 v = float32_to_int32(a STATUS_VAR);
6571 if (v < 0) {
6572 res = 0;
6573 } else if (v > 0xffff) {
6574 res = 0xffff;
6575 } else {
6576 return v;
6577 }
6578
6579 set_float_exception_flags(old_exc_flags, status);
6580 float_raise(float_flag_invalid STATUS_VAR);
6581 return res;
6582}
6583
5aea4c58 6584uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
6585{
6586 int64_t v;
5aea4c58 6587 uint_fast16_t res;
cbcef455
PM
6588
6589 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6590 if (v < 0) {
6591 res = 0;
6592 float_raise( float_flag_invalid STATUS_VAR);
6593 } else if (v > 0xffff) {
6594 res = 0xffff;
6595 float_raise( float_flag_invalid STATUS_VAR);
6596 } else {
6597 res = v;
6598 }
6599 return res;
6600}
6601
9f8d2a09 6602uint32 float64_to_uint32( float64 a STATUS_PARAM )
1d6bda35
FB
6603{
6604 int64_t v;
9f8d2a09 6605 uint32 res;
1d6bda35
FB
6606
6607 v = float64_to_int64(a STATUS_VAR);
6608 if (v < 0) {
6609 res = 0;
6610 float_raise( float_flag_invalid STATUS_VAR);
6611 } else if (v > 0xffffffff) {
6612 res = 0xffffffff;
6613 float_raise( float_flag_invalid STATUS_VAR);
6614 } else {
6615 res = v;
6616 }
6617 return res;
6618}
6619
9f8d2a09 6620uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
1d6bda35
FB
6621{
6622 int64_t v;
9f8d2a09 6623 uint32 res;
1d6bda35
FB
6624
6625 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6626 if (v < 0) {
6627 res = 0;
6628 float_raise( float_flag_invalid STATUS_VAR);
6629 } else if (v > 0xffffffff) {
6630 res = 0xffffffff;
6631 float_raise( float_flag_invalid STATUS_VAR);
6632 } else {
6633 res = v;
6634 }
6635 return res;
6636}
6637
f581bf54
WN
6638int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6639{
6640 int64_t v;
6641 int_fast16_t res;
6642 int old_exc_flags = get_float_exception_flags(status);
6643
6644 v = float64_to_int32(a STATUS_VAR);
6645 if (v < -0x8000) {
6646 res = -0x8000;
6647 } else if (v > 0x7fff) {
6648 res = 0x7fff;
6649 } else {
6650 return v;
6651 }
6652
6653 set_float_exception_flags(old_exc_flags, status);
6654 float_raise(float_flag_invalid STATUS_VAR);
6655 return res;
6656}
6657
6658uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6659{
6660 int64_t v;
6661 uint_fast16_t res;
6662 int old_exc_flags = get_float_exception_flags(status);
6663
6664 v = float64_to_int32(a STATUS_VAR);
6665 if (v < 0) {
6666 res = 0;
6667 } else if (v > 0xffff) {
6668 res = 0xffff;
6669 } else {
6670 return v;
6671 }
6672
6673 set_float_exception_flags(old_exc_flags, status);
6674 float_raise(float_flag_invalid STATUS_VAR);
6675 return res;
6676}
6677
5aea4c58 6678uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
6679{
6680 int64_t v;
5aea4c58 6681 uint_fast16_t res;
cbcef455
PM
6682
6683 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6684 if (v < 0) {
6685 res = 0;
6686 float_raise( float_flag_invalid STATUS_VAR);
6687 } else if (v > 0xffff) {
6688 res = 0xffff;
6689 float_raise( float_flag_invalid STATUS_VAR);
6690 } else {
6691 res = v;
6692 }
6693 return res;
6694}
6695
fb3ea83a
TM
6696/*----------------------------------------------------------------------------
6697| Returns the result of converting the double-precision floating-point value
6698| `a' to the 64-bit unsigned integer format. The conversion is
6699| performed according to the IEC/IEEE Standard for Binary Floating-Point
6700| Arithmetic---which means in particular that the conversion is rounded
6701| according to the current rounding mode. If `a' is a NaN, the largest
6702| positive integer is returned. If the conversion overflows, the
6703| largest unsigned integer is returned. If 'a' is negative, the value is
6704| rounded and zero is returned; negative values that do not round to zero
6705| will raise the inexact exception.
6706*----------------------------------------------------------------------------*/
75d62a58 6707
fb3ea83a
TM
6708uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6709{
6710 flag aSign;
6711 int_fast16_t aExp, shiftCount;
6712 uint64_t aSig, aSigExtra;
6713 a = float64_squash_input_denormal(a STATUS_VAR);
75d62a58 6714
fb3ea83a
TM
6715 aSig = extractFloat64Frac(a);
6716 aExp = extractFloat64Exp(a);
6717 aSign = extractFloat64Sign(a);
6718 if (aSign && (aExp > 1022)) {
6719 float_raise(float_flag_invalid STATUS_VAR);
6720 if (float64_is_any_nan(a)) {
6721 return LIT64(0xFFFFFFFFFFFFFFFF);
6722 } else {
6723 return 0;
6724 }
6725 }
6726 if (aExp) {
6727 aSig |= LIT64(0x0010000000000000);
6728 }
6729 shiftCount = 0x433 - aExp;
6730 if (shiftCount <= 0) {
6731 if (0x43E < aExp) {
6732 float_raise(float_flag_invalid STATUS_VAR);
6733 return LIT64(0xFFFFFFFFFFFFFFFF);
6734 }
6735 aSigExtra = 0;
6736 aSig <<= -shiftCount;
6737 } else {
6738 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
6739 }
6740 return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
75d62a58
JM
6741}
6742
6743uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6744{
6745 int64_t v;
6746
f090c9d4
PB
6747 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6748 v += float64_val(a);
6749 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
75d62a58
JM
6750
6751 return v - INT64_MIN;
6752}
6753
1d6bda35 6754#define COMPARE(s, nan_exp) \
750afe93 6755INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
1d6bda35
FB
6756 int is_quiet STATUS_PARAM ) \
6757{ \
6758 flag aSign, bSign; \
bb98fe42 6759 uint ## s ## _t av, bv; \
37d18660
PM
6760 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6761 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
1d6bda35
FB
6762 \
6763 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6764 extractFloat ## s ## Frac( a ) ) || \
6765 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6766 extractFloat ## s ## Frac( b ) )) { \
6767 if (!is_quiet || \
6768 float ## s ## _is_signaling_nan( a ) || \
6769 float ## s ## _is_signaling_nan( b ) ) { \
6770 float_raise( float_flag_invalid STATUS_VAR); \
6771 } \
6772 return float_relation_unordered; \
6773 } \
6774 aSign = extractFloat ## s ## Sign( a ); \
6775 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 6776 av = float ## s ## _val(a); \
cd8a2533 6777 bv = float ## s ## _val(b); \
1d6bda35 6778 if ( aSign != bSign ) { \
bb98fe42 6779 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
6780 /* zero case */ \
6781 return float_relation_equal; \
6782 } else { \
6783 return 1 - (2 * aSign); \
6784 } \
6785 } else { \
f090c9d4 6786 if (av == bv) { \
1d6bda35
FB
6787 return float_relation_equal; \
6788 } else { \
f090c9d4 6789 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
6790 } \
6791 } \
6792} \
6793 \
750afe93 6794int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6795{ \
6796 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6797} \
6798 \
750afe93 6799int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
6800{ \
6801 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6802}
6803
6804COMPARE(32, 0xff)
6805COMPARE(64, 0x7ff)
9ee6e8bb 6806
f6714d36
AJ
6807INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6808 int is_quiet STATUS_PARAM )
6809{
6810 flag aSign, bSign;
6811
6812 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6813 ( extractFloatx80Frac( a )<<1 ) ) ||
6814 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6815 ( extractFloatx80Frac( b )<<1 ) )) {
6816 if (!is_quiet ||
6817 floatx80_is_signaling_nan( a ) ||
6818 floatx80_is_signaling_nan( b ) ) {
6819 float_raise( float_flag_invalid STATUS_VAR);
6820 }
6821 return float_relation_unordered;
6822 }
6823 aSign = extractFloatx80Sign( a );
6824 bSign = extractFloatx80Sign( b );
6825 if ( aSign != bSign ) {
6826
6827 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6828 ( ( a.low | b.low ) == 0 ) ) {
6829 /* zero case */
6830 return float_relation_equal;
6831 } else {
6832 return 1 - (2 * aSign);
6833 }
6834 } else {
6835 if (a.low == b.low && a.high == b.high) {
6836 return float_relation_equal;
6837 } else {
6838 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6839 }
6840 }
6841}
6842
6843int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6844{
6845 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6846}
6847
6848int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6849{
6850 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6851}
6852
1f587329
BS
6853INLINE int float128_compare_internal( float128 a, float128 b,
6854 int is_quiet STATUS_PARAM )
6855{
6856 flag aSign, bSign;
6857
6858 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6859 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6860 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6861 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6862 if (!is_quiet ||
6863 float128_is_signaling_nan( a ) ||
6864 float128_is_signaling_nan( b ) ) {
6865 float_raise( float_flag_invalid STATUS_VAR);
6866 }
6867 return float_relation_unordered;
6868 }
6869 aSign = extractFloat128Sign( a );
6870 bSign = extractFloat128Sign( b );
6871 if ( aSign != bSign ) {
6872 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6873 /* zero case */
6874 return float_relation_equal;
6875 } else {
6876 return 1 - (2 * aSign);
6877 }
6878 } else {
6879 if (a.low == b.low && a.high == b.high) {
6880 return float_relation_equal;
6881 } else {
6882 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6883 }
6884 }
6885}
6886
6887int float128_compare( float128 a, float128 b STATUS_PARAM )
6888{
6889 return float128_compare_internal(a, b, 0 STATUS_VAR);
6890}
6891
6892int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6893{
6894 return float128_compare_internal(a, b, 1 STATUS_VAR);
6895}
6896
274f1b04
PM
6897/* min() and max() functions. These can't be implemented as
6898 * 'compare and pick one input' because that would mishandle
6899 * NaNs and +0 vs -0.
e17ab310
WN
6900 *
6901 * minnum() and maxnum() functions. These are similar to the min()
6902 * and max() functions but if one of the arguments is a QNaN and
6903 * the other is numerical then the numerical argument is returned.
6904 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
6905 * and maxNum() operations. min() and max() are the typical min/max
6906 * semantics provided by many CPUs which predate that specification.
274f1b04 6907 */
e70614ea 6908#define MINMAX(s) \
274f1b04 6909INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
e17ab310 6910 int ismin, int isieee STATUS_PARAM) \
274f1b04
PM
6911{ \
6912 flag aSign, bSign; \
6913 uint ## s ## _t av, bv; \
6914 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6915 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6916 if (float ## s ## _is_any_nan(a) || \
6917 float ## s ## _is_any_nan(b)) { \
e17ab310
WN
6918 if (isieee) { \
6919 if (float ## s ## _is_quiet_nan(a) && \
6920 !float ## s ##_is_any_nan(b)) { \
6921 return b; \
6922 } else if (float ## s ## _is_quiet_nan(b) && \
6923 !float ## s ## _is_any_nan(a)) { \
6924 return a; \
6925 } \
6926 } \
274f1b04
PM
6927 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6928 } \
6929 aSign = extractFloat ## s ## Sign(a); \
6930 bSign = extractFloat ## s ## Sign(b); \
6931 av = float ## s ## _val(a); \
6932 bv = float ## s ## _val(b); \
6933 if (aSign != bSign) { \
6934 if (ismin) { \
6935 return aSign ? a : b; \
6936 } else { \
6937 return aSign ? b : a; \
6938 } \
6939 } else { \
6940 if (ismin) { \
6941 return (aSign ^ (av < bv)) ? a : b; \
6942 } else { \
6943 return (aSign ^ (av < bv)) ? b : a; \
6944 } \
6945 } \
6946} \
6947 \
6948float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6949{ \
e17ab310 6950 return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
274f1b04
PM
6951} \
6952 \
6953float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6954{ \
e17ab310
WN
6955 return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
6956} \
6957 \
6958float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
6959{ \
6960 return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
6961} \
6962 \
6963float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
6964{ \
6965 return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
274f1b04
PM
6966}
6967
e70614ea
WN
6968MINMAX(32)
6969MINMAX(64)
274f1b04
PM
6970
6971
9ee6e8bb
PB
6972/* Multiply A by 2 raised to the power N. */
6973float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6974{
6975 flag aSign;
326b9e98 6976 int16_t aExp;
bb98fe42 6977 uint32_t aSig;
9ee6e8bb 6978
37d18660 6979 a = float32_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
6980 aSig = extractFloat32Frac( a );
6981 aExp = extractFloat32Exp( a );
6982 aSign = extractFloat32Sign( a );
6983
6984 if ( aExp == 0xFF ) {
326b9e98
AJ
6985 if ( aSig ) {
6986 return propagateFloat32NaN( a, a STATUS_VAR );
6987 }
9ee6e8bb
PB
6988 return a;
6989 }
69397542
PB
6990 if ( aExp != 0 )
6991 aSig |= 0x00800000;
6992 else if ( aSig == 0 )
6993 return a;
6994
326b9e98
AJ
6995 if (n > 0x200) {
6996 n = 0x200;
6997 } else if (n < -0x200) {
6998 n = -0x200;
6999 }
7000
69397542
PB
7001 aExp += n - 1;
7002 aSig <<= 7;
7003 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
7004}
7005
7006float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7007{
7008 flag aSign;
326b9e98 7009 int16_t aExp;
bb98fe42 7010 uint64_t aSig;
9ee6e8bb 7011
37d18660 7012 a = float64_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
7013 aSig = extractFloat64Frac( a );
7014 aExp = extractFloat64Exp( a );
7015 aSign = extractFloat64Sign( a );
7016
7017 if ( aExp == 0x7FF ) {
326b9e98
AJ
7018 if ( aSig ) {
7019 return propagateFloat64NaN( a, a STATUS_VAR );
7020 }
9ee6e8bb
PB
7021 return a;
7022 }
69397542
PB
7023 if ( aExp != 0 )
7024 aSig |= LIT64( 0x0010000000000000 );
7025 else if ( aSig == 0 )
7026 return a;
7027
326b9e98
AJ
7028 if (n > 0x1000) {
7029 n = 0x1000;
7030 } else if (n < -0x1000) {
7031 n = -0x1000;
7032 }
7033
69397542
PB
7034 aExp += n - 1;
7035 aSig <<= 10;
7036 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
7037}
7038
9ee6e8bb
PB
7039floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7040{
7041 flag aSign;
326b9e98 7042 int32_t aExp;
bb98fe42 7043 uint64_t aSig;
9ee6e8bb
PB
7044
7045 aSig = extractFloatx80Frac( a );
7046 aExp = extractFloatx80Exp( a );
7047 aSign = extractFloatx80Sign( a );
7048
326b9e98
AJ
7049 if ( aExp == 0x7FFF ) {
7050 if ( aSig<<1 ) {
7051 return propagateFloatx80NaN( a, a STATUS_VAR );
7052 }
9ee6e8bb
PB
7053 return a;
7054 }
326b9e98 7055
69397542
PB
7056 if (aExp == 0 && aSig == 0)
7057 return a;
7058
326b9e98
AJ
7059 if (n > 0x10000) {
7060 n = 0x10000;
7061 } else if (n < -0x10000) {
7062 n = -0x10000;
7063 }
7064
9ee6e8bb 7065 aExp += n;
69397542
PB
7066 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7067 aSign, aExp, aSig, 0 STATUS_VAR );
9ee6e8bb 7068}
9ee6e8bb 7069
9ee6e8bb
PB
7070float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7071{
7072 flag aSign;
326b9e98 7073 int32_t aExp;
bb98fe42 7074 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7075
7076 aSig1 = extractFloat128Frac1( a );
7077 aSig0 = extractFloat128Frac0( a );
7078 aExp = extractFloat128Exp( a );
7079 aSign = extractFloat128Sign( a );
7080 if ( aExp == 0x7FFF ) {
326b9e98
AJ
7081 if ( aSig0 | aSig1 ) {
7082 return propagateFloat128NaN( a, a STATUS_VAR );
7083 }
9ee6e8bb
PB
7084 return a;
7085 }
69397542
PB
7086 if ( aExp != 0 )
7087 aSig0 |= LIT64( 0x0001000000000000 );
7088 else if ( aSig0 == 0 && aSig1 == 0 )
7089 return a;
7090
326b9e98
AJ
7091 if (n > 0x10000) {
7092 n = 0x10000;
7093 } else if (n < -0x10000) {
7094 n = -0x10000;
7095 }
7096
69397542
PB
7097 aExp += n - 1;
7098 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7099 STATUS_VAR );
9ee6e8bb
PB
7100
7101}