]> git.proxmox.com Git - qemu.git/blob - fpu/softfloat.c
softfloat: Add new flag for when denormal result is flushed to zero
[qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
6
7 /*============================================================================
8
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
11
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
21
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
35
36 =============================================================================*/
37
38 #include "softfloat.h"
39
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target if
43 | desired.)
44 *----------------------------------------------------------------------------*/
45 #include "softfloat-macros.h"
46
47 /*----------------------------------------------------------------------------
48 | Functions and definitions to determine: (1) whether tininess for underflow
49 | is detected before or after rounding by default, (2) what (if anything)
50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 | are propagated from function inputs to output. These details are target-
53 | specific.
54 *----------------------------------------------------------------------------*/
55 #include "softfloat-specialize.h"
56
57 void set_float_rounding_mode(int val STATUS_PARAM)
58 {
59 STATUS(float_rounding_mode) = val;
60 }
61
62 void set_float_exception_flags(int val STATUS_PARAM)
63 {
64 STATUS(float_exception_flags) = val;
65 }
66
67 #ifdef FLOATX80
68 void set_floatx80_rounding_precision(int val STATUS_PARAM)
69 {
70 STATUS(floatx80_rounding_precision) = val;
71 }
72 #endif
73
74 /*----------------------------------------------------------------------------
75 | Returns the fraction bits of the half-precision floating-point value `a'.
76 *----------------------------------------------------------------------------*/
77
78 INLINE uint32_t extractFloat16Frac(float16 a)
79 {
80 return float16_val(a) & 0x3ff;
81 }
82
83 /*----------------------------------------------------------------------------
84 | Returns the exponent bits of the half-precision floating-point value `a'.
85 *----------------------------------------------------------------------------*/
86
87 INLINE int16 extractFloat16Exp(float16 a)
88 {
89 return (float16_val(a) >> 10) & 0x1f;
90 }
91
92 /*----------------------------------------------------------------------------
93 | Returns the sign bit of the single-precision floating-point value `a'.
94 *----------------------------------------------------------------------------*/
95
96 INLINE flag extractFloat16Sign(float16 a)
97 {
98 return float16_val(a)>>15;
99 }
100
101 /*----------------------------------------------------------------------------
102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
104 | input. If `zSign' is 1, the input is negated before being converted to an
105 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
106 | is simply rounded to an integer, with the inexact exception raised if the
107 | input cannot be represented exactly as an integer. However, if the fixed-
108 | point input is too large, the invalid exception is raised and the largest
109 | positive or negative integer is returned.
110 *----------------------------------------------------------------------------*/
111
112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
113 {
114 int8 roundingMode;
115 flag roundNearestEven;
116 int8 roundIncrement, roundBits;
117 int32 z;
118
119 roundingMode = STATUS(float_rounding_mode);
120 roundNearestEven = ( roundingMode == float_round_nearest_even );
121 roundIncrement = 0x40;
122 if ( ! roundNearestEven ) {
123 if ( roundingMode == float_round_to_zero ) {
124 roundIncrement = 0;
125 }
126 else {
127 roundIncrement = 0x7F;
128 if ( zSign ) {
129 if ( roundingMode == float_round_up ) roundIncrement = 0;
130 }
131 else {
132 if ( roundingMode == float_round_down ) roundIncrement = 0;
133 }
134 }
135 }
136 roundBits = absZ & 0x7F;
137 absZ = ( absZ + roundIncrement )>>7;
138 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
139 z = absZ;
140 if ( zSign ) z = - z;
141 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
142 float_raise( float_flag_invalid STATUS_VAR);
143 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
144 }
145 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
146 return z;
147
148 }
149
150 /*----------------------------------------------------------------------------
151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
153 | and returns the properly rounded 64-bit integer corresponding to the input.
154 | If `zSign' is 1, the input is negated before being converted to an integer.
155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
156 | the inexact exception raised if the input cannot be represented exactly as
157 | an integer. However, if the fixed-point input is too large, the invalid
158 | exception is raised and the largest positive or negative integer is
159 | returned.
160 *----------------------------------------------------------------------------*/
161
162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
163 {
164 int8 roundingMode;
165 flag roundNearestEven, increment;
166 int64 z;
167
168 roundingMode = STATUS(float_rounding_mode);
169 roundNearestEven = ( roundingMode == float_round_nearest_even );
170 increment = ( (int64_t) absZ1 < 0 );
171 if ( ! roundNearestEven ) {
172 if ( roundingMode == float_round_to_zero ) {
173 increment = 0;
174 }
175 else {
176 if ( zSign ) {
177 increment = ( roundingMode == float_round_down ) && absZ1;
178 }
179 else {
180 increment = ( roundingMode == float_round_up ) && absZ1;
181 }
182 }
183 }
184 if ( increment ) {
185 ++absZ0;
186 if ( absZ0 == 0 ) goto overflow;
187 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
188 }
189 z = absZ0;
190 if ( zSign ) z = - z;
191 if ( z && ( ( z < 0 ) ^ zSign ) ) {
192 overflow:
193 float_raise( float_flag_invalid STATUS_VAR);
194 return
195 zSign ? (int64_t) LIT64( 0x8000000000000000 )
196 : LIT64( 0x7FFFFFFFFFFFFFFF );
197 }
198 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
199 return z;
200
201 }
202
203 /*----------------------------------------------------------------------------
204 | Returns the fraction bits of the single-precision floating-point value `a'.
205 *----------------------------------------------------------------------------*/
206
207 INLINE uint32_t extractFloat32Frac( float32 a )
208 {
209
210 return float32_val(a) & 0x007FFFFF;
211
212 }
213
214 /*----------------------------------------------------------------------------
215 | Returns the exponent bits of the single-precision floating-point value `a'.
216 *----------------------------------------------------------------------------*/
217
218 INLINE int16 extractFloat32Exp( float32 a )
219 {
220
221 return ( float32_val(a)>>23 ) & 0xFF;
222
223 }
224
225 /*----------------------------------------------------------------------------
226 | Returns the sign bit of the single-precision floating-point value `a'.
227 *----------------------------------------------------------------------------*/
228
229 INLINE flag extractFloat32Sign( float32 a )
230 {
231
232 return float32_val(a)>>31;
233
234 }
235
236 /*----------------------------------------------------------------------------
237 | If `a' is denormal and we are in flush-to-zero mode then set the
238 | input-denormal exception and return zero. Otherwise just return the value.
239 *----------------------------------------------------------------------------*/
240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
241 {
242 if (STATUS(flush_inputs_to_zero)) {
243 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
244 float_raise(float_flag_input_denormal STATUS_VAR);
245 return make_float32(float32_val(a) & 0x80000000);
246 }
247 }
248 return a;
249 }
250
251 /*----------------------------------------------------------------------------
252 | Normalizes the subnormal single-precision floating-point value represented
253 | by the denormalized significand `aSig'. The normalized exponent and
254 | significand are stored at the locations pointed to by `zExpPtr' and
255 | `zSigPtr', respectively.
256 *----------------------------------------------------------------------------*/
257
258 static void
259 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
260 {
261 int8 shiftCount;
262
263 shiftCount = countLeadingZeros32( aSig ) - 8;
264 *zSigPtr = aSig<<shiftCount;
265 *zExpPtr = 1 - shiftCount;
266
267 }
268
269 /*----------------------------------------------------------------------------
270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271 | single-precision floating-point value, returning the result. After being
272 | shifted into the proper positions, the three fields are simply added
273 | together to form the result. This means that any integer portion of `zSig'
274 | will be added into the exponent. Since a properly normalized significand
275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
276 | than the desired result exponent whenever `zSig' is a complete, normalized
277 | significand.
278 *----------------------------------------------------------------------------*/
279
280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
281 {
282
283 return make_float32(
284 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
285
286 }
287
288 /*----------------------------------------------------------------------------
289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290 | and significand `zSig', and returns the proper single-precision floating-
291 | point value corresponding to the abstract input. Ordinarily, the abstract
292 | value is simply rounded and packed into the single-precision format, with
293 | the inexact exception raised if the abstract input cannot be represented
294 | exactly. However, if the abstract value is too large, the overflow and
295 | inexact exceptions are raised and an infinity or maximal finite value is
296 | returned. If the abstract value is too small, the input value is rounded to
297 | a subnormal number, and the underflow and inexact exceptions are raised if
298 | the abstract input cannot be represented exactly as a subnormal single-
299 | precision floating-point number.
300 | The input significand `zSig' has its binary point between bits 30
301 | and 29, which is 7 bits to the left of the usual location. This shifted
302 | significand must be normalized or smaller. If `zSig' is not normalized,
303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
304 | and it must not require rounding. In the usual case that `zSig' is
305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
307 | Binary Floating-Point Arithmetic.
308 *----------------------------------------------------------------------------*/
309
310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
311 {
312 int8 roundingMode;
313 flag roundNearestEven;
314 int8 roundIncrement, roundBits;
315 flag isTiny;
316
317 roundingMode = STATUS(float_rounding_mode);
318 roundNearestEven = ( roundingMode == float_round_nearest_even );
319 roundIncrement = 0x40;
320 if ( ! roundNearestEven ) {
321 if ( roundingMode == float_round_to_zero ) {
322 roundIncrement = 0;
323 }
324 else {
325 roundIncrement = 0x7F;
326 if ( zSign ) {
327 if ( roundingMode == float_round_up ) roundIncrement = 0;
328 }
329 else {
330 if ( roundingMode == float_round_down ) roundIncrement = 0;
331 }
332 }
333 }
334 roundBits = zSig & 0x7F;
335 if ( 0xFD <= (uint16_t) zExp ) {
336 if ( ( 0xFD < zExp )
337 || ( ( zExp == 0xFD )
338 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
339 ) {
340 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
341 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
342 }
343 if ( zExp < 0 ) {
344 if (STATUS(flush_to_zero)) {
345 float_raise(float_flag_output_denormal STATUS_VAR);
346 return packFloat32(zSign, 0, 0);
347 }
348 isTiny =
349 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
350 || ( zExp < -1 )
351 || ( zSig + roundIncrement < 0x80000000 );
352 shift32RightJamming( zSig, - zExp, &zSig );
353 zExp = 0;
354 roundBits = zSig & 0x7F;
355 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
356 }
357 }
358 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
359 zSig = ( zSig + roundIncrement )>>7;
360 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
361 if ( zSig == 0 ) zExp = 0;
362 return packFloat32( zSign, zExp, zSig );
363
364 }
365
366 /*----------------------------------------------------------------------------
367 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 | and significand `zSig', and returns the proper single-precision floating-
369 | point value corresponding to the abstract input. This routine is just like
370 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
371 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
372 | floating-point exponent.
373 *----------------------------------------------------------------------------*/
374
375 static float32
376 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
377 {
378 int8 shiftCount;
379
380 shiftCount = countLeadingZeros32( zSig ) - 1;
381 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
382
383 }
384
385 /*----------------------------------------------------------------------------
386 | Returns the fraction bits of the double-precision floating-point value `a'.
387 *----------------------------------------------------------------------------*/
388
389 INLINE uint64_t extractFloat64Frac( float64 a )
390 {
391
392 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
393
394 }
395
396 /*----------------------------------------------------------------------------
397 | Returns the exponent bits of the double-precision floating-point value `a'.
398 *----------------------------------------------------------------------------*/
399
400 INLINE int16 extractFloat64Exp( float64 a )
401 {
402
403 return ( float64_val(a)>>52 ) & 0x7FF;
404
405 }
406
407 /*----------------------------------------------------------------------------
408 | Returns the sign bit of the double-precision floating-point value `a'.
409 *----------------------------------------------------------------------------*/
410
411 INLINE flag extractFloat64Sign( float64 a )
412 {
413
414 return float64_val(a)>>63;
415
416 }
417
418 /*----------------------------------------------------------------------------
419 | If `a' is denormal and we are in flush-to-zero mode then set the
420 | input-denormal exception and return zero. Otherwise just return the value.
421 *----------------------------------------------------------------------------*/
422 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
423 {
424 if (STATUS(flush_inputs_to_zero)) {
425 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
426 float_raise(float_flag_input_denormal STATUS_VAR);
427 return make_float64(float64_val(a) & (1ULL << 63));
428 }
429 }
430 return a;
431 }
432
433 /*----------------------------------------------------------------------------
434 | Normalizes the subnormal double-precision floating-point value represented
435 | by the denormalized significand `aSig'. The normalized exponent and
436 | significand are stored at the locations pointed to by `zExpPtr' and
437 | `zSigPtr', respectively.
438 *----------------------------------------------------------------------------*/
439
440 static void
441 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
442 {
443 int8 shiftCount;
444
445 shiftCount = countLeadingZeros64( aSig ) - 11;
446 *zSigPtr = aSig<<shiftCount;
447 *zExpPtr = 1 - shiftCount;
448
449 }
450
451 /*----------------------------------------------------------------------------
452 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
453 | double-precision floating-point value, returning the result. After being
454 | shifted into the proper positions, the three fields are simply added
455 | together to form the result. This means that any integer portion of `zSig'
456 | will be added into the exponent. Since a properly normalized significand
457 | will have an integer portion equal to 1, the `zExp' input should be 1 less
458 | than the desired result exponent whenever `zSig' is a complete, normalized
459 | significand.
460 *----------------------------------------------------------------------------*/
461
462 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
463 {
464
465 return make_float64(
466 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
467
468 }
469
470 /*----------------------------------------------------------------------------
471 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
472 | and significand `zSig', and returns the proper double-precision floating-
473 | point value corresponding to the abstract input. Ordinarily, the abstract
474 | value is simply rounded and packed into the double-precision format, with
475 | the inexact exception raised if the abstract input cannot be represented
476 | exactly. However, if the abstract value is too large, the overflow and
477 | inexact exceptions are raised and an infinity or maximal finite value is
478 | returned. If the abstract value is too small, the input value is rounded
479 | to a subnormal number, and the underflow and inexact exceptions are raised
480 | if the abstract input cannot be represented exactly as a subnormal double-
481 | precision floating-point number.
482 | The input significand `zSig' has its binary point between bits 62
483 | and 61, which is 10 bits to the left of the usual location. This shifted
484 | significand must be normalized or smaller. If `zSig' is not normalized,
485 | `zExp' must be 0; in that case, the result returned is a subnormal number,
486 | and it must not require rounding. In the usual case that `zSig' is
487 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
488 | The handling of underflow and overflow follows the IEC/IEEE Standard for
489 | Binary Floating-Point Arithmetic.
490 *----------------------------------------------------------------------------*/
491
492 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
493 {
494 int8 roundingMode;
495 flag roundNearestEven;
496 int16 roundIncrement, roundBits;
497 flag isTiny;
498
499 roundingMode = STATUS(float_rounding_mode);
500 roundNearestEven = ( roundingMode == float_round_nearest_even );
501 roundIncrement = 0x200;
502 if ( ! roundNearestEven ) {
503 if ( roundingMode == float_round_to_zero ) {
504 roundIncrement = 0;
505 }
506 else {
507 roundIncrement = 0x3FF;
508 if ( zSign ) {
509 if ( roundingMode == float_round_up ) roundIncrement = 0;
510 }
511 else {
512 if ( roundingMode == float_round_down ) roundIncrement = 0;
513 }
514 }
515 }
516 roundBits = zSig & 0x3FF;
517 if ( 0x7FD <= (uint16_t) zExp ) {
518 if ( ( 0x7FD < zExp )
519 || ( ( zExp == 0x7FD )
520 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
521 ) {
522 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
523 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
524 }
525 if ( zExp < 0 ) {
526 if (STATUS(flush_to_zero)) {
527 float_raise(float_flag_output_denormal STATUS_VAR);
528 return packFloat64(zSign, 0, 0);
529 }
530 isTiny =
531 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
532 || ( zExp < -1 )
533 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
534 shift64RightJamming( zSig, - zExp, &zSig );
535 zExp = 0;
536 roundBits = zSig & 0x3FF;
537 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
538 }
539 }
540 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
541 zSig = ( zSig + roundIncrement )>>10;
542 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
543 if ( zSig == 0 ) zExp = 0;
544 return packFloat64( zSign, zExp, zSig );
545
546 }
547
548 /*----------------------------------------------------------------------------
549 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
550 | and significand `zSig', and returns the proper double-precision floating-
551 | point value corresponding to the abstract input. This routine is just like
552 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
553 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
554 | floating-point exponent.
555 *----------------------------------------------------------------------------*/
556
557 static float64
558 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
559 {
560 int8 shiftCount;
561
562 shiftCount = countLeadingZeros64( zSig ) - 1;
563 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
564
565 }
566
567 #ifdef FLOATX80
568
569 /*----------------------------------------------------------------------------
570 | Returns the fraction bits of the extended double-precision floating-point
571 | value `a'.
572 *----------------------------------------------------------------------------*/
573
574 INLINE uint64_t extractFloatx80Frac( floatx80 a )
575 {
576
577 return a.low;
578
579 }
580
581 /*----------------------------------------------------------------------------
582 | Returns the exponent bits of the extended double-precision floating-point
583 | value `a'.
584 *----------------------------------------------------------------------------*/
585
586 INLINE int32 extractFloatx80Exp( floatx80 a )
587 {
588
589 return a.high & 0x7FFF;
590
591 }
592
593 /*----------------------------------------------------------------------------
594 | Returns the sign bit of the extended double-precision floating-point value
595 | `a'.
596 *----------------------------------------------------------------------------*/
597
598 INLINE flag extractFloatx80Sign( floatx80 a )
599 {
600
601 return a.high>>15;
602
603 }
604
605 /*----------------------------------------------------------------------------
606 | Normalizes the subnormal extended double-precision floating-point value
607 | represented by the denormalized significand `aSig'. The normalized exponent
608 | and significand are stored at the locations pointed to by `zExpPtr' and
609 | `zSigPtr', respectively.
610 *----------------------------------------------------------------------------*/
611
612 static void
613 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
614 {
615 int8 shiftCount;
616
617 shiftCount = countLeadingZeros64( aSig );
618 *zSigPtr = aSig<<shiftCount;
619 *zExpPtr = 1 - shiftCount;
620
621 }
622
623 /*----------------------------------------------------------------------------
624 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
625 | extended double-precision floating-point value, returning the result.
626 *----------------------------------------------------------------------------*/
627
628 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
629 {
630 floatx80 z;
631
632 z.low = zSig;
633 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
634 return z;
635
636 }
637
638 /*----------------------------------------------------------------------------
639 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
640 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
641 | and returns the proper extended double-precision floating-point value
642 | corresponding to the abstract input. Ordinarily, the abstract value is
643 | rounded and packed into the extended double-precision format, with the
644 | inexact exception raised if the abstract input cannot be represented
645 | exactly. However, if the abstract value is too large, the overflow and
646 | inexact exceptions are raised and an infinity or maximal finite value is
647 | returned. If the abstract value is too small, the input value is rounded to
648 | a subnormal number, and the underflow and inexact exceptions are raised if
649 | the abstract input cannot be represented exactly as a subnormal extended
650 | double-precision floating-point number.
651 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
652 | number of bits as single or double precision, respectively. Otherwise, the
653 | result is rounded to the full precision of the extended double-precision
654 | format.
655 | The input significand must be normalized or smaller. If the input
656 | significand is not normalized, `zExp' must be 0; in that case, the result
657 | returned is a subnormal number, and it must not require rounding. The
658 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
659 | Floating-Point Arithmetic.
660 *----------------------------------------------------------------------------*/
661
662 static floatx80
663 roundAndPackFloatx80(
664 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
665 STATUS_PARAM)
666 {
667 int8 roundingMode;
668 flag roundNearestEven, increment, isTiny;
669 int64 roundIncrement, roundMask, roundBits;
670
671 roundingMode = STATUS(float_rounding_mode);
672 roundNearestEven = ( roundingMode == float_round_nearest_even );
673 if ( roundingPrecision == 80 ) goto precision80;
674 if ( roundingPrecision == 64 ) {
675 roundIncrement = LIT64( 0x0000000000000400 );
676 roundMask = LIT64( 0x00000000000007FF );
677 }
678 else if ( roundingPrecision == 32 ) {
679 roundIncrement = LIT64( 0x0000008000000000 );
680 roundMask = LIT64( 0x000000FFFFFFFFFF );
681 }
682 else {
683 goto precision80;
684 }
685 zSig0 |= ( zSig1 != 0 );
686 if ( ! roundNearestEven ) {
687 if ( roundingMode == float_round_to_zero ) {
688 roundIncrement = 0;
689 }
690 else {
691 roundIncrement = roundMask;
692 if ( zSign ) {
693 if ( roundingMode == float_round_up ) roundIncrement = 0;
694 }
695 else {
696 if ( roundingMode == float_round_down ) roundIncrement = 0;
697 }
698 }
699 }
700 roundBits = zSig0 & roundMask;
701 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
702 if ( ( 0x7FFE < zExp )
703 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
704 ) {
705 goto overflow;
706 }
707 if ( zExp <= 0 ) {
708 if (STATUS(flush_to_zero)) {
709 float_raise(float_flag_output_denormal STATUS_VAR);
710 return packFloatx80(zSign, 0, 0);
711 }
712 isTiny =
713 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
714 || ( zExp < 0 )
715 || ( zSig0 <= zSig0 + roundIncrement );
716 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
717 zExp = 0;
718 roundBits = zSig0 & roundMask;
719 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
720 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
721 zSig0 += roundIncrement;
722 if ( (int64_t) zSig0 < 0 ) zExp = 1;
723 roundIncrement = roundMask + 1;
724 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
725 roundMask |= roundIncrement;
726 }
727 zSig0 &= ~ roundMask;
728 return packFloatx80( zSign, zExp, zSig0 );
729 }
730 }
731 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
732 zSig0 += roundIncrement;
733 if ( zSig0 < roundIncrement ) {
734 ++zExp;
735 zSig0 = LIT64( 0x8000000000000000 );
736 }
737 roundIncrement = roundMask + 1;
738 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
739 roundMask |= roundIncrement;
740 }
741 zSig0 &= ~ roundMask;
742 if ( zSig0 == 0 ) zExp = 0;
743 return packFloatx80( zSign, zExp, zSig0 );
744 precision80:
745 increment = ( (int64_t) zSig1 < 0 );
746 if ( ! roundNearestEven ) {
747 if ( roundingMode == float_round_to_zero ) {
748 increment = 0;
749 }
750 else {
751 if ( zSign ) {
752 increment = ( roundingMode == float_round_down ) && zSig1;
753 }
754 else {
755 increment = ( roundingMode == float_round_up ) && zSig1;
756 }
757 }
758 }
759 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
760 if ( ( 0x7FFE < zExp )
761 || ( ( zExp == 0x7FFE )
762 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
763 && increment
764 )
765 ) {
766 roundMask = 0;
767 overflow:
768 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
769 if ( ( roundingMode == float_round_to_zero )
770 || ( zSign && ( roundingMode == float_round_up ) )
771 || ( ! zSign && ( roundingMode == float_round_down ) )
772 ) {
773 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
774 }
775 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
776 }
777 if ( zExp <= 0 ) {
778 isTiny =
779 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
780 || ( zExp < 0 )
781 || ! increment
782 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
783 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
784 zExp = 0;
785 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
786 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
787 if ( roundNearestEven ) {
788 increment = ( (int64_t) zSig1 < 0 );
789 }
790 else {
791 if ( zSign ) {
792 increment = ( roundingMode == float_round_down ) && zSig1;
793 }
794 else {
795 increment = ( roundingMode == float_round_up ) && zSig1;
796 }
797 }
798 if ( increment ) {
799 ++zSig0;
800 zSig0 &=
801 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
802 if ( (int64_t) zSig0 < 0 ) zExp = 1;
803 }
804 return packFloatx80( zSign, zExp, zSig0 );
805 }
806 }
807 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
808 if ( increment ) {
809 ++zSig0;
810 if ( zSig0 == 0 ) {
811 ++zExp;
812 zSig0 = LIT64( 0x8000000000000000 );
813 }
814 else {
815 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
816 }
817 }
818 else {
819 if ( zSig0 == 0 ) zExp = 0;
820 }
821 return packFloatx80( zSign, zExp, zSig0 );
822
823 }
824
825 /*----------------------------------------------------------------------------
826 | Takes an abstract floating-point value having sign `zSign', exponent
827 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
828 | and returns the proper extended double-precision floating-point value
829 | corresponding to the abstract input. This routine is just like
830 | `roundAndPackFloatx80' except that the input significand does not have to be
831 | normalized.
832 *----------------------------------------------------------------------------*/
833
834 static floatx80
835 normalizeRoundAndPackFloatx80(
836 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
837 STATUS_PARAM)
838 {
839 int8 shiftCount;
840
841 if ( zSig0 == 0 ) {
842 zSig0 = zSig1;
843 zSig1 = 0;
844 zExp -= 64;
845 }
846 shiftCount = countLeadingZeros64( zSig0 );
847 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
848 zExp -= shiftCount;
849 return
850 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
851
852 }
853
854 #endif
855
856 #ifdef FLOAT128
857
858 /*----------------------------------------------------------------------------
859 | Returns the least-significant 64 fraction bits of the quadruple-precision
860 | floating-point value `a'.
861 *----------------------------------------------------------------------------*/
862
863 INLINE uint64_t extractFloat128Frac1( float128 a )
864 {
865
866 return a.low;
867
868 }
869
870 /*----------------------------------------------------------------------------
871 | Returns the most-significant 48 fraction bits of the quadruple-precision
872 | floating-point value `a'.
873 *----------------------------------------------------------------------------*/
874
875 INLINE uint64_t extractFloat128Frac0( float128 a )
876 {
877
878 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
879
880 }
881
882 /*----------------------------------------------------------------------------
883 | Returns the exponent bits of the quadruple-precision floating-point value
884 | `a'.
885 *----------------------------------------------------------------------------*/
886
887 INLINE int32 extractFloat128Exp( float128 a )
888 {
889
890 return ( a.high>>48 ) & 0x7FFF;
891
892 }
893
894 /*----------------------------------------------------------------------------
895 | Returns the sign bit of the quadruple-precision floating-point value `a'.
896 *----------------------------------------------------------------------------*/
897
898 INLINE flag extractFloat128Sign( float128 a )
899 {
900
901 return a.high>>63;
902
903 }
904
905 /*----------------------------------------------------------------------------
906 | Normalizes the subnormal quadruple-precision floating-point value
907 | represented by the denormalized significand formed by the concatenation of
908 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
909 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
910 | significand are stored at the location pointed to by `zSig0Ptr', and the
911 | least significant 64 bits of the normalized significand are stored at the
912 | location pointed to by `zSig1Ptr'.
913 *----------------------------------------------------------------------------*/
914
915 static void
916 normalizeFloat128Subnormal(
917 uint64_t aSig0,
918 uint64_t aSig1,
919 int32 *zExpPtr,
920 uint64_t *zSig0Ptr,
921 uint64_t *zSig1Ptr
922 )
923 {
924 int8 shiftCount;
925
926 if ( aSig0 == 0 ) {
927 shiftCount = countLeadingZeros64( aSig1 ) - 15;
928 if ( shiftCount < 0 ) {
929 *zSig0Ptr = aSig1>>( - shiftCount );
930 *zSig1Ptr = aSig1<<( shiftCount & 63 );
931 }
932 else {
933 *zSig0Ptr = aSig1<<shiftCount;
934 *zSig1Ptr = 0;
935 }
936 *zExpPtr = - shiftCount - 63;
937 }
938 else {
939 shiftCount = countLeadingZeros64( aSig0 ) - 15;
940 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
941 *zExpPtr = 1 - shiftCount;
942 }
943
944 }
945
946 /*----------------------------------------------------------------------------
947 | Packs the sign `zSign', the exponent `zExp', and the significand formed
948 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
949 | floating-point value, returning the result. After being shifted into the
950 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
951 | added together to form the most significant 32 bits of the result. This
952 | means that any integer portion of `zSig0' will be added into the exponent.
953 | Since a properly normalized significand will have an integer portion equal
954 | to 1, the `zExp' input should be 1 less than the desired result exponent
955 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
956 | significand.
957 *----------------------------------------------------------------------------*/
958
959 INLINE float128
960 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
961 {
962 float128 z;
963
964 z.low = zSig1;
965 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
966 return z;
967
968 }
969
970 /*----------------------------------------------------------------------------
971 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
972 | and extended significand formed by the concatenation of `zSig0', `zSig1',
973 | and `zSig2', and returns the proper quadruple-precision floating-point value
974 | corresponding to the abstract input. Ordinarily, the abstract value is
975 | simply rounded and packed into the quadruple-precision format, with the
976 | inexact exception raised if the abstract input cannot be represented
977 | exactly. However, if the abstract value is too large, the overflow and
978 | inexact exceptions are raised and an infinity or maximal finite value is
979 | returned. If the abstract value is too small, the input value is rounded to
980 | a subnormal number, and the underflow and inexact exceptions are raised if
981 | the abstract input cannot be represented exactly as a subnormal quadruple-
982 | precision floating-point number.
983 | The input significand must be normalized or smaller. If the input
984 | significand is not normalized, `zExp' must be 0; in that case, the result
985 | returned is a subnormal number, and it must not require rounding. In the
986 | usual case that the input significand is normalized, `zExp' must be 1 less
987 | than the ``true'' floating-point exponent. The handling of underflow and
988 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
989 *----------------------------------------------------------------------------*/
990
991 static float128
992 roundAndPackFloat128(
993 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
994 {
995 int8 roundingMode;
996 flag roundNearestEven, increment, isTiny;
997
998 roundingMode = STATUS(float_rounding_mode);
999 roundNearestEven = ( roundingMode == float_round_nearest_even );
1000 increment = ( (int64_t) zSig2 < 0 );
1001 if ( ! roundNearestEven ) {
1002 if ( roundingMode == float_round_to_zero ) {
1003 increment = 0;
1004 }
1005 else {
1006 if ( zSign ) {
1007 increment = ( roundingMode == float_round_down ) && zSig2;
1008 }
1009 else {
1010 increment = ( roundingMode == float_round_up ) && zSig2;
1011 }
1012 }
1013 }
1014 if ( 0x7FFD <= (uint32_t) zExp ) {
1015 if ( ( 0x7FFD < zExp )
1016 || ( ( zExp == 0x7FFD )
1017 && eq128(
1018 LIT64( 0x0001FFFFFFFFFFFF ),
1019 LIT64( 0xFFFFFFFFFFFFFFFF ),
1020 zSig0,
1021 zSig1
1022 )
1023 && increment
1024 )
1025 ) {
1026 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1027 if ( ( roundingMode == float_round_to_zero )
1028 || ( zSign && ( roundingMode == float_round_up ) )
1029 || ( ! zSign && ( roundingMode == float_round_down ) )
1030 ) {
1031 return
1032 packFloat128(
1033 zSign,
1034 0x7FFE,
1035 LIT64( 0x0000FFFFFFFFFFFF ),
1036 LIT64( 0xFFFFFFFFFFFFFFFF )
1037 );
1038 }
1039 return packFloat128( zSign, 0x7FFF, 0, 0 );
1040 }
1041 if ( zExp < 0 ) {
1042 if (STATUS(flush_to_zero)) {
1043 float_raise(float_flag_output_denormal STATUS_VAR);
1044 return packFloat128(zSign, 0, 0, 0);
1045 }
1046 isTiny =
1047 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1048 || ( zExp < -1 )
1049 || ! increment
1050 || lt128(
1051 zSig0,
1052 zSig1,
1053 LIT64( 0x0001FFFFFFFFFFFF ),
1054 LIT64( 0xFFFFFFFFFFFFFFFF )
1055 );
1056 shift128ExtraRightJamming(
1057 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1058 zExp = 0;
1059 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1060 if ( roundNearestEven ) {
1061 increment = ( (int64_t) zSig2 < 0 );
1062 }
1063 else {
1064 if ( zSign ) {
1065 increment = ( roundingMode == float_round_down ) && zSig2;
1066 }
1067 else {
1068 increment = ( roundingMode == float_round_up ) && zSig2;
1069 }
1070 }
1071 }
1072 }
1073 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1074 if ( increment ) {
1075 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1076 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1077 }
1078 else {
1079 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1080 }
1081 return packFloat128( zSign, zExp, zSig0, zSig1 );
1082
1083 }
1084
1085 /*----------------------------------------------------------------------------
1086 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1087 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1088 | returns the proper quadruple-precision floating-point value corresponding
1089 | to the abstract input. This routine is just like `roundAndPackFloat128'
1090 | except that the input significand has fewer bits and does not have to be
1091 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1092 | point exponent.
1093 *----------------------------------------------------------------------------*/
1094
1095 static float128
1096 normalizeRoundAndPackFloat128(
1097 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1098 {
1099 int8 shiftCount;
1100 uint64_t zSig2;
1101
1102 if ( zSig0 == 0 ) {
1103 zSig0 = zSig1;
1104 zSig1 = 0;
1105 zExp -= 64;
1106 }
1107 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1108 if ( 0 <= shiftCount ) {
1109 zSig2 = 0;
1110 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1111 }
1112 else {
1113 shift128ExtraRightJamming(
1114 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1115 }
1116 zExp -= shiftCount;
1117 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1118
1119 }
1120
1121 #endif
1122
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the single-precision floating-point format. The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1128
1129 float32 int32_to_float32( int32 a STATUS_PARAM )
1130 {
1131 flag zSign;
1132
1133 if ( a == 0 ) return float32_zero;
1134 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1135 zSign = ( a < 0 );
1136 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1137
1138 }
1139
1140 /*----------------------------------------------------------------------------
1141 | Returns the result of converting the 32-bit two's complement integer `a'
1142 | to the double-precision floating-point format. The conversion is performed
1143 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1144 *----------------------------------------------------------------------------*/
1145
1146 float64 int32_to_float64( int32 a STATUS_PARAM )
1147 {
1148 flag zSign;
1149 uint32 absA;
1150 int8 shiftCount;
1151 uint64_t zSig;
1152
1153 if ( a == 0 ) return float64_zero;
1154 zSign = ( a < 0 );
1155 absA = zSign ? - a : a;
1156 shiftCount = countLeadingZeros32( absA ) + 21;
1157 zSig = absA;
1158 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1159
1160 }
1161
1162 #ifdef FLOATX80
1163
1164 /*----------------------------------------------------------------------------
1165 | Returns the result of converting the 32-bit two's complement integer `a'
1166 | to the extended double-precision floating-point format. The conversion
1167 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1168 | Arithmetic.
1169 *----------------------------------------------------------------------------*/
1170
1171 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1172 {
1173 flag zSign;
1174 uint32 absA;
1175 int8 shiftCount;
1176 uint64_t zSig;
1177
1178 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1179 zSign = ( a < 0 );
1180 absA = zSign ? - a : a;
1181 shiftCount = countLeadingZeros32( absA ) + 32;
1182 zSig = absA;
1183 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1184
1185 }
1186
1187 #endif
1188
1189 #ifdef FLOAT128
1190
1191 /*----------------------------------------------------------------------------
1192 | Returns the result of converting the 32-bit two's complement integer `a' to
1193 | the quadruple-precision floating-point format. The conversion is performed
1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195 *----------------------------------------------------------------------------*/
1196
1197 float128 int32_to_float128( int32 a STATUS_PARAM )
1198 {
1199 flag zSign;
1200 uint32 absA;
1201 int8 shiftCount;
1202 uint64_t zSig0;
1203
1204 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1205 zSign = ( a < 0 );
1206 absA = zSign ? - a : a;
1207 shiftCount = countLeadingZeros32( absA ) + 17;
1208 zSig0 = absA;
1209 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1210
1211 }
1212
1213 #endif
1214
1215 /*----------------------------------------------------------------------------
1216 | Returns the result of converting the 64-bit two's complement integer `a'
1217 | to the single-precision floating-point format. The conversion is performed
1218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 *----------------------------------------------------------------------------*/
1220
1221 float32 int64_to_float32( int64 a STATUS_PARAM )
1222 {
1223 flag zSign;
1224 uint64 absA;
1225 int8 shiftCount;
1226
1227 if ( a == 0 ) return float32_zero;
1228 zSign = ( a < 0 );
1229 absA = zSign ? - a : a;
1230 shiftCount = countLeadingZeros64( absA ) - 40;
1231 if ( 0 <= shiftCount ) {
1232 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1233 }
1234 else {
1235 shiftCount += 7;
1236 if ( shiftCount < 0 ) {
1237 shift64RightJamming( absA, - shiftCount, &absA );
1238 }
1239 else {
1240 absA <<= shiftCount;
1241 }
1242 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1243 }
1244
1245 }
1246
1247 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1248 {
1249 int8 shiftCount;
1250
1251 if ( a == 0 ) return float32_zero;
1252 shiftCount = countLeadingZeros64( a ) - 40;
1253 if ( 0 <= shiftCount ) {
1254 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1255 }
1256 else {
1257 shiftCount += 7;
1258 if ( shiftCount < 0 ) {
1259 shift64RightJamming( a, - shiftCount, &a );
1260 }
1261 else {
1262 a <<= shiftCount;
1263 }
1264 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1265 }
1266 }
1267
1268 /*----------------------------------------------------------------------------
1269 | Returns the result of converting the 64-bit two's complement integer `a'
1270 | to the double-precision floating-point format. The conversion is performed
1271 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1272 *----------------------------------------------------------------------------*/
1273
1274 float64 int64_to_float64( int64 a STATUS_PARAM )
1275 {
1276 flag zSign;
1277
1278 if ( a == 0 ) return float64_zero;
1279 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1280 return packFloat64( 1, 0x43E, 0 );
1281 }
1282 zSign = ( a < 0 );
1283 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1284
1285 }
1286
1287 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1288 {
1289 if ( a == 0 ) return float64_zero;
1290 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1291
1292 }
1293
1294 #ifdef FLOATX80
1295
1296 /*----------------------------------------------------------------------------
1297 | Returns the result of converting the 64-bit two's complement integer `a'
1298 | to the extended double-precision floating-point format. The conversion
1299 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1300 | Arithmetic.
1301 *----------------------------------------------------------------------------*/
1302
1303 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1304 {
1305 flag zSign;
1306 uint64 absA;
1307 int8 shiftCount;
1308
1309 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1310 zSign = ( a < 0 );
1311 absA = zSign ? - a : a;
1312 shiftCount = countLeadingZeros64( absA );
1313 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1314
1315 }
1316
1317 #endif
1318
1319 #ifdef FLOAT128
1320
1321 /*----------------------------------------------------------------------------
1322 | Returns the result of converting the 64-bit two's complement integer `a' to
1323 | the quadruple-precision floating-point format. The conversion is performed
1324 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1325 *----------------------------------------------------------------------------*/
1326
1327 float128 int64_to_float128( int64 a STATUS_PARAM )
1328 {
1329 flag zSign;
1330 uint64 absA;
1331 int8 shiftCount;
1332 int32 zExp;
1333 uint64_t zSig0, zSig1;
1334
1335 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1336 zSign = ( a < 0 );
1337 absA = zSign ? - a : a;
1338 shiftCount = countLeadingZeros64( absA ) + 49;
1339 zExp = 0x406E - shiftCount;
1340 if ( 64 <= shiftCount ) {
1341 zSig1 = 0;
1342 zSig0 = absA;
1343 shiftCount -= 64;
1344 }
1345 else {
1346 zSig1 = absA;
1347 zSig0 = 0;
1348 }
1349 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1350 return packFloat128( zSign, zExp, zSig0, zSig1 );
1351
1352 }
1353
1354 #endif
1355
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 32-bit two's complement integer format. The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode. If `a' is a NaN, the largest
1362 | positive integer is returned. Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1365
1366 int32 float32_to_int32( float32 a STATUS_PARAM )
1367 {
1368 flag aSign;
1369 int16 aExp, shiftCount;
1370 uint32_t aSig;
1371 uint64_t aSig64;
1372
1373 a = float32_squash_input_denormal(a STATUS_VAR);
1374 aSig = extractFloat32Frac( a );
1375 aExp = extractFloat32Exp( a );
1376 aSign = extractFloat32Sign( a );
1377 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1378 if ( aExp ) aSig |= 0x00800000;
1379 shiftCount = 0xAF - aExp;
1380 aSig64 = aSig;
1381 aSig64 <<= 32;
1382 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1383 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1384
1385 }
1386
1387 /*----------------------------------------------------------------------------
1388 | Returns the result of converting the single-precision floating-point value
1389 | `a' to the 32-bit two's complement integer format. The conversion is
1390 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1391 | Arithmetic, except that the conversion is always rounded toward zero.
1392 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1393 | the conversion overflows, the largest integer with the same sign as `a' is
1394 | returned.
1395 *----------------------------------------------------------------------------*/
1396
1397 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1398 {
1399 flag aSign;
1400 int16 aExp, shiftCount;
1401 uint32_t aSig;
1402 int32 z;
1403 a = float32_squash_input_denormal(a STATUS_VAR);
1404
1405 aSig = extractFloat32Frac( a );
1406 aExp = extractFloat32Exp( a );
1407 aSign = extractFloat32Sign( a );
1408 shiftCount = aExp - 0x9E;
1409 if ( 0 <= shiftCount ) {
1410 if ( float32_val(a) != 0xCF000000 ) {
1411 float_raise( float_flag_invalid STATUS_VAR);
1412 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1413 }
1414 return (int32_t) 0x80000000;
1415 }
1416 else if ( aExp <= 0x7E ) {
1417 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1418 return 0;
1419 }
1420 aSig = ( aSig | 0x00800000 )<<8;
1421 z = aSig>>( - shiftCount );
1422 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1423 STATUS(float_exception_flags) |= float_flag_inexact;
1424 }
1425 if ( aSign ) z = - z;
1426 return z;
1427
1428 }
1429
1430 /*----------------------------------------------------------------------------
1431 | Returns the result of converting the single-precision floating-point value
1432 | `a' to the 16-bit two's complement integer format. The conversion is
1433 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1434 | Arithmetic, except that the conversion is always rounded toward zero.
1435 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1436 | the conversion overflows, the largest integer with the same sign as `a' is
1437 | returned.
1438 *----------------------------------------------------------------------------*/
1439
1440 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1441 {
1442 flag aSign;
1443 int16 aExp, shiftCount;
1444 uint32_t aSig;
1445 int32 z;
1446
1447 aSig = extractFloat32Frac( a );
1448 aExp = extractFloat32Exp( a );
1449 aSign = extractFloat32Sign( a );
1450 shiftCount = aExp - 0x8E;
1451 if ( 0 <= shiftCount ) {
1452 if ( float32_val(a) != 0xC7000000 ) {
1453 float_raise( float_flag_invalid STATUS_VAR);
1454 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1455 return 0x7FFF;
1456 }
1457 }
1458 return (int32_t) 0xffff8000;
1459 }
1460 else if ( aExp <= 0x7E ) {
1461 if ( aExp | aSig ) {
1462 STATUS(float_exception_flags) |= float_flag_inexact;
1463 }
1464 return 0;
1465 }
1466 shiftCount -= 0x10;
1467 aSig = ( aSig | 0x00800000 )<<8;
1468 z = aSig>>( - shiftCount );
1469 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1470 STATUS(float_exception_flags) |= float_flag_inexact;
1471 }
1472 if ( aSign ) {
1473 z = - z;
1474 }
1475 return z;
1476
1477 }
1478
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of converting the single-precision floating-point value
1481 | `a' to the 64-bit two's complement integer format. The conversion is
1482 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1483 | Arithmetic---which means in particular that the conversion is rounded
1484 | according to the current rounding mode. If `a' is a NaN, the largest
1485 | positive integer is returned. Otherwise, if the conversion overflows, the
1486 | largest integer with the same sign as `a' is returned.
1487 *----------------------------------------------------------------------------*/
1488
1489 int64 float32_to_int64( float32 a STATUS_PARAM )
1490 {
1491 flag aSign;
1492 int16 aExp, shiftCount;
1493 uint32_t aSig;
1494 uint64_t aSig64, aSigExtra;
1495 a = float32_squash_input_denormal(a STATUS_VAR);
1496
1497 aSig = extractFloat32Frac( a );
1498 aExp = extractFloat32Exp( a );
1499 aSign = extractFloat32Sign( a );
1500 shiftCount = 0xBE - aExp;
1501 if ( shiftCount < 0 ) {
1502 float_raise( float_flag_invalid STATUS_VAR);
1503 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1504 return LIT64( 0x7FFFFFFFFFFFFFFF );
1505 }
1506 return (int64_t) LIT64( 0x8000000000000000 );
1507 }
1508 if ( aExp ) aSig |= 0x00800000;
1509 aSig64 = aSig;
1510 aSig64 <<= 40;
1511 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1512 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1513
1514 }
1515
1516 /*----------------------------------------------------------------------------
1517 | Returns the result of converting the single-precision floating-point value
1518 | `a' to the 64-bit two's complement integer format. The conversion is
1519 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1520 | Arithmetic, except that the conversion is always rounded toward zero. If
1521 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1522 | conversion overflows, the largest integer with the same sign as `a' is
1523 | returned.
1524 *----------------------------------------------------------------------------*/
1525
1526 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1527 {
1528 flag aSign;
1529 int16 aExp, shiftCount;
1530 uint32_t aSig;
1531 uint64_t aSig64;
1532 int64 z;
1533 a = float32_squash_input_denormal(a STATUS_VAR);
1534
1535 aSig = extractFloat32Frac( a );
1536 aExp = extractFloat32Exp( a );
1537 aSign = extractFloat32Sign( a );
1538 shiftCount = aExp - 0xBE;
1539 if ( 0 <= shiftCount ) {
1540 if ( float32_val(a) != 0xDF000000 ) {
1541 float_raise( float_flag_invalid STATUS_VAR);
1542 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1543 return LIT64( 0x7FFFFFFFFFFFFFFF );
1544 }
1545 }
1546 return (int64_t) LIT64( 0x8000000000000000 );
1547 }
1548 else if ( aExp <= 0x7E ) {
1549 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1550 return 0;
1551 }
1552 aSig64 = aSig | 0x00800000;
1553 aSig64 <<= 40;
1554 z = aSig64>>( - shiftCount );
1555 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1556 STATUS(float_exception_flags) |= float_flag_inexact;
1557 }
1558 if ( aSign ) z = - z;
1559 return z;
1560
1561 }
1562
1563 /*----------------------------------------------------------------------------
1564 | Returns the result of converting the single-precision floating-point value
1565 | `a' to the double-precision floating-point format. The conversion is
1566 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1567 | Arithmetic.
1568 *----------------------------------------------------------------------------*/
1569
1570 float64 float32_to_float64( float32 a STATUS_PARAM )
1571 {
1572 flag aSign;
1573 int16 aExp;
1574 uint32_t aSig;
1575 a = float32_squash_input_denormal(a STATUS_VAR);
1576
1577 aSig = extractFloat32Frac( a );
1578 aExp = extractFloat32Exp( a );
1579 aSign = extractFloat32Sign( a );
1580 if ( aExp == 0xFF ) {
1581 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1582 return packFloat64( aSign, 0x7FF, 0 );
1583 }
1584 if ( aExp == 0 ) {
1585 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1586 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1587 --aExp;
1588 }
1589 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1590
1591 }
1592
1593 #ifdef FLOATX80
1594
1595 /*----------------------------------------------------------------------------
1596 | Returns the result of converting the single-precision floating-point value
1597 | `a' to the extended double-precision floating-point format. The conversion
1598 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1599 | Arithmetic.
1600 *----------------------------------------------------------------------------*/
1601
1602 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1603 {
1604 flag aSign;
1605 int16 aExp;
1606 uint32_t aSig;
1607
1608 a = float32_squash_input_denormal(a STATUS_VAR);
1609 aSig = extractFloat32Frac( a );
1610 aExp = extractFloat32Exp( a );
1611 aSign = extractFloat32Sign( a );
1612 if ( aExp == 0xFF ) {
1613 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1614 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1615 }
1616 if ( aExp == 0 ) {
1617 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1618 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1619 }
1620 aSig |= 0x00800000;
1621 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1622
1623 }
1624
1625 #endif
1626
1627 #ifdef FLOAT128
1628
1629 /*----------------------------------------------------------------------------
1630 | Returns the result of converting the single-precision floating-point value
1631 | `a' to the double-precision floating-point format. The conversion is
1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1633 | Arithmetic.
1634 *----------------------------------------------------------------------------*/
1635
1636 float128 float32_to_float128( float32 a STATUS_PARAM )
1637 {
1638 flag aSign;
1639 int16 aExp;
1640 uint32_t aSig;
1641
1642 a = float32_squash_input_denormal(a STATUS_VAR);
1643 aSig = extractFloat32Frac( a );
1644 aExp = extractFloat32Exp( a );
1645 aSign = extractFloat32Sign( a );
1646 if ( aExp == 0xFF ) {
1647 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1648 return packFloat128( aSign, 0x7FFF, 0, 0 );
1649 }
1650 if ( aExp == 0 ) {
1651 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1652 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1653 --aExp;
1654 }
1655 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1656
1657 }
1658
1659 #endif
1660
1661 /*----------------------------------------------------------------------------
1662 | Rounds the single-precision floating-point value `a' to an integer, and
1663 | returns the result as a single-precision floating-point value. The
1664 | operation is performed according to the IEC/IEEE Standard for Binary
1665 | Floating-Point Arithmetic.
1666 *----------------------------------------------------------------------------*/
1667
1668 float32 float32_round_to_int( float32 a STATUS_PARAM)
1669 {
1670 flag aSign;
1671 int16 aExp;
1672 uint32_t lastBitMask, roundBitsMask;
1673 int8 roundingMode;
1674 uint32_t z;
1675 a = float32_squash_input_denormal(a STATUS_VAR);
1676
1677 aExp = extractFloat32Exp( a );
1678 if ( 0x96 <= aExp ) {
1679 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1680 return propagateFloat32NaN( a, a STATUS_VAR );
1681 }
1682 return a;
1683 }
1684 if ( aExp <= 0x7E ) {
1685 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1686 STATUS(float_exception_flags) |= float_flag_inexact;
1687 aSign = extractFloat32Sign( a );
1688 switch ( STATUS(float_rounding_mode) ) {
1689 case float_round_nearest_even:
1690 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1691 return packFloat32( aSign, 0x7F, 0 );
1692 }
1693 break;
1694 case float_round_down:
1695 return make_float32(aSign ? 0xBF800000 : 0);
1696 case float_round_up:
1697 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1698 }
1699 return packFloat32( aSign, 0, 0 );
1700 }
1701 lastBitMask = 1;
1702 lastBitMask <<= 0x96 - aExp;
1703 roundBitsMask = lastBitMask - 1;
1704 z = float32_val(a);
1705 roundingMode = STATUS(float_rounding_mode);
1706 if ( roundingMode == float_round_nearest_even ) {
1707 z += lastBitMask>>1;
1708 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1709 }
1710 else if ( roundingMode != float_round_to_zero ) {
1711 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1712 z += roundBitsMask;
1713 }
1714 }
1715 z &= ~ roundBitsMask;
1716 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1717 return make_float32(z);
1718
1719 }
1720
1721 /*----------------------------------------------------------------------------
1722 | Returns the result of adding the absolute values of the single-precision
1723 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1724 | before being returned. `zSign' is ignored if the result is a NaN.
1725 | The addition is performed according to the IEC/IEEE Standard for Binary
1726 | Floating-Point Arithmetic.
1727 *----------------------------------------------------------------------------*/
1728
1729 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1730 {
1731 int16 aExp, bExp, zExp;
1732 uint32_t aSig, bSig, zSig;
1733 int16 expDiff;
1734
1735 aSig = extractFloat32Frac( a );
1736 aExp = extractFloat32Exp( a );
1737 bSig = extractFloat32Frac( b );
1738 bExp = extractFloat32Exp( b );
1739 expDiff = aExp - bExp;
1740 aSig <<= 6;
1741 bSig <<= 6;
1742 if ( 0 < expDiff ) {
1743 if ( aExp == 0xFF ) {
1744 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1745 return a;
1746 }
1747 if ( bExp == 0 ) {
1748 --expDiff;
1749 }
1750 else {
1751 bSig |= 0x20000000;
1752 }
1753 shift32RightJamming( bSig, expDiff, &bSig );
1754 zExp = aExp;
1755 }
1756 else if ( expDiff < 0 ) {
1757 if ( bExp == 0xFF ) {
1758 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1759 return packFloat32( zSign, 0xFF, 0 );
1760 }
1761 if ( aExp == 0 ) {
1762 ++expDiff;
1763 }
1764 else {
1765 aSig |= 0x20000000;
1766 }
1767 shift32RightJamming( aSig, - expDiff, &aSig );
1768 zExp = bExp;
1769 }
1770 else {
1771 if ( aExp == 0xFF ) {
1772 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1773 return a;
1774 }
1775 if ( aExp == 0 ) {
1776 if (STATUS(flush_to_zero)) {
1777 if (aSig | bSig) {
1778 float_raise(float_flag_output_denormal STATUS_VAR);
1779 }
1780 return packFloat32(zSign, 0, 0);
1781 }
1782 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1783 }
1784 zSig = 0x40000000 + aSig + bSig;
1785 zExp = aExp;
1786 goto roundAndPack;
1787 }
1788 aSig |= 0x20000000;
1789 zSig = ( aSig + bSig )<<1;
1790 --zExp;
1791 if ( (int32_t) zSig < 0 ) {
1792 zSig = aSig + bSig;
1793 ++zExp;
1794 }
1795 roundAndPack:
1796 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1797
1798 }
1799
1800 /*----------------------------------------------------------------------------
1801 | Returns the result of subtracting the absolute values of the single-
1802 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1803 | difference is negated before being returned. `zSign' is ignored if the
1804 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1805 | Standard for Binary Floating-Point Arithmetic.
1806 *----------------------------------------------------------------------------*/
1807
1808 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1809 {
1810 int16 aExp, bExp, zExp;
1811 uint32_t aSig, bSig, zSig;
1812 int16 expDiff;
1813
1814 aSig = extractFloat32Frac( a );
1815 aExp = extractFloat32Exp( a );
1816 bSig = extractFloat32Frac( b );
1817 bExp = extractFloat32Exp( b );
1818 expDiff = aExp - bExp;
1819 aSig <<= 7;
1820 bSig <<= 7;
1821 if ( 0 < expDiff ) goto aExpBigger;
1822 if ( expDiff < 0 ) goto bExpBigger;
1823 if ( aExp == 0xFF ) {
1824 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1825 float_raise( float_flag_invalid STATUS_VAR);
1826 return float32_default_nan;
1827 }
1828 if ( aExp == 0 ) {
1829 aExp = 1;
1830 bExp = 1;
1831 }
1832 if ( bSig < aSig ) goto aBigger;
1833 if ( aSig < bSig ) goto bBigger;
1834 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1835 bExpBigger:
1836 if ( bExp == 0xFF ) {
1837 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1838 return packFloat32( zSign ^ 1, 0xFF, 0 );
1839 }
1840 if ( aExp == 0 ) {
1841 ++expDiff;
1842 }
1843 else {
1844 aSig |= 0x40000000;
1845 }
1846 shift32RightJamming( aSig, - expDiff, &aSig );
1847 bSig |= 0x40000000;
1848 bBigger:
1849 zSig = bSig - aSig;
1850 zExp = bExp;
1851 zSign ^= 1;
1852 goto normalizeRoundAndPack;
1853 aExpBigger:
1854 if ( aExp == 0xFF ) {
1855 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1856 return a;
1857 }
1858 if ( bExp == 0 ) {
1859 --expDiff;
1860 }
1861 else {
1862 bSig |= 0x40000000;
1863 }
1864 shift32RightJamming( bSig, expDiff, &bSig );
1865 aSig |= 0x40000000;
1866 aBigger:
1867 zSig = aSig - bSig;
1868 zExp = aExp;
1869 normalizeRoundAndPack:
1870 --zExp;
1871 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1872
1873 }
1874
1875 /*----------------------------------------------------------------------------
1876 | Returns the result of adding the single-precision floating-point values `a'
1877 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1878 | Binary Floating-Point Arithmetic.
1879 *----------------------------------------------------------------------------*/
1880
1881 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1882 {
1883 flag aSign, bSign;
1884 a = float32_squash_input_denormal(a STATUS_VAR);
1885 b = float32_squash_input_denormal(b STATUS_VAR);
1886
1887 aSign = extractFloat32Sign( a );
1888 bSign = extractFloat32Sign( b );
1889 if ( aSign == bSign ) {
1890 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1891 }
1892 else {
1893 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1894 }
1895
1896 }
1897
1898 /*----------------------------------------------------------------------------
1899 | Returns the result of subtracting the single-precision floating-point values
1900 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1901 | for Binary Floating-Point Arithmetic.
1902 *----------------------------------------------------------------------------*/
1903
1904 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1905 {
1906 flag aSign, bSign;
1907 a = float32_squash_input_denormal(a STATUS_VAR);
1908 b = float32_squash_input_denormal(b STATUS_VAR);
1909
1910 aSign = extractFloat32Sign( a );
1911 bSign = extractFloat32Sign( b );
1912 if ( aSign == bSign ) {
1913 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1914 }
1915 else {
1916 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1917 }
1918
1919 }
1920
1921 /*----------------------------------------------------------------------------
1922 | Returns the result of multiplying the single-precision floating-point values
1923 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1924 | for Binary Floating-Point Arithmetic.
1925 *----------------------------------------------------------------------------*/
1926
1927 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1928 {
1929 flag aSign, bSign, zSign;
1930 int16 aExp, bExp, zExp;
1931 uint32_t aSig, bSig;
1932 uint64_t zSig64;
1933 uint32_t zSig;
1934
1935 a = float32_squash_input_denormal(a STATUS_VAR);
1936 b = float32_squash_input_denormal(b STATUS_VAR);
1937
1938 aSig = extractFloat32Frac( a );
1939 aExp = extractFloat32Exp( a );
1940 aSign = extractFloat32Sign( a );
1941 bSig = extractFloat32Frac( b );
1942 bExp = extractFloat32Exp( b );
1943 bSign = extractFloat32Sign( b );
1944 zSign = aSign ^ bSign;
1945 if ( aExp == 0xFF ) {
1946 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1947 return propagateFloat32NaN( a, b STATUS_VAR );
1948 }
1949 if ( ( bExp | bSig ) == 0 ) {
1950 float_raise( float_flag_invalid STATUS_VAR);
1951 return float32_default_nan;
1952 }
1953 return packFloat32( zSign, 0xFF, 0 );
1954 }
1955 if ( bExp == 0xFF ) {
1956 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1957 if ( ( aExp | aSig ) == 0 ) {
1958 float_raise( float_flag_invalid STATUS_VAR);
1959 return float32_default_nan;
1960 }
1961 return packFloat32( zSign, 0xFF, 0 );
1962 }
1963 if ( aExp == 0 ) {
1964 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1965 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1966 }
1967 if ( bExp == 0 ) {
1968 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1969 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1970 }
1971 zExp = aExp + bExp - 0x7F;
1972 aSig = ( aSig | 0x00800000 )<<7;
1973 bSig = ( bSig | 0x00800000 )<<8;
1974 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1975 zSig = zSig64;
1976 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1977 zSig <<= 1;
1978 --zExp;
1979 }
1980 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1981
1982 }
1983
1984 /*----------------------------------------------------------------------------
1985 | Returns the result of dividing the single-precision floating-point value `a'
1986 | by the corresponding value `b'. The operation is performed according to the
1987 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1988 *----------------------------------------------------------------------------*/
1989
1990 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1991 {
1992 flag aSign, bSign, zSign;
1993 int16 aExp, bExp, zExp;
1994 uint32_t aSig, bSig, zSig;
1995 a = float32_squash_input_denormal(a STATUS_VAR);
1996 b = float32_squash_input_denormal(b STATUS_VAR);
1997
1998 aSig = extractFloat32Frac( a );
1999 aExp = extractFloat32Exp( a );
2000 aSign = extractFloat32Sign( a );
2001 bSig = extractFloat32Frac( b );
2002 bExp = extractFloat32Exp( b );
2003 bSign = extractFloat32Sign( b );
2004 zSign = aSign ^ bSign;
2005 if ( aExp == 0xFF ) {
2006 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2007 if ( bExp == 0xFF ) {
2008 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2009 float_raise( float_flag_invalid STATUS_VAR);
2010 return float32_default_nan;
2011 }
2012 return packFloat32( zSign, 0xFF, 0 );
2013 }
2014 if ( bExp == 0xFF ) {
2015 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2016 return packFloat32( zSign, 0, 0 );
2017 }
2018 if ( bExp == 0 ) {
2019 if ( bSig == 0 ) {
2020 if ( ( aExp | aSig ) == 0 ) {
2021 float_raise( float_flag_invalid STATUS_VAR);
2022 return float32_default_nan;
2023 }
2024 float_raise( float_flag_divbyzero STATUS_VAR);
2025 return packFloat32( zSign, 0xFF, 0 );
2026 }
2027 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2028 }
2029 if ( aExp == 0 ) {
2030 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2031 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2032 }
2033 zExp = aExp - bExp + 0x7D;
2034 aSig = ( aSig | 0x00800000 )<<7;
2035 bSig = ( bSig | 0x00800000 )<<8;
2036 if ( bSig <= ( aSig + aSig ) ) {
2037 aSig >>= 1;
2038 ++zExp;
2039 }
2040 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2041 if ( ( zSig & 0x3F ) == 0 ) {
2042 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2043 }
2044 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2045
2046 }
2047
2048 /*----------------------------------------------------------------------------
2049 | Returns the remainder of the single-precision floating-point value `a'
2050 | with respect to the corresponding value `b'. The operation is performed
2051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2052 *----------------------------------------------------------------------------*/
2053
2054 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2055 {
2056 flag aSign, zSign;
2057 int16 aExp, bExp, expDiff;
2058 uint32_t aSig, bSig;
2059 uint32_t q;
2060 uint64_t aSig64, bSig64, q64;
2061 uint32_t alternateASig;
2062 int32_t sigMean;
2063 a = float32_squash_input_denormal(a STATUS_VAR);
2064 b = float32_squash_input_denormal(b STATUS_VAR);
2065
2066 aSig = extractFloat32Frac( a );
2067 aExp = extractFloat32Exp( a );
2068 aSign = extractFloat32Sign( a );
2069 bSig = extractFloat32Frac( b );
2070 bExp = extractFloat32Exp( b );
2071 if ( aExp == 0xFF ) {
2072 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2073 return propagateFloat32NaN( a, b STATUS_VAR );
2074 }
2075 float_raise( float_flag_invalid STATUS_VAR);
2076 return float32_default_nan;
2077 }
2078 if ( bExp == 0xFF ) {
2079 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2080 return a;
2081 }
2082 if ( bExp == 0 ) {
2083 if ( bSig == 0 ) {
2084 float_raise( float_flag_invalid STATUS_VAR);
2085 return float32_default_nan;
2086 }
2087 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2088 }
2089 if ( aExp == 0 ) {
2090 if ( aSig == 0 ) return a;
2091 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2092 }
2093 expDiff = aExp - bExp;
2094 aSig |= 0x00800000;
2095 bSig |= 0x00800000;
2096 if ( expDiff < 32 ) {
2097 aSig <<= 8;
2098 bSig <<= 8;
2099 if ( expDiff < 0 ) {
2100 if ( expDiff < -1 ) return a;
2101 aSig >>= 1;
2102 }
2103 q = ( bSig <= aSig );
2104 if ( q ) aSig -= bSig;
2105 if ( 0 < expDiff ) {
2106 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2107 q >>= 32 - expDiff;
2108 bSig >>= 2;
2109 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2110 }
2111 else {
2112 aSig >>= 2;
2113 bSig >>= 2;
2114 }
2115 }
2116 else {
2117 if ( bSig <= aSig ) aSig -= bSig;
2118 aSig64 = ( (uint64_t) aSig )<<40;
2119 bSig64 = ( (uint64_t) bSig )<<40;
2120 expDiff -= 64;
2121 while ( 0 < expDiff ) {
2122 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2123 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2124 aSig64 = - ( ( bSig * q64 )<<38 );
2125 expDiff -= 62;
2126 }
2127 expDiff += 64;
2128 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2129 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2130 q = q64>>( 64 - expDiff );
2131 bSig <<= 6;
2132 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2133 }
2134 do {
2135 alternateASig = aSig;
2136 ++q;
2137 aSig -= bSig;
2138 } while ( 0 <= (int32_t) aSig );
2139 sigMean = aSig + alternateASig;
2140 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2141 aSig = alternateASig;
2142 }
2143 zSign = ( (int32_t) aSig < 0 );
2144 if ( zSign ) aSig = - aSig;
2145 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2146
2147 }
2148
2149 /*----------------------------------------------------------------------------
2150 | Returns the square root of the single-precision floating-point value `a'.
2151 | The operation is performed according to the IEC/IEEE Standard for Binary
2152 | Floating-Point Arithmetic.
2153 *----------------------------------------------------------------------------*/
2154
2155 float32 float32_sqrt( float32 a STATUS_PARAM )
2156 {
2157 flag aSign;
2158 int16 aExp, zExp;
2159 uint32_t aSig, zSig;
2160 uint64_t rem, term;
2161 a = float32_squash_input_denormal(a STATUS_VAR);
2162
2163 aSig = extractFloat32Frac( a );
2164 aExp = extractFloat32Exp( a );
2165 aSign = extractFloat32Sign( a );
2166 if ( aExp == 0xFF ) {
2167 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2168 if ( ! aSign ) return a;
2169 float_raise( float_flag_invalid STATUS_VAR);
2170 return float32_default_nan;
2171 }
2172 if ( aSign ) {
2173 if ( ( aExp | aSig ) == 0 ) return a;
2174 float_raise( float_flag_invalid STATUS_VAR);
2175 return float32_default_nan;
2176 }
2177 if ( aExp == 0 ) {
2178 if ( aSig == 0 ) return float32_zero;
2179 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2180 }
2181 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2182 aSig = ( aSig | 0x00800000 )<<8;
2183 zSig = estimateSqrt32( aExp, aSig ) + 2;
2184 if ( ( zSig & 0x7F ) <= 5 ) {
2185 if ( zSig < 2 ) {
2186 zSig = 0x7FFFFFFF;
2187 goto roundAndPack;
2188 }
2189 aSig >>= aExp & 1;
2190 term = ( (uint64_t) zSig ) * zSig;
2191 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2192 while ( (int64_t) rem < 0 ) {
2193 --zSig;
2194 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2195 }
2196 zSig |= ( rem != 0 );
2197 }
2198 shift32RightJamming( zSig, 1, &zSig );
2199 roundAndPack:
2200 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2201
2202 }
2203
2204 /*----------------------------------------------------------------------------
2205 | Returns the binary exponential of the single-precision floating-point value
2206 | `a'. The operation is performed according to the IEC/IEEE Standard for
2207 | Binary Floating-Point Arithmetic.
2208 |
2209 | Uses the following identities:
2210 |
2211 | 1. -------------------------------------------------------------------------
2212 | x x*ln(2)
2213 | 2 = e
2214 |
2215 | 2. -------------------------------------------------------------------------
2216 | 2 3 4 5 n
2217 | x x x x x x x
2218 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2219 | 1! 2! 3! 4! 5! n!
2220 *----------------------------------------------------------------------------*/
2221
2222 static const float64 float32_exp2_coefficients[15] =
2223 {
2224 const_float64( 0x3ff0000000000000ll ), /* 1 */
2225 const_float64( 0x3fe0000000000000ll ), /* 2 */
2226 const_float64( 0x3fc5555555555555ll ), /* 3 */
2227 const_float64( 0x3fa5555555555555ll ), /* 4 */
2228 const_float64( 0x3f81111111111111ll ), /* 5 */
2229 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2230 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2231 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2232 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2233 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2234 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2235 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2236 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2237 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2238 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2239 };
2240
2241 float32 float32_exp2( float32 a STATUS_PARAM )
2242 {
2243 flag aSign;
2244 int16 aExp;
2245 uint32_t aSig;
2246 float64 r, x, xn;
2247 int i;
2248 a = float32_squash_input_denormal(a STATUS_VAR);
2249
2250 aSig = extractFloat32Frac( a );
2251 aExp = extractFloat32Exp( a );
2252 aSign = extractFloat32Sign( a );
2253
2254 if ( aExp == 0xFF) {
2255 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2256 return (aSign) ? float32_zero : a;
2257 }
2258 if (aExp == 0) {
2259 if (aSig == 0) return float32_one;
2260 }
2261
2262 float_raise( float_flag_inexact STATUS_VAR);
2263
2264 /* ******************************* */
2265 /* using float64 for approximation */
2266 /* ******************************* */
2267 x = float32_to_float64(a STATUS_VAR);
2268 x = float64_mul(x, float64_ln2 STATUS_VAR);
2269
2270 xn = x;
2271 r = float64_one;
2272 for (i = 0 ; i < 15 ; i++) {
2273 float64 f;
2274
2275 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2276 r = float64_add(r, f STATUS_VAR);
2277
2278 xn = float64_mul(xn, x STATUS_VAR);
2279 }
2280
2281 return float64_to_float32(r, status);
2282 }
2283
2284 /*----------------------------------------------------------------------------
2285 | Returns the binary log of the single-precision floating-point value `a'.
2286 | The operation is performed according to the IEC/IEEE Standard for Binary
2287 | Floating-Point Arithmetic.
2288 *----------------------------------------------------------------------------*/
2289 float32 float32_log2( float32 a STATUS_PARAM )
2290 {
2291 flag aSign, zSign;
2292 int16 aExp;
2293 uint32_t aSig, zSig, i;
2294
2295 a = float32_squash_input_denormal(a STATUS_VAR);
2296 aSig = extractFloat32Frac( a );
2297 aExp = extractFloat32Exp( a );
2298 aSign = extractFloat32Sign( a );
2299
2300 if ( aExp == 0 ) {
2301 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2302 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2303 }
2304 if ( aSign ) {
2305 float_raise( float_flag_invalid STATUS_VAR);
2306 return float32_default_nan;
2307 }
2308 if ( aExp == 0xFF ) {
2309 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2310 return a;
2311 }
2312
2313 aExp -= 0x7F;
2314 aSig |= 0x00800000;
2315 zSign = aExp < 0;
2316 zSig = aExp << 23;
2317
2318 for (i = 1 << 22; i > 0; i >>= 1) {
2319 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2320 if ( aSig & 0x01000000 ) {
2321 aSig >>= 1;
2322 zSig |= i;
2323 }
2324 }
2325
2326 if ( zSign )
2327 zSig = -zSig;
2328
2329 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2330 }
2331
2332 /*----------------------------------------------------------------------------
2333 | Returns 1 if the single-precision floating-point value `a' is equal to
2334 | the corresponding value `b', and 0 otherwise. The invalid exception is
2335 | raised if either operand is a NaN. Otherwise, the comparison is performed
2336 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2337 *----------------------------------------------------------------------------*/
2338
2339 int float32_eq( float32 a, float32 b STATUS_PARAM )
2340 {
2341 uint32_t av, bv;
2342 a = float32_squash_input_denormal(a STATUS_VAR);
2343 b = float32_squash_input_denormal(b STATUS_VAR);
2344
2345 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2346 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2347 ) {
2348 float_raise( float_flag_invalid STATUS_VAR);
2349 return 0;
2350 }
2351 av = float32_val(a);
2352 bv = float32_val(b);
2353 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2354 }
2355
2356 /*----------------------------------------------------------------------------
2357 | Returns 1 if the single-precision floating-point value `a' is less than
2358 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2359 | exception is raised if either operand is a NaN. The comparison is performed
2360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2361 *----------------------------------------------------------------------------*/
2362
2363 int float32_le( float32 a, float32 b STATUS_PARAM )
2364 {
2365 flag aSign, bSign;
2366 uint32_t av, bv;
2367 a = float32_squash_input_denormal(a STATUS_VAR);
2368 b = float32_squash_input_denormal(b STATUS_VAR);
2369
2370 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2371 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2372 ) {
2373 float_raise( float_flag_invalid STATUS_VAR);
2374 return 0;
2375 }
2376 aSign = extractFloat32Sign( a );
2377 bSign = extractFloat32Sign( b );
2378 av = float32_val(a);
2379 bv = float32_val(b);
2380 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2381 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2382
2383 }
2384
2385 /*----------------------------------------------------------------------------
2386 | Returns 1 if the single-precision floating-point value `a' is less than
2387 | the corresponding value `b', and 0 otherwise. The invalid exception is
2388 | raised if either operand is a NaN. The comparison is performed according
2389 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2390 *----------------------------------------------------------------------------*/
2391
2392 int float32_lt( float32 a, float32 b STATUS_PARAM )
2393 {
2394 flag aSign, bSign;
2395 uint32_t av, bv;
2396 a = float32_squash_input_denormal(a STATUS_VAR);
2397 b = float32_squash_input_denormal(b STATUS_VAR);
2398
2399 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2400 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2401 ) {
2402 float_raise( float_flag_invalid STATUS_VAR);
2403 return 0;
2404 }
2405 aSign = extractFloat32Sign( a );
2406 bSign = extractFloat32Sign( b );
2407 av = float32_val(a);
2408 bv = float32_val(b);
2409 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2410 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2411
2412 }
2413
2414 /*----------------------------------------------------------------------------
2415 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2416 | be compared, and 0 otherwise. The invalid exception is raised if either
2417 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2418 | Standard for Binary Floating-Point Arithmetic.
2419 *----------------------------------------------------------------------------*/
2420
2421 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2422 {
2423 a = float32_squash_input_denormal(a STATUS_VAR);
2424 b = float32_squash_input_denormal(b STATUS_VAR);
2425
2426 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2427 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2428 ) {
2429 float_raise( float_flag_invalid STATUS_VAR);
2430 return 1;
2431 }
2432 return 0;
2433 }
2434
2435 /*----------------------------------------------------------------------------
2436 | Returns 1 if the single-precision floating-point value `a' is equal to
2437 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2438 | exception. The comparison is performed according to the IEC/IEEE Standard
2439 | for Binary Floating-Point Arithmetic.
2440 *----------------------------------------------------------------------------*/
2441
2442 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2443 {
2444 a = float32_squash_input_denormal(a STATUS_VAR);
2445 b = float32_squash_input_denormal(b STATUS_VAR);
2446
2447 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2448 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2449 ) {
2450 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2451 float_raise( float_flag_invalid STATUS_VAR);
2452 }
2453 return 0;
2454 }
2455 return ( float32_val(a) == float32_val(b) ) ||
2456 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2457 }
2458
2459 /*----------------------------------------------------------------------------
2460 | Returns 1 if the single-precision floating-point value `a' is less than or
2461 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2462 | cause an exception. Otherwise, the comparison is performed according to the
2463 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2464 *----------------------------------------------------------------------------*/
2465
2466 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2467 {
2468 flag aSign, bSign;
2469 uint32_t av, bv;
2470 a = float32_squash_input_denormal(a STATUS_VAR);
2471 b = float32_squash_input_denormal(b STATUS_VAR);
2472
2473 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2474 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2475 ) {
2476 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2477 float_raise( float_flag_invalid STATUS_VAR);
2478 }
2479 return 0;
2480 }
2481 aSign = extractFloat32Sign( a );
2482 bSign = extractFloat32Sign( b );
2483 av = float32_val(a);
2484 bv = float32_val(b);
2485 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2486 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2487
2488 }
2489
2490 /*----------------------------------------------------------------------------
2491 | Returns 1 if the single-precision floating-point value `a' is less than
2492 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2493 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2494 | Standard for Binary Floating-Point Arithmetic.
2495 *----------------------------------------------------------------------------*/
2496
2497 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2498 {
2499 flag aSign, bSign;
2500 uint32_t av, bv;
2501 a = float32_squash_input_denormal(a STATUS_VAR);
2502 b = float32_squash_input_denormal(b STATUS_VAR);
2503
2504 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2505 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2506 ) {
2507 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2508 float_raise( float_flag_invalid STATUS_VAR);
2509 }
2510 return 0;
2511 }
2512 aSign = extractFloat32Sign( a );
2513 bSign = extractFloat32Sign( b );
2514 av = float32_val(a);
2515 bv = float32_val(b);
2516 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2517 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2518
2519 }
2520
2521 /*----------------------------------------------------------------------------
2522 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2523 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2524 | comparison is performed according to the IEC/IEEE Standard for Binary
2525 | Floating-Point Arithmetic.
2526 *----------------------------------------------------------------------------*/
2527
2528 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2529 {
2530 a = float32_squash_input_denormal(a STATUS_VAR);
2531 b = float32_squash_input_denormal(b STATUS_VAR);
2532
2533 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2534 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2535 ) {
2536 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2537 float_raise( float_flag_invalid STATUS_VAR);
2538 }
2539 return 1;
2540 }
2541 return 0;
2542 }
2543
2544 /*----------------------------------------------------------------------------
2545 | Returns the result of converting the double-precision floating-point value
2546 | `a' to the 32-bit two's complement integer format. The conversion is
2547 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2548 | Arithmetic---which means in particular that the conversion is rounded
2549 | according to the current rounding mode. If `a' is a NaN, the largest
2550 | positive integer is returned. Otherwise, if the conversion overflows, the
2551 | largest integer with the same sign as `a' is returned.
2552 *----------------------------------------------------------------------------*/
2553
2554 int32 float64_to_int32( float64 a STATUS_PARAM )
2555 {
2556 flag aSign;
2557 int16 aExp, shiftCount;
2558 uint64_t aSig;
2559 a = float64_squash_input_denormal(a STATUS_VAR);
2560
2561 aSig = extractFloat64Frac( a );
2562 aExp = extractFloat64Exp( a );
2563 aSign = extractFloat64Sign( a );
2564 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2565 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2566 shiftCount = 0x42C - aExp;
2567 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2568 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2569
2570 }
2571
2572 /*----------------------------------------------------------------------------
2573 | Returns the result of converting the double-precision floating-point value
2574 | `a' to the 32-bit two's complement integer format. The conversion is
2575 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2576 | Arithmetic, except that the conversion is always rounded toward zero.
2577 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2578 | the conversion overflows, the largest integer with the same sign as `a' is
2579 | returned.
2580 *----------------------------------------------------------------------------*/
2581
2582 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2583 {
2584 flag aSign;
2585 int16 aExp, shiftCount;
2586 uint64_t aSig, savedASig;
2587 int32 z;
2588 a = float64_squash_input_denormal(a STATUS_VAR);
2589
2590 aSig = extractFloat64Frac( a );
2591 aExp = extractFloat64Exp( a );
2592 aSign = extractFloat64Sign( a );
2593 if ( 0x41E < aExp ) {
2594 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2595 goto invalid;
2596 }
2597 else if ( aExp < 0x3FF ) {
2598 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2599 return 0;
2600 }
2601 aSig |= LIT64( 0x0010000000000000 );
2602 shiftCount = 0x433 - aExp;
2603 savedASig = aSig;
2604 aSig >>= shiftCount;
2605 z = aSig;
2606 if ( aSign ) z = - z;
2607 if ( ( z < 0 ) ^ aSign ) {
2608 invalid:
2609 float_raise( float_flag_invalid STATUS_VAR);
2610 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2611 }
2612 if ( ( aSig<<shiftCount ) != savedASig ) {
2613 STATUS(float_exception_flags) |= float_flag_inexact;
2614 }
2615 return z;
2616
2617 }
2618
2619 /*----------------------------------------------------------------------------
2620 | Returns the result of converting the double-precision floating-point value
2621 | `a' to the 16-bit two's complement integer format. The conversion is
2622 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2623 | Arithmetic, except that the conversion is always rounded toward zero.
2624 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2625 | the conversion overflows, the largest integer with the same sign as `a' is
2626 | returned.
2627 *----------------------------------------------------------------------------*/
2628
2629 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2630 {
2631 flag aSign;
2632 int16 aExp, shiftCount;
2633 uint64_t aSig, savedASig;
2634 int32 z;
2635
2636 aSig = extractFloat64Frac( a );
2637 aExp = extractFloat64Exp( a );
2638 aSign = extractFloat64Sign( a );
2639 if ( 0x40E < aExp ) {
2640 if ( ( aExp == 0x7FF ) && aSig ) {
2641 aSign = 0;
2642 }
2643 goto invalid;
2644 }
2645 else if ( aExp < 0x3FF ) {
2646 if ( aExp || aSig ) {
2647 STATUS(float_exception_flags) |= float_flag_inexact;
2648 }
2649 return 0;
2650 }
2651 aSig |= LIT64( 0x0010000000000000 );
2652 shiftCount = 0x433 - aExp;
2653 savedASig = aSig;
2654 aSig >>= shiftCount;
2655 z = aSig;
2656 if ( aSign ) {
2657 z = - z;
2658 }
2659 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2660 invalid:
2661 float_raise( float_flag_invalid STATUS_VAR);
2662 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2663 }
2664 if ( ( aSig<<shiftCount ) != savedASig ) {
2665 STATUS(float_exception_flags) |= float_flag_inexact;
2666 }
2667 return z;
2668 }
2669
2670 /*----------------------------------------------------------------------------
2671 | Returns the result of converting the double-precision floating-point value
2672 | `a' to the 64-bit two's complement integer format. The conversion is
2673 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2674 | Arithmetic---which means in particular that the conversion is rounded
2675 | according to the current rounding mode. If `a' is a NaN, the largest
2676 | positive integer is returned. Otherwise, if the conversion overflows, the
2677 | largest integer with the same sign as `a' is returned.
2678 *----------------------------------------------------------------------------*/
2679
2680 int64 float64_to_int64( float64 a STATUS_PARAM )
2681 {
2682 flag aSign;
2683 int16 aExp, shiftCount;
2684 uint64_t aSig, aSigExtra;
2685 a = float64_squash_input_denormal(a STATUS_VAR);
2686
2687 aSig = extractFloat64Frac( a );
2688 aExp = extractFloat64Exp( a );
2689 aSign = extractFloat64Sign( a );
2690 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2691 shiftCount = 0x433 - aExp;
2692 if ( shiftCount <= 0 ) {
2693 if ( 0x43E < aExp ) {
2694 float_raise( float_flag_invalid STATUS_VAR);
2695 if ( ! aSign
2696 || ( ( aExp == 0x7FF )
2697 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2698 ) {
2699 return LIT64( 0x7FFFFFFFFFFFFFFF );
2700 }
2701 return (int64_t) LIT64( 0x8000000000000000 );
2702 }
2703 aSigExtra = 0;
2704 aSig <<= - shiftCount;
2705 }
2706 else {
2707 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2708 }
2709 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2710
2711 }
2712
2713 /*----------------------------------------------------------------------------
2714 | Returns the result of converting the double-precision floating-point value
2715 | `a' to the 64-bit two's complement integer format. The conversion is
2716 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2717 | Arithmetic, except that the conversion is always rounded toward zero.
2718 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2719 | the conversion overflows, the largest integer with the same sign as `a' is
2720 | returned.
2721 *----------------------------------------------------------------------------*/
2722
2723 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2724 {
2725 flag aSign;
2726 int16 aExp, shiftCount;
2727 uint64_t aSig;
2728 int64 z;
2729 a = float64_squash_input_denormal(a STATUS_VAR);
2730
2731 aSig = extractFloat64Frac( a );
2732 aExp = extractFloat64Exp( a );
2733 aSign = extractFloat64Sign( a );
2734 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2735 shiftCount = aExp - 0x433;
2736 if ( 0 <= shiftCount ) {
2737 if ( 0x43E <= aExp ) {
2738 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2739 float_raise( float_flag_invalid STATUS_VAR);
2740 if ( ! aSign
2741 || ( ( aExp == 0x7FF )
2742 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2743 ) {
2744 return LIT64( 0x7FFFFFFFFFFFFFFF );
2745 }
2746 }
2747 return (int64_t) LIT64( 0x8000000000000000 );
2748 }
2749 z = aSig<<shiftCount;
2750 }
2751 else {
2752 if ( aExp < 0x3FE ) {
2753 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2754 return 0;
2755 }
2756 z = aSig>>( - shiftCount );
2757 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2758 STATUS(float_exception_flags) |= float_flag_inexact;
2759 }
2760 }
2761 if ( aSign ) z = - z;
2762 return z;
2763
2764 }
2765
2766 /*----------------------------------------------------------------------------
2767 | Returns the result of converting the double-precision floating-point value
2768 | `a' to the single-precision floating-point format. The conversion is
2769 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2770 | Arithmetic.
2771 *----------------------------------------------------------------------------*/
2772
2773 float32 float64_to_float32( float64 a STATUS_PARAM )
2774 {
2775 flag aSign;
2776 int16 aExp;
2777 uint64_t aSig;
2778 uint32_t zSig;
2779 a = float64_squash_input_denormal(a STATUS_VAR);
2780
2781 aSig = extractFloat64Frac( a );
2782 aExp = extractFloat64Exp( a );
2783 aSign = extractFloat64Sign( a );
2784 if ( aExp == 0x7FF ) {
2785 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2786 return packFloat32( aSign, 0xFF, 0 );
2787 }
2788 shift64RightJamming( aSig, 22, &aSig );
2789 zSig = aSig;
2790 if ( aExp || zSig ) {
2791 zSig |= 0x40000000;
2792 aExp -= 0x381;
2793 }
2794 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2795
2796 }
2797
2798
2799 /*----------------------------------------------------------------------------
2800 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2801 | half-precision floating-point value, returning the result. After being
2802 | shifted into the proper positions, the three fields are simply added
2803 | together to form the result. This means that any integer portion of `zSig'
2804 | will be added into the exponent. Since a properly normalized significand
2805 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2806 | than the desired result exponent whenever `zSig' is a complete, normalized
2807 | significand.
2808 *----------------------------------------------------------------------------*/
2809 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2810 {
2811 return make_float16(
2812 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2813 }
2814
2815 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2816 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2817
2818 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2819 {
2820 flag aSign;
2821 int16 aExp;
2822 uint32_t aSig;
2823
2824 aSign = extractFloat16Sign(a);
2825 aExp = extractFloat16Exp(a);
2826 aSig = extractFloat16Frac(a);
2827
2828 if (aExp == 0x1f && ieee) {
2829 if (aSig) {
2830 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2831 }
2832 return packFloat32(aSign, 0xff, aSig << 13);
2833 }
2834 if (aExp == 0) {
2835 int8 shiftCount;
2836
2837 if (aSig == 0) {
2838 return packFloat32(aSign, 0, 0);
2839 }
2840
2841 shiftCount = countLeadingZeros32( aSig ) - 21;
2842 aSig = aSig << shiftCount;
2843 aExp = -shiftCount;
2844 }
2845 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2846 }
2847
2848 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2849 {
2850 flag aSign;
2851 int16 aExp;
2852 uint32_t aSig;
2853 uint32_t mask;
2854 uint32_t increment;
2855 int8 roundingMode;
2856 a = float32_squash_input_denormal(a STATUS_VAR);
2857
2858 aSig = extractFloat32Frac( a );
2859 aExp = extractFloat32Exp( a );
2860 aSign = extractFloat32Sign( a );
2861 if ( aExp == 0xFF ) {
2862 if (aSig) {
2863 /* Input is a NaN */
2864 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2865 if (!ieee) {
2866 return packFloat16(aSign, 0, 0);
2867 }
2868 return r;
2869 }
2870 /* Infinity */
2871 if (!ieee) {
2872 float_raise(float_flag_invalid STATUS_VAR);
2873 return packFloat16(aSign, 0x1f, 0x3ff);
2874 }
2875 return packFloat16(aSign, 0x1f, 0);
2876 }
2877 if (aExp == 0 && aSig == 0) {
2878 return packFloat16(aSign, 0, 0);
2879 }
2880 /* Decimal point between bits 22 and 23. */
2881 aSig |= 0x00800000;
2882 aExp -= 0x7f;
2883 if (aExp < -14) {
2884 mask = 0x00ffffff;
2885 if (aExp >= -24) {
2886 mask >>= 25 + aExp;
2887 }
2888 } else {
2889 mask = 0x00001fff;
2890 }
2891 if (aSig & mask) {
2892 float_raise( float_flag_underflow STATUS_VAR );
2893 roundingMode = STATUS(float_rounding_mode);
2894 switch (roundingMode) {
2895 case float_round_nearest_even:
2896 increment = (mask + 1) >> 1;
2897 if ((aSig & mask) == increment) {
2898 increment = aSig & (increment << 1);
2899 }
2900 break;
2901 case float_round_up:
2902 increment = aSign ? 0 : mask;
2903 break;
2904 case float_round_down:
2905 increment = aSign ? mask : 0;
2906 break;
2907 default: /* round_to_zero */
2908 increment = 0;
2909 break;
2910 }
2911 aSig += increment;
2912 if (aSig >= 0x01000000) {
2913 aSig >>= 1;
2914 aExp++;
2915 }
2916 } else if (aExp < -14
2917 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2918 float_raise( float_flag_underflow STATUS_VAR);
2919 }
2920
2921 if (ieee) {
2922 if (aExp > 15) {
2923 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2924 return packFloat16(aSign, 0x1f, 0);
2925 }
2926 } else {
2927 if (aExp > 16) {
2928 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2929 return packFloat16(aSign, 0x1f, 0x3ff);
2930 }
2931 }
2932 if (aExp < -24) {
2933 return packFloat16(aSign, 0, 0);
2934 }
2935 if (aExp < -14) {
2936 aSig >>= -14 - aExp;
2937 aExp = -14;
2938 }
2939 return packFloat16(aSign, aExp + 14, aSig >> 13);
2940 }
2941
2942 #ifdef FLOATX80
2943
2944 /*----------------------------------------------------------------------------
2945 | Returns the result of converting the double-precision floating-point value
2946 | `a' to the extended double-precision floating-point format. The conversion
2947 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2948 | Arithmetic.
2949 *----------------------------------------------------------------------------*/
2950
2951 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2952 {
2953 flag aSign;
2954 int16 aExp;
2955 uint64_t aSig;
2956
2957 a = float64_squash_input_denormal(a STATUS_VAR);
2958 aSig = extractFloat64Frac( a );
2959 aExp = extractFloat64Exp( a );
2960 aSign = extractFloat64Sign( a );
2961 if ( aExp == 0x7FF ) {
2962 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2963 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2964 }
2965 if ( aExp == 0 ) {
2966 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2967 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2968 }
2969 return
2970 packFloatx80(
2971 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2972
2973 }
2974
2975 #endif
2976
2977 #ifdef FLOAT128
2978
2979 /*----------------------------------------------------------------------------
2980 | Returns the result of converting the double-precision floating-point value
2981 | `a' to the quadruple-precision floating-point format. The conversion is
2982 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2983 | Arithmetic.
2984 *----------------------------------------------------------------------------*/
2985
2986 float128 float64_to_float128( float64 a STATUS_PARAM )
2987 {
2988 flag aSign;
2989 int16 aExp;
2990 uint64_t aSig, zSig0, zSig1;
2991
2992 a = float64_squash_input_denormal(a STATUS_VAR);
2993 aSig = extractFloat64Frac( a );
2994 aExp = extractFloat64Exp( a );
2995 aSign = extractFloat64Sign( a );
2996 if ( aExp == 0x7FF ) {
2997 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2998 return packFloat128( aSign, 0x7FFF, 0, 0 );
2999 }
3000 if ( aExp == 0 ) {
3001 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3002 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3003 --aExp;
3004 }
3005 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3006 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3007
3008 }
3009
3010 #endif
3011
3012 /*----------------------------------------------------------------------------
3013 | Rounds the double-precision floating-point value `a' to an integer, and
3014 | returns the result as a double-precision floating-point value. The
3015 | operation is performed according to the IEC/IEEE Standard for Binary
3016 | Floating-Point Arithmetic.
3017 *----------------------------------------------------------------------------*/
3018
3019 float64 float64_round_to_int( float64 a STATUS_PARAM )
3020 {
3021 flag aSign;
3022 int16 aExp;
3023 uint64_t lastBitMask, roundBitsMask;
3024 int8 roundingMode;
3025 uint64_t z;
3026 a = float64_squash_input_denormal(a STATUS_VAR);
3027
3028 aExp = extractFloat64Exp( a );
3029 if ( 0x433 <= aExp ) {
3030 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3031 return propagateFloat64NaN( a, a STATUS_VAR );
3032 }
3033 return a;
3034 }
3035 if ( aExp < 0x3FF ) {
3036 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3037 STATUS(float_exception_flags) |= float_flag_inexact;
3038 aSign = extractFloat64Sign( a );
3039 switch ( STATUS(float_rounding_mode) ) {
3040 case float_round_nearest_even:
3041 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3042 return packFloat64( aSign, 0x3FF, 0 );
3043 }
3044 break;
3045 case float_round_down:
3046 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3047 case float_round_up:
3048 return make_float64(
3049 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3050 }
3051 return packFloat64( aSign, 0, 0 );
3052 }
3053 lastBitMask = 1;
3054 lastBitMask <<= 0x433 - aExp;
3055 roundBitsMask = lastBitMask - 1;
3056 z = float64_val(a);
3057 roundingMode = STATUS(float_rounding_mode);
3058 if ( roundingMode == float_round_nearest_even ) {
3059 z += lastBitMask>>1;
3060 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3061 }
3062 else if ( roundingMode != float_round_to_zero ) {
3063 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3064 z += roundBitsMask;
3065 }
3066 }
3067 z &= ~ roundBitsMask;
3068 if ( z != float64_val(a) )
3069 STATUS(float_exception_flags) |= float_flag_inexact;
3070 return make_float64(z);
3071
3072 }
3073
3074 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3075 {
3076 int oldmode;
3077 float64 res;
3078 oldmode = STATUS(float_rounding_mode);
3079 STATUS(float_rounding_mode) = float_round_to_zero;
3080 res = float64_round_to_int(a STATUS_VAR);
3081 STATUS(float_rounding_mode) = oldmode;
3082 return res;
3083 }
3084
3085 /*----------------------------------------------------------------------------
3086 | Returns the result of adding the absolute values of the double-precision
3087 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3088 | before being returned. `zSign' is ignored if the result is a NaN.
3089 | The addition is performed according to the IEC/IEEE Standard for Binary
3090 | Floating-Point Arithmetic.
3091 *----------------------------------------------------------------------------*/
3092
3093 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3094 {
3095 int16 aExp, bExp, zExp;
3096 uint64_t aSig, bSig, zSig;
3097 int16 expDiff;
3098
3099 aSig = extractFloat64Frac( a );
3100 aExp = extractFloat64Exp( a );
3101 bSig = extractFloat64Frac( b );
3102 bExp = extractFloat64Exp( b );
3103 expDiff = aExp - bExp;
3104 aSig <<= 9;
3105 bSig <<= 9;
3106 if ( 0 < expDiff ) {
3107 if ( aExp == 0x7FF ) {
3108 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3109 return a;
3110 }
3111 if ( bExp == 0 ) {
3112 --expDiff;
3113 }
3114 else {
3115 bSig |= LIT64( 0x2000000000000000 );
3116 }
3117 shift64RightJamming( bSig, expDiff, &bSig );
3118 zExp = aExp;
3119 }
3120 else if ( expDiff < 0 ) {
3121 if ( bExp == 0x7FF ) {
3122 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3123 return packFloat64( zSign, 0x7FF, 0 );
3124 }
3125 if ( aExp == 0 ) {
3126 ++expDiff;
3127 }
3128 else {
3129 aSig |= LIT64( 0x2000000000000000 );
3130 }
3131 shift64RightJamming( aSig, - expDiff, &aSig );
3132 zExp = bExp;
3133 }
3134 else {
3135 if ( aExp == 0x7FF ) {
3136 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3137 return a;
3138 }
3139 if ( aExp == 0 ) {
3140 if (STATUS(flush_to_zero)) {
3141 if (aSig | bSig) {
3142 float_raise(float_flag_output_denormal STATUS_VAR);
3143 }
3144 return packFloat64(zSign, 0, 0);
3145 }
3146 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3147 }
3148 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3149 zExp = aExp;
3150 goto roundAndPack;
3151 }
3152 aSig |= LIT64( 0x2000000000000000 );
3153 zSig = ( aSig + bSig )<<1;
3154 --zExp;
3155 if ( (int64_t) zSig < 0 ) {
3156 zSig = aSig + bSig;
3157 ++zExp;
3158 }
3159 roundAndPack:
3160 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3161
3162 }
3163
3164 /*----------------------------------------------------------------------------
3165 | Returns the result of subtracting the absolute values of the double-
3166 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3167 | difference is negated before being returned. `zSign' is ignored if the
3168 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3169 | Standard for Binary Floating-Point Arithmetic.
3170 *----------------------------------------------------------------------------*/
3171
3172 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3173 {
3174 int16 aExp, bExp, zExp;
3175 uint64_t aSig, bSig, zSig;
3176 int16 expDiff;
3177
3178 aSig = extractFloat64Frac( a );
3179 aExp = extractFloat64Exp( a );
3180 bSig = extractFloat64Frac( b );
3181 bExp = extractFloat64Exp( b );
3182 expDiff = aExp - bExp;
3183 aSig <<= 10;
3184 bSig <<= 10;
3185 if ( 0 < expDiff ) goto aExpBigger;
3186 if ( expDiff < 0 ) goto bExpBigger;
3187 if ( aExp == 0x7FF ) {
3188 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3189 float_raise( float_flag_invalid STATUS_VAR);
3190 return float64_default_nan;
3191 }
3192 if ( aExp == 0 ) {
3193 aExp = 1;
3194 bExp = 1;
3195 }
3196 if ( bSig < aSig ) goto aBigger;
3197 if ( aSig < bSig ) goto bBigger;
3198 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3199 bExpBigger:
3200 if ( bExp == 0x7FF ) {
3201 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3202 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3203 }
3204 if ( aExp == 0 ) {
3205 ++expDiff;
3206 }
3207 else {
3208 aSig |= LIT64( 0x4000000000000000 );
3209 }
3210 shift64RightJamming( aSig, - expDiff, &aSig );
3211 bSig |= LIT64( 0x4000000000000000 );
3212 bBigger:
3213 zSig = bSig - aSig;
3214 zExp = bExp;
3215 zSign ^= 1;
3216 goto normalizeRoundAndPack;
3217 aExpBigger:
3218 if ( aExp == 0x7FF ) {
3219 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3220 return a;
3221 }
3222 if ( bExp == 0 ) {
3223 --expDiff;
3224 }
3225 else {
3226 bSig |= LIT64( 0x4000000000000000 );
3227 }
3228 shift64RightJamming( bSig, expDiff, &bSig );
3229 aSig |= LIT64( 0x4000000000000000 );
3230 aBigger:
3231 zSig = aSig - bSig;
3232 zExp = aExp;
3233 normalizeRoundAndPack:
3234 --zExp;
3235 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3236
3237 }
3238
3239 /*----------------------------------------------------------------------------
3240 | Returns the result of adding the double-precision floating-point values `a'
3241 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3242 | Binary Floating-Point Arithmetic.
3243 *----------------------------------------------------------------------------*/
3244
3245 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3246 {
3247 flag aSign, bSign;
3248 a = float64_squash_input_denormal(a STATUS_VAR);
3249 b = float64_squash_input_denormal(b STATUS_VAR);
3250
3251 aSign = extractFloat64Sign( a );
3252 bSign = extractFloat64Sign( b );
3253 if ( aSign == bSign ) {
3254 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3255 }
3256 else {
3257 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3258 }
3259
3260 }
3261
3262 /*----------------------------------------------------------------------------
3263 | Returns the result of subtracting the double-precision floating-point values
3264 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3265 | for Binary Floating-Point Arithmetic.
3266 *----------------------------------------------------------------------------*/
3267
3268 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3269 {
3270 flag aSign, bSign;
3271 a = float64_squash_input_denormal(a STATUS_VAR);
3272 b = float64_squash_input_denormal(b STATUS_VAR);
3273
3274 aSign = extractFloat64Sign( a );
3275 bSign = extractFloat64Sign( b );
3276 if ( aSign == bSign ) {
3277 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3278 }
3279 else {
3280 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3281 }
3282
3283 }
3284
3285 /*----------------------------------------------------------------------------
3286 | Returns the result of multiplying the double-precision floating-point values
3287 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3288 | for Binary Floating-Point Arithmetic.
3289 *----------------------------------------------------------------------------*/
3290
3291 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3292 {
3293 flag aSign, bSign, zSign;
3294 int16 aExp, bExp, zExp;
3295 uint64_t aSig, bSig, zSig0, zSig1;
3296
3297 a = float64_squash_input_denormal(a STATUS_VAR);
3298 b = float64_squash_input_denormal(b STATUS_VAR);
3299
3300 aSig = extractFloat64Frac( a );
3301 aExp = extractFloat64Exp( a );
3302 aSign = extractFloat64Sign( a );
3303 bSig = extractFloat64Frac( b );
3304 bExp = extractFloat64Exp( b );
3305 bSign = extractFloat64Sign( b );
3306 zSign = aSign ^ bSign;
3307 if ( aExp == 0x7FF ) {
3308 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3309 return propagateFloat64NaN( a, b STATUS_VAR );
3310 }
3311 if ( ( bExp | bSig ) == 0 ) {
3312 float_raise( float_flag_invalid STATUS_VAR);
3313 return float64_default_nan;
3314 }
3315 return packFloat64( zSign, 0x7FF, 0 );
3316 }
3317 if ( bExp == 0x7FF ) {
3318 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3319 if ( ( aExp | aSig ) == 0 ) {
3320 float_raise( float_flag_invalid STATUS_VAR);
3321 return float64_default_nan;
3322 }
3323 return packFloat64( zSign, 0x7FF, 0 );
3324 }
3325 if ( aExp == 0 ) {
3326 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3327 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3328 }
3329 if ( bExp == 0 ) {
3330 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3331 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3332 }
3333 zExp = aExp + bExp - 0x3FF;
3334 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3335 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3336 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3337 zSig0 |= ( zSig1 != 0 );
3338 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3339 zSig0 <<= 1;
3340 --zExp;
3341 }
3342 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3343
3344 }
3345
3346 /*----------------------------------------------------------------------------
3347 | Returns the result of dividing the double-precision floating-point value `a'
3348 | by the corresponding value `b'. The operation is performed according to
3349 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3350 *----------------------------------------------------------------------------*/
3351
3352 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3353 {
3354 flag aSign, bSign, zSign;
3355 int16 aExp, bExp, zExp;
3356 uint64_t aSig, bSig, zSig;
3357 uint64_t rem0, rem1;
3358 uint64_t term0, term1;
3359 a = float64_squash_input_denormal(a STATUS_VAR);
3360 b = float64_squash_input_denormal(b STATUS_VAR);
3361
3362 aSig = extractFloat64Frac( a );
3363 aExp = extractFloat64Exp( a );
3364 aSign = extractFloat64Sign( a );
3365 bSig = extractFloat64Frac( b );
3366 bExp = extractFloat64Exp( b );
3367 bSign = extractFloat64Sign( b );
3368 zSign = aSign ^ bSign;
3369 if ( aExp == 0x7FF ) {
3370 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3371 if ( bExp == 0x7FF ) {
3372 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3373 float_raise( float_flag_invalid STATUS_VAR);
3374 return float64_default_nan;
3375 }
3376 return packFloat64( zSign, 0x7FF, 0 );
3377 }
3378 if ( bExp == 0x7FF ) {
3379 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3380 return packFloat64( zSign, 0, 0 );
3381 }
3382 if ( bExp == 0 ) {
3383 if ( bSig == 0 ) {
3384 if ( ( aExp | aSig ) == 0 ) {
3385 float_raise( float_flag_invalid STATUS_VAR);
3386 return float64_default_nan;
3387 }
3388 float_raise( float_flag_divbyzero STATUS_VAR);
3389 return packFloat64( zSign, 0x7FF, 0 );
3390 }
3391 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3392 }
3393 if ( aExp == 0 ) {
3394 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3395 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3396 }
3397 zExp = aExp - bExp + 0x3FD;
3398 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3399 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3400 if ( bSig <= ( aSig + aSig ) ) {
3401 aSig >>= 1;
3402 ++zExp;
3403 }
3404 zSig = estimateDiv128To64( aSig, 0, bSig );
3405 if ( ( zSig & 0x1FF ) <= 2 ) {
3406 mul64To128( bSig, zSig, &term0, &term1 );
3407 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3408 while ( (int64_t) rem0 < 0 ) {
3409 --zSig;
3410 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3411 }
3412 zSig |= ( rem1 != 0 );
3413 }
3414 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3415
3416 }
3417
3418 /*----------------------------------------------------------------------------
3419 | Returns the remainder of the double-precision floating-point value `a'
3420 | with respect to the corresponding value `b'. The operation is performed
3421 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3422 *----------------------------------------------------------------------------*/
3423
3424 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3425 {
3426 flag aSign, zSign;
3427 int16 aExp, bExp, expDiff;
3428 uint64_t aSig, bSig;
3429 uint64_t q, alternateASig;
3430 int64_t sigMean;
3431
3432 a = float64_squash_input_denormal(a STATUS_VAR);
3433 b = float64_squash_input_denormal(b STATUS_VAR);
3434 aSig = extractFloat64Frac( a );
3435 aExp = extractFloat64Exp( a );
3436 aSign = extractFloat64Sign( a );
3437 bSig = extractFloat64Frac( b );
3438 bExp = extractFloat64Exp( b );
3439 if ( aExp == 0x7FF ) {
3440 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3441 return propagateFloat64NaN( a, b STATUS_VAR );
3442 }
3443 float_raise( float_flag_invalid STATUS_VAR);
3444 return float64_default_nan;
3445 }
3446 if ( bExp == 0x7FF ) {
3447 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3448 return a;
3449 }
3450 if ( bExp == 0 ) {
3451 if ( bSig == 0 ) {
3452 float_raise( float_flag_invalid STATUS_VAR);
3453 return float64_default_nan;
3454 }
3455 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3456 }
3457 if ( aExp == 0 ) {
3458 if ( aSig == 0 ) return a;
3459 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3460 }
3461 expDiff = aExp - bExp;
3462 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3463 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3464 if ( expDiff < 0 ) {
3465 if ( expDiff < -1 ) return a;
3466 aSig >>= 1;
3467 }
3468 q = ( bSig <= aSig );
3469 if ( q ) aSig -= bSig;
3470 expDiff -= 64;
3471 while ( 0 < expDiff ) {
3472 q = estimateDiv128To64( aSig, 0, bSig );
3473 q = ( 2 < q ) ? q - 2 : 0;
3474 aSig = - ( ( bSig>>2 ) * q );
3475 expDiff -= 62;
3476 }
3477 expDiff += 64;
3478 if ( 0 < expDiff ) {
3479 q = estimateDiv128To64( aSig, 0, bSig );
3480 q = ( 2 < q ) ? q - 2 : 0;
3481 q >>= 64 - expDiff;
3482 bSig >>= 2;
3483 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3484 }
3485 else {
3486 aSig >>= 2;
3487 bSig >>= 2;
3488 }
3489 do {
3490 alternateASig = aSig;
3491 ++q;
3492 aSig -= bSig;
3493 } while ( 0 <= (int64_t) aSig );
3494 sigMean = aSig + alternateASig;
3495 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3496 aSig = alternateASig;
3497 }
3498 zSign = ( (int64_t) aSig < 0 );
3499 if ( zSign ) aSig = - aSig;
3500 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3501
3502 }
3503
3504 /*----------------------------------------------------------------------------
3505 | Returns the square root of the double-precision floating-point value `a'.
3506 | The operation is performed according to the IEC/IEEE Standard for Binary
3507 | Floating-Point Arithmetic.
3508 *----------------------------------------------------------------------------*/
3509
3510 float64 float64_sqrt( float64 a STATUS_PARAM )
3511 {
3512 flag aSign;
3513 int16 aExp, zExp;
3514 uint64_t aSig, zSig, doubleZSig;
3515 uint64_t rem0, rem1, term0, term1;
3516 a = float64_squash_input_denormal(a STATUS_VAR);
3517
3518 aSig = extractFloat64Frac( a );
3519 aExp = extractFloat64Exp( a );
3520 aSign = extractFloat64Sign( a );
3521 if ( aExp == 0x7FF ) {
3522 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3523 if ( ! aSign ) return a;
3524 float_raise( float_flag_invalid STATUS_VAR);
3525 return float64_default_nan;
3526 }
3527 if ( aSign ) {
3528 if ( ( aExp | aSig ) == 0 ) return a;
3529 float_raise( float_flag_invalid STATUS_VAR);
3530 return float64_default_nan;
3531 }
3532 if ( aExp == 0 ) {
3533 if ( aSig == 0 ) return float64_zero;
3534 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3535 }
3536 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3537 aSig |= LIT64( 0x0010000000000000 );
3538 zSig = estimateSqrt32( aExp, aSig>>21 );
3539 aSig <<= 9 - ( aExp & 1 );
3540 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3541 if ( ( zSig & 0x1FF ) <= 5 ) {
3542 doubleZSig = zSig<<1;
3543 mul64To128( zSig, zSig, &term0, &term1 );
3544 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3545 while ( (int64_t) rem0 < 0 ) {
3546 --zSig;
3547 doubleZSig -= 2;
3548 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3549 }
3550 zSig |= ( ( rem0 | rem1 ) != 0 );
3551 }
3552 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3553
3554 }
3555
3556 /*----------------------------------------------------------------------------
3557 | Returns the binary log of the double-precision floating-point value `a'.
3558 | The operation is performed according to the IEC/IEEE Standard for Binary
3559 | Floating-Point Arithmetic.
3560 *----------------------------------------------------------------------------*/
3561 float64 float64_log2( float64 a STATUS_PARAM )
3562 {
3563 flag aSign, zSign;
3564 int16 aExp;
3565 uint64_t aSig, aSig0, aSig1, zSig, i;
3566 a = float64_squash_input_denormal(a STATUS_VAR);
3567
3568 aSig = extractFloat64Frac( a );
3569 aExp = extractFloat64Exp( a );
3570 aSign = extractFloat64Sign( a );
3571
3572 if ( aExp == 0 ) {
3573 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3574 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3575 }
3576 if ( aSign ) {
3577 float_raise( float_flag_invalid STATUS_VAR);
3578 return float64_default_nan;
3579 }
3580 if ( aExp == 0x7FF ) {
3581 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3582 return a;
3583 }
3584
3585 aExp -= 0x3FF;
3586 aSig |= LIT64( 0x0010000000000000 );
3587 zSign = aExp < 0;
3588 zSig = (uint64_t)aExp << 52;
3589 for (i = 1LL << 51; i > 0; i >>= 1) {
3590 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3591 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3592 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3593 aSig >>= 1;
3594 zSig |= i;
3595 }
3596 }
3597
3598 if ( zSign )
3599 zSig = -zSig;
3600 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3601 }
3602
3603 /*----------------------------------------------------------------------------
3604 | Returns 1 if the double-precision floating-point value `a' is equal to the
3605 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3606 | if either operand is a NaN. Otherwise, the comparison is performed
3607 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3608 *----------------------------------------------------------------------------*/
3609
3610 int float64_eq( float64 a, float64 b STATUS_PARAM )
3611 {
3612 uint64_t av, bv;
3613 a = float64_squash_input_denormal(a STATUS_VAR);
3614 b = float64_squash_input_denormal(b STATUS_VAR);
3615
3616 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3617 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3618 ) {
3619 float_raise( float_flag_invalid STATUS_VAR);
3620 return 0;
3621 }
3622 av = float64_val(a);
3623 bv = float64_val(b);
3624 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3625
3626 }
3627
3628 /*----------------------------------------------------------------------------
3629 | Returns 1 if the double-precision floating-point value `a' is less than or
3630 | equal to the corresponding value `b', and 0 otherwise. The invalid
3631 | exception is raised if either operand is a NaN. The comparison is performed
3632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3633 *----------------------------------------------------------------------------*/
3634
3635 int float64_le( float64 a, float64 b STATUS_PARAM )
3636 {
3637 flag aSign, bSign;
3638 uint64_t av, bv;
3639 a = float64_squash_input_denormal(a STATUS_VAR);
3640 b = float64_squash_input_denormal(b STATUS_VAR);
3641
3642 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3643 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3644 ) {
3645 float_raise( float_flag_invalid STATUS_VAR);
3646 return 0;
3647 }
3648 aSign = extractFloat64Sign( a );
3649 bSign = extractFloat64Sign( b );
3650 av = float64_val(a);
3651 bv = float64_val(b);
3652 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3653 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3654
3655 }
3656
3657 /*----------------------------------------------------------------------------
3658 | Returns 1 if the double-precision floating-point value `a' is less than
3659 | the corresponding value `b', and 0 otherwise. The invalid exception is
3660 | raised if either operand is a NaN. The comparison is performed according
3661 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3662 *----------------------------------------------------------------------------*/
3663
3664 int float64_lt( float64 a, float64 b STATUS_PARAM )
3665 {
3666 flag aSign, bSign;
3667 uint64_t av, bv;
3668
3669 a = float64_squash_input_denormal(a STATUS_VAR);
3670 b = float64_squash_input_denormal(b STATUS_VAR);
3671 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3672 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3673 ) {
3674 float_raise( float_flag_invalid STATUS_VAR);
3675 return 0;
3676 }
3677 aSign = extractFloat64Sign( a );
3678 bSign = extractFloat64Sign( b );
3679 av = float64_val(a);
3680 bv = float64_val(b);
3681 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3682 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3683
3684 }
3685
3686 /*----------------------------------------------------------------------------
3687 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3688 | be compared, and 0 otherwise. The invalid exception is raised if either
3689 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3690 | Standard for Binary Floating-Point Arithmetic.
3691 *----------------------------------------------------------------------------*/
3692
3693 int float64_unordered( float64 a, float64 b STATUS_PARAM )
3694 {
3695 a = float64_squash_input_denormal(a STATUS_VAR);
3696 b = float64_squash_input_denormal(b STATUS_VAR);
3697
3698 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3699 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3700 ) {
3701 float_raise( float_flag_invalid STATUS_VAR);
3702 return 1;
3703 }
3704 return 0;
3705 }
3706
3707 /*----------------------------------------------------------------------------
3708 | Returns 1 if the double-precision floating-point value `a' is equal to the
3709 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3710 | exception.The comparison is performed according to the IEC/IEEE Standard
3711 | for Binary Floating-Point Arithmetic.
3712 *----------------------------------------------------------------------------*/
3713
3714 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3715 {
3716 uint64_t av, bv;
3717 a = float64_squash_input_denormal(a STATUS_VAR);
3718 b = float64_squash_input_denormal(b STATUS_VAR);
3719
3720 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3721 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3722 ) {
3723 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3724 float_raise( float_flag_invalid STATUS_VAR);
3725 }
3726 return 0;
3727 }
3728 av = float64_val(a);
3729 bv = float64_val(b);
3730 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3731
3732 }
3733
3734 /*----------------------------------------------------------------------------
3735 | Returns 1 if the double-precision floating-point value `a' is less than or
3736 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3737 | cause an exception. Otherwise, the comparison is performed according to the
3738 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3739 *----------------------------------------------------------------------------*/
3740
3741 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3742 {
3743 flag aSign, bSign;
3744 uint64_t av, bv;
3745 a = float64_squash_input_denormal(a STATUS_VAR);
3746 b = float64_squash_input_denormal(b STATUS_VAR);
3747
3748 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3749 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3750 ) {
3751 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3752 float_raise( float_flag_invalid STATUS_VAR);
3753 }
3754 return 0;
3755 }
3756 aSign = extractFloat64Sign( a );
3757 bSign = extractFloat64Sign( b );
3758 av = float64_val(a);
3759 bv = float64_val(b);
3760 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3761 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3762
3763 }
3764
3765 /*----------------------------------------------------------------------------
3766 | Returns 1 if the double-precision floating-point value `a' is less than
3767 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3768 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3769 | Standard for Binary Floating-Point Arithmetic.
3770 *----------------------------------------------------------------------------*/
3771
3772 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3773 {
3774 flag aSign, bSign;
3775 uint64_t av, bv;
3776 a = float64_squash_input_denormal(a STATUS_VAR);
3777 b = float64_squash_input_denormal(b STATUS_VAR);
3778
3779 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3780 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3781 ) {
3782 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3783 float_raise( float_flag_invalid STATUS_VAR);
3784 }
3785 return 0;
3786 }
3787 aSign = extractFloat64Sign( a );
3788 bSign = extractFloat64Sign( b );
3789 av = float64_val(a);
3790 bv = float64_val(b);
3791 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3792 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3793
3794 }
3795
3796 /*----------------------------------------------------------------------------
3797 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3798 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3799 | comparison is performed according to the IEC/IEEE Standard for Binary
3800 | Floating-Point Arithmetic.
3801 *----------------------------------------------------------------------------*/
3802
3803 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3804 {
3805 a = float64_squash_input_denormal(a STATUS_VAR);
3806 b = float64_squash_input_denormal(b STATUS_VAR);
3807
3808 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3809 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3810 ) {
3811 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3812 float_raise( float_flag_invalid STATUS_VAR);
3813 }
3814 return 1;
3815 }
3816 return 0;
3817 }
3818
3819 #ifdef FLOATX80
3820
3821 /*----------------------------------------------------------------------------
3822 | Returns the result of converting the extended double-precision floating-
3823 | point value `a' to the 32-bit two's complement integer format. The
3824 | conversion is performed according to the IEC/IEEE Standard for Binary
3825 | Floating-Point Arithmetic---which means in particular that the conversion
3826 | is rounded according to the current rounding mode. If `a' is a NaN, the
3827 | largest positive integer is returned. Otherwise, if the conversion
3828 | overflows, the largest integer with the same sign as `a' is returned.
3829 *----------------------------------------------------------------------------*/
3830
3831 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3832 {
3833 flag aSign;
3834 int32 aExp, shiftCount;
3835 uint64_t aSig;
3836
3837 aSig = extractFloatx80Frac( a );
3838 aExp = extractFloatx80Exp( a );
3839 aSign = extractFloatx80Sign( a );
3840 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3841 shiftCount = 0x4037 - aExp;
3842 if ( shiftCount <= 0 ) shiftCount = 1;
3843 shift64RightJamming( aSig, shiftCount, &aSig );
3844 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3845
3846 }
3847
3848 /*----------------------------------------------------------------------------
3849 | Returns the result of converting the extended double-precision floating-
3850 | point value `a' to the 32-bit two's complement integer format. The
3851 | conversion is performed according to the IEC/IEEE Standard for Binary
3852 | Floating-Point Arithmetic, except that the conversion is always rounded
3853 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3854 | Otherwise, if the conversion overflows, the largest integer with the same
3855 | sign as `a' is returned.
3856 *----------------------------------------------------------------------------*/
3857
3858 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3859 {
3860 flag aSign;
3861 int32 aExp, shiftCount;
3862 uint64_t aSig, savedASig;
3863 int32 z;
3864
3865 aSig = extractFloatx80Frac( a );
3866 aExp = extractFloatx80Exp( a );
3867 aSign = extractFloatx80Sign( a );
3868 if ( 0x401E < aExp ) {
3869 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3870 goto invalid;
3871 }
3872 else if ( aExp < 0x3FFF ) {
3873 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3874 return 0;
3875 }
3876 shiftCount = 0x403E - aExp;
3877 savedASig = aSig;
3878 aSig >>= shiftCount;
3879 z = aSig;
3880 if ( aSign ) z = - z;
3881 if ( ( z < 0 ) ^ aSign ) {
3882 invalid:
3883 float_raise( float_flag_invalid STATUS_VAR);
3884 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3885 }
3886 if ( ( aSig<<shiftCount ) != savedASig ) {
3887 STATUS(float_exception_flags) |= float_flag_inexact;
3888 }
3889 return z;
3890
3891 }
3892
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of converting the extended double-precision floating-
3895 | point value `a' to the 64-bit two's complement integer format. The
3896 | conversion is performed according to the IEC/IEEE Standard for Binary
3897 | Floating-Point Arithmetic---which means in particular that the conversion
3898 | is rounded according to the current rounding mode. If `a' is a NaN,
3899 | the largest positive integer is returned. Otherwise, if the conversion
3900 | overflows, the largest integer with the same sign as `a' is returned.
3901 *----------------------------------------------------------------------------*/
3902
3903 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3904 {
3905 flag aSign;
3906 int32 aExp, shiftCount;
3907 uint64_t aSig, aSigExtra;
3908
3909 aSig = extractFloatx80Frac( a );
3910 aExp = extractFloatx80Exp( a );
3911 aSign = extractFloatx80Sign( a );
3912 shiftCount = 0x403E - aExp;
3913 if ( shiftCount <= 0 ) {
3914 if ( shiftCount ) {
3915 float_raise( float_flag_invalid STATUS_VAR);
3916 if ( ! aSign
3917 || ( ( aExp == 0x7FFF )
3918 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3919 ) {
3920 return LIT64( 0x7FFFFFFFFFFFFFFF );
3921 }
3922 return (int64_t) LIT64( 0x8000000000000000 );
3923 }
3924 aSigExtra = 0;
3925 }
3926 else {
3927 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3928 }
3929 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3930
3931 }
3932
3933 /*----------------------------------------------------------------------------
3934 | Returns the result of converting the extended double-precision floating-
3935 | point value `a' to the 64-bit two's complement integer format. The
3936 | conversion is performed according to the IEC/IEEE Standard for Binary
3937 | Floating-Point Arithmetic, except that the conversion is always rounded
3938 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3939 | Otherwise, if the conversion overflows, the largest integer with the same
3940 | sign as `a' is returned.
3941 *----------------------------------------------------------------------------*/
3942
3943 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3944 {
3945 flag aSign;
3946 int32 aExp, shiftCount;
3947 uint64_t aSig;
3948 int64 z;
3949
3950 aSig = extractFloatx80Frac( a );
3951 aExp = extractFloatx80Exp( a );
3952 aSign = extractFloatx80Sign( a );
3953 shiftCount = aExp - 0x403E;
3954 if ( 0 <= shiftCount ) {
3955 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3956 if ( ( a.high != 0xC03E ) || aSig ) {
3957 float_raise( float_flag_invalid STATUS_VAR);
3958 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3959 return LIT64( 0x7FFFFFFFFFFFFFFF );
3960 }
3961 }
3962 return (int64_t) LIT64( 0x8000000000000000 );
3963 }
3964 else if ( aExp < 0x3FFF ) {
3965 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3966 return 0;
3967 }
3968 z = aSig>>( - shiftCount );
3969 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3970 STATUS(float_exception_flags) |= float_flag_inexact;
3971 }
3972 if ( aSign ) z = - z;
3973 return z;
3974
3975 }
3976
3977 /*----------------------------------------------------------------------------
3978 | Returns the result of converting the extended double-precision floating-
3979 | point value `a' to the single-precision floating-point format. The
3980 | conversion is performed according to the IEC/IEEE Standard for Binary
3981 | Floating-Point Arithmetic.
3982 *----------------------------------------------------------------------------*/
3983
3984 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3985 {
3986 flag aSign;
3987 int32 aExp;
3988 uint64_t aSig;
3989
3990 aSig = extractFloatx80Frac( a );
3991 aExp = extractFloatx80Exp( a );
3992 aSign = extractFloatx80Sign( a );
3993 if ( aExp == 0x7FFF ) {
3994 if ( (uint64_t) ( aSig<<1 ) ) {
3995 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3996 }
3997 return packFloat32( aSign, 0xFF, 0 );
3998 }
3999 shift64RightJamming( aSig, 33, &aSig );
4000 if ( aExp || aSig ) aExp -= 0x3F81;
4001 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4002
4003 }
4004
4005 /*----------------------------------------------------------------------------
4006 | Returns the result of converting the extended double-precision floating-
4007 | point value `a' to the double-precision floating-point format. The
4008 | conversion is performed according to the IEC/IEEE Standard for Binary
4009 | Floating-Point Arithmetic.
4010 *----------------------------------------------------------------------------*/
4011
4012 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4013 {
4014 flag aSign;
4015 int32 aExp;
4016 uint64_t aSig, zSig;
4017
4018 aSig = extractFloatx80Frac( a );
4019 aExp = extractFloatx80Exp( a );
4020 aSign = extractFloatx80Sign( a );
4021 if ( aExp == 0x7FFF ) {
4022 if ( (uint64_t) ( aSig<<1 ) ) {
4023 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4024 }
4025 return packFloat64( aSign, 0x7FF, 0 );
4026 }
4027 shift64RightJamming( aSig, 1, &zSig );
4028 if ( aExp || aSig ) aExp -= 0x3C01;
4029 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4030
4031 }
4032
4033 #ifdef FLOAT128
4034
4035 /*----------------------------------------------------------------------------
4036 | Returns the result of converting the extended double-precision floating-
4037 | point value `a' to the quadruple-precision floating-point format. The
4038 | conversion is performed according to the IEC/IEEE Standard for Binary
4039 | Floating-Point Arithmetic.
4040 *----------------------------------------------------------------------------*/
4041
4042 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4043 {
4044 flag aSign;
4045 int16 aExp;
4046 uint64_t aSig, zSig0, zSig1;
4047
4048 aSig = extractFloatx80Frac( a );
4049 aExp = extractFloatx80Exp( a );
4050 aSign = extractFloatx80Sign( a );
4051 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4052 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4053 }
4054 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4055 return packFloat128( aSign, aExp, zSig0, zSig1 );
4056
4057 }
4058
4059 #endif
4060
4061 /*----------------------------------------------------------------------------
4062 | Rounds the extended double-precision floating-point value `a' to an integer,
4063 | and returns the result as an extended quadruple-precision floating-point
4064 | value. The operation is performed according to the IEC/IEEE Standard for
4065 | Binary Floating-Point Arithmetic.
4066 *----------------------------------------------------------------------------*/
4067
4068 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4069 {
4070 flag aSign;
4071 int32 aExp;
4072 uint64_t lastBitMask, roundBitsMask;
4073 int8 roundingMode;
4074 floatx80 z;
4075
4076 aExp = extractFloatx80Exp( a );
4077 if ( 0x403E <= aExp ) {
4078 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4079 return propagateFloatx80NaN( a, a STATUS_VAR );
4080 }
4081 return a;
4082 }
4083 if ( aExp < 0x3FFF ) {
4084 if ( ( aExp == 0 )
4085 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4086 return a;
4087 }
4088 STATUS(float_exception_flags) |= float_flag_inexact;
4089 aSign = extractFloatx80Sign( a );
4090 switch ( STATUS(float_rounding_mode) ) {
4091 case float_round_nearest_even:
4092 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4093 ) {
4094 return
4095 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4096 }
4097 break;
4098 case float_round_down:
4099 return
4100 aSign ?
4101 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4102 : packFloatx80( 0, 0, 0 );
4103 case float_round_up:
4104 return
4105 aSign ? packFloatx80( 1, 0, 0 )
4106 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4107 }
4108 return packFloatx80( aSign, 0, 0 );
4109 }
4110 lastBitMask = 1;
4111 lastBitMask <<= 0x403E - aExp;
4112 roundBitsMask = lastBitMask - 1;
4113 z = a;
4114 roundingMode = STATUS(float_rounding_mode);
4115 if ( roundingMode == float_round_nearest_even ) {
4116 z.low += lastBitMask>>1;
4117 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4118 }
4119 else if ( roundingMode != float_round_to_zero ) {
4120 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4121 z.low += roundBitsMask;
4122 }
4123 }
4124 z.low &= ~ roundBitsMask;
4125 if ( z.low == 0 ) {
4126 ++z.high;
4127 z.low = LIT64( 0x8000000000000000 );
4128 }
4129 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4130 return z;
4131
4132 }
4133
4134 /*----------------------------------------------------------------------------
4135 | Returns the result of adding the absolute values of the extended double-
4136 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4137 | negated before being returned. `zSign' is ignored if the result is a NaN.
4138 | The addition is performed according to the IEC/IEEE Standard for Binary
4139 | Floating-Point Arithmetic.
4140 *----------------------------------------------------------------------------*/
4141
4142 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4143 {
4144 int32 aExp, bExp, zExp;
4145 uint64_t aSig, bSig, zSig0, zSig1;
4146 int32 expDiff;
4147
4148 aSig = extractFloatx80Frac( a );
4149 aExp = extractFloatx80Exp( a );
4150 bSig = extractFloatx80Frac( b );
4151 bExp = extractFloatx80Exp( b );
4152 expDiff = aExp - bExp;
4153 if ( 0 < expDiff ) {
4154 if ( aExp == 0x7FFF ) {
4155 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4156 return a;
4157 }
4158 if ( bExp == 0 ) --expDiff;
4159 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4160 zExp = aExp;
4161 }
4162 else if ( expDiff < 0 ) {
4163 if ( bExp == 0x7FFF ) {
4164 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4165 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4166 }
4167 if ( aExp == 0 ) ++expDiff;
4168 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4169 zExp = bExp;
4170 }
4171 else {
4172 if ( aExp == 0x7FFF ) {
4173 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4174 return propagateFloatx80NaN( a, b STATUS_VAR );
4175 }
4176 return a;
4177 }
4178 zSig1 = 0;
4179 zSig0 = aSig + bSig;
4180 if ( aExp == 0 ) {
4181 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4182 goto roundAndPack;
4183 }
4184 zExp = aExp;
4185 goto shiftRight1;
4186 }
4187 zSig0 = aSig + bSig;
4188 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4189 shiftRight1:
4190 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4191 zSig0 |= LIT64( 0x8000000000000000 );
4192 ++zExp;
4193 roundAndPack:
4194 return
4195 roundAndPackFloatx80(
4196 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4197
4198 }
4199
4200 /*----------------------------------------------------------------------------
4201 | Returns the result of subtracting the absolute values of the extended
4202 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4203 | difference is negated before being returned. `zSign' is ignored if the
4204 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4205 | Standard for Binary Floating-Point Arithmetic.
4206 *----------------------------------------------------------------------------*/
4207
4208 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4209 {
4210 int32 aExp, bExp, zExp;
4211 uint64_t aSig, bSig, zSig0, zSig1;
4212 int32 expDiff;
4213 floatx80 z;
4214
4215 aSig = extractFloatx80Frac( a );
4216 aExp = extractFloatx80Exp( a );
4217 bSig = extractFloatx80Frac( b );
4218 bExp = extractFloatx80Exp( b );
4219 expDiff = aExp - bExp;
4220 if ( 0 < expDiff ) goto aExpBigger;
4221 if ( expDiff < 0 ) goto bExpBigger;
4222 if ( aExp == 0x7FFF ) {
4223 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4224 return propagateFloatx80NaN( a, b STATUS_VAR );
4225 }
4226 float_raise( float_flag_invalid STATUS_VAR);
4227 z.low = floatx80_default_nan_low;
4228 z.high = floatx80_default_nan_high;
4229 return z;
4230 }
4231 if ( aExp == 0 ) {
4232 aExp = 1;
4233 bExp = 1;
4234 }
4235 zSig1 = 0;
4236 if ( bSig < aSig ) goto aBigger;
4237 if ( aSig < bSig ) goto bBigger;
4238 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4239 bExpBigger:
4240 if ( bExp == 0x7FFF ) {
4241 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4242 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4243 }
4244 if ( aExp == 0 ) ++expDiff;
4245 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4246 bBigger:
4247 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4248 zExp = bExp;
4249 zSign ^= 1;
4250 goto normalizeRoundAndPack;
4251 aExpBigger:
4252 if ( aExp == 0x7FFF ) {
4253 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4254 return a;
4255 }
4256 if ( bExp == 0 ) --expDiff;
4257 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4258 aBigger:
4259 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4260 zExp = aExp;
4261 normalizeRoundAndPack:
4262 return
4263 normalizeRoundAndPackFloatx80(
4264 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4265
4266 }
4267
4268 /*----------------------------------------------------------------------------
4269 | Returns the result of adding the extended double-precision floating-point
4270 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4271 | Standard for Binary Floating-Point Arithmetic.
4272 *----------------------------------------------------------------------------*/
4273
4274 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4275 {
4276 flag aSign, bSign;
4277
4278 aSign = extractFloatx80Sign( a );
4279 bSign = extractFloatx80Sign( b );
4280 if ( aSign == bSign ) {
4281 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4282 }
4283 else {
4284 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4285 }
4286
4287 }
4288
4289 /*----------------------------------------------------------------------------
4290 | Returns the result of subtracting the extended double-precision floating-
4291 | point values `a' and `b'. The operation is performed according to the
4292 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4293 *----------------------------------------------------------------------------*/
4294
4295 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4296 {
4297 flag aSign, bSign;
4298
4299 aSign = extractFloatx80Sign( a );
4300 bSign = extractFloatx80Sign( b );
4301 if ( aSign == bSign ) {
4302 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4303 }
4304 else {
4305 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4306 }
4307
4308 }
4309
4310 /*----------------------------------------------------------------------------
4311 | Returns the result of multiplying the extended double-precision floating-
4312 | point values `a' and `b'. The operation is performed according to the
4313 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4314 *----------------------------------------------------------------------------*/
4315
4316 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4317 {
4318 flag aSign, bSign, zSign;
4319 int32 aExp, bExp, zExp;
4320 uint64_t aSig, bSig, zSig0, zSig1;
4321 floatx80 z;
4322
4323 aSig = extractFloatx80Frac( a );
4324 aExp = extractFloatx80Exp( a );
4325 aSign = extractFloatx80Sign( a );
4326 bSig = extractFloatx80Frac( b );
4327 bExp = extractFloatx80Exp( b );
4328 bSign = extractFloatx80Sign( b );
4329 zSign = aSign ^ bSign;
4330 if ( aExp == 0x7FFF ) {
4331 if ( (uint64_t) ( aSig<<1 )
4332 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4333 return propagateFloatx80NaN( a, b STATUS_VAR );
4334 }
4335 if ( ( bExp | bSig ) == 0 ) goto invalid;
4336 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4337 }
4338 if ( bExp == 0x7FFF ) {
4339 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4340 if ( ( aExp | aSig ) == 0 ) {
4341 invalid:
4342 float_raise( float_flag_invalid STATUS_VAR);
4343 z.low = floatx80_default_nan_low;
4344 z.high = floatx80_default_nan_high;
4345 return z;
4346 }
4347 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4348 }
4349 if ( aExp == 0 ) {
4350 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4351 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4352 }
4353 if ( bExp == 0 ) {
4354 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4355 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4356 }
4357 zExp = aExp + bExp - 0x3FFE;
4358 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4359 if ( 0 < (int64_t) zSig0 ) {
4360 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4361 --zExp;
4362 }
4363 return
4364 roundAndPackFloatx80(
4365 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4366
4367 }
4368
4369 /*----------------------------------------------------------------------------
4370 | Returns the result of dividing the extended double-precision floating-point
4371 | value `a' by the corresponding value `b'. The operation is performed
4372 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4373 *----------------------------------------------------------------------------*/
4374
4375 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4376 {
4377 flag aSign, bSign, zSign;
4378 int32 aExp, bExp, zExp;
4379 uint64_t aSig, bSig, zSig0, zSig1;
4380 uint64_t rem0, rem1, rem2, term0, term1, term2;
4381 floatx80 z;
4382
4383 aSig = extractFloatx80Frac( a );
4384 aExp = extractFloatx80Exp( a );
4385 aSign = extractFloatx80Sign( a );
4386 bSig = extractFloatx80Frac( b );
4387 bExp = extractFloatx80Exp( b );
4388 bSign = extractFloatx80Sign( b );
4389 zSign = aSign ^ bSign;
4390 if ( aExp == 0x7FFF ) {
4391 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4392 if ( bExp == 0x7FFF ) {
4393 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4394 goto invalid;
4395 }
4396 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4397 }
4398 if ( bExp == 0x7FFF ) {
4399 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4400 return packFloatx80( zSign, 0, 0 );
4401 }
4402 if ( bExp == 0 ) {
4403 if ( bSig == 0 ) {
4404 if ( ( aExp | aSig ) == 0 ) {
4405 invalid:
4406 float_raise( float_flag_invalid STATUS_VAR);
4407 z.low = floatx80_default_nan_low;
4408 z.high = floatx80_default_nan_high;
4409 return z;
4410 }
4411 float_raise( float_flag_divbyzero STATUS_VAR);
4412 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4413 }
4414 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4415 }
4416 if ( aExp == 0 ) {
4417 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4418 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4419 }
4420 zExp = aExp - bExp + 0x3FFE;
4421 rem1 = 0;
4422 if ( bSig <= aSig ) {
4423 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4424 ++zExp;
4425 }
4426 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4427 mul64To128( bSig, zSig0, &term0, &term1 );
4428 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4429 while ( (int64_t) rem0 < 0 ) {
4430 --zSig0;
4431 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4432 }
4433 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4434 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4435 mul64To128( bSig, zSig1, &term1, &term2 );
4436 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4437 while ( (int64_t) rem1 < 0 ) {
4438 --zSig1;
4439 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4440 }
4441 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4442 }
4443 return
4444 roundAndPackFloatx80(
4445 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4446
4447 }
4448
4449 /*----------------------------------------------------------------------------
4450 | Returns the remainder of the extended double-precision floating-point value
4451 | `a' with respect to the corresponding value `b'. The operation is performed
4452 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4453 *----------------------------------------------------------------------------*/
4454
4455 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4456 {
4457 flag aSign, zSign;
4458 int32 aExp, bExp, expDiff;
4459 uint64_t aSig0, aSig1, bSig;
4460 uint64_t q, term0, term1, alternateASig0, alternateASig1;
4461 floatx80 z;
4462
4463 aSig0 = extractFloatx80Frac( a );
4464 aExp = extractFloatx80Exp( a );
4465 aSign = extractFloatx80Sign( a );
4466 bSig = extractFloatx80Frac( b );
4467 bExp = extractFloatx80Exp( b );
4468 if ( aExp == 0x7FFF ) {
4469 if ( (uint64_t) ( aSig0<<1 )
4470 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4471 return propagateFloatx80NaN( a, b STATUS_VAR );
4472 }
4473 goto invalid;
4474 }
4475 if ( bExp == 0x7FFF ) {
4476 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4477 return a;
4478 }
4479 if ( bExp == 0 ) {
4480 if ( bSig == 0 ) {
4481 invalid:
4482 float_raise( float_flag_invalid STATUS_VAR);
4483 z.low = floatx80_default_nan_low;
4484 z.high = floatx80_default_nan_high;
4485 return z;
4486 }
4487 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4488 }
4489 if ( aExp == 0 ) {
4490 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4491 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4492 }
4493 bSig |= LIT64( 0x8000000000000000 );
4494 zSign = aSign;
4495 expDiff = aExp - bExp;
4496 aSig1 = 0;
4497 if ( expDiff < 0 ) {
4498 if ( expDiff < -1 ) return a;
4499 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4500 expDiff = 0;
4501 }
4502 q = ( bSig <= aSig0 );
4503 if ( q ) aSig0 -= bSig;
4504 expDiff -= 64;
4505 while ( 0 < expDiff ) {
4506 q = estimateDiv128To64( aSig0, aSig1, bSig );
4507 q = ( 2 < q ) ? q - 2 : 0;
4508 mul64To128( bSig, q, &term0, &term1 );
4509 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4510 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4511 expDiff -= 62;
4512 }
4513 expDiff += 64;
4514 if ( 0 < expDiff ) {
4515 q = estimateDiv128To64( aSig0, aSig1, bSig );
4516 q = ( 2 < q ) ? q - 2 : 0;
4517 q >>= 64 - expDiff;
4518 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4519 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4520 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4521 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4522 ++q;
4523 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4524 }
4525 }
4526 else {
4527 term1 = 0;
4528 term0 = bSig;
4529 }
4530 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4531 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4532 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4533 && ( q & 1 ) )
4534 ) {
4535 aSig0 = alternateASig0;
4536 aSig1 = alternateASig1;
4537 zSign = ! zSign;
4538 }
4539 return
4540 normalizeRoundAndPackFloatx80(
4541 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4542
4543 }
4544
4545 /*----------------------------------------------------------------------------
4546 | Returns the square root of the extended double-precision floating-point
4547 | value `a'. The operation is performed according to the IEC/IEEE Standard
4548 | for Binary Floating-Point Arithmetic.
4549 *----------------------------------------------------------------------------*/
4550
4551 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4552 {
4553 flag aSign;
4554 int32 aExp, zExp;
4555 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4556 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4557 floatx80 z;
4558
4559 aSig0 = extractFloatx80Frac( a );
4560 aExp = extractFloatx80Exp( a );
4561 aSign = extractFloatx80Sign( a );
4562 if ( aExp == 0x7FFF ) {
4563 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4564 if ( ! aSign ) return a;
4565 goto invalid;
4566 }
4567 if ( aSign ) {
4568 if ( ( aExp | aSig0 ) == 0 ) return a;
4569 invalid:
4570 float_raise( float_flag_invalid STATUS_VAR);
4571 z.low = floatx80_default_nan_low;
4572 z.high = floatx80_default_nan_high;
4573 return z;
4574 }
4575 if ( aExp == 0 ) {
4576 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4577 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4578 }
4579 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4580 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4581 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4582 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4583 doubleZSig0 = zSig0<<1;
4584 mul64To128( zSig0, zSig0, &term0, &term1 );
4585 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4586 while ( (int64_t) rem0 < 0 ) {
4587 --zSig0;
4588 doubleZSig0 -= 2;
4589 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4590 }
4591 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4592 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4593 if ( zSig1 == 0 ) zSig1 = 1;
4594 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4595 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4596 mul64To128( zSig1, zSig1, &term2, &term3 );
4597 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4598 while ( (int64_t) rem1 < 0 ) {
4599 --zSig1;
4600 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4601 term3 |= 1;
4602 term2 |= doubleZSig0;
4603 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4604 }
4605 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4606 }
4607 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4608 zSig0 |= doubleZSig0;
4609 return
4610 roundAndPackFloatx80(
4611 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4612
4613 }
4614
4615 /*----------------------------------------------------------------------------
4616 | Returns 1 if the extended double-precision floating-point value `a' is equal
4617 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4618 | raised if either operand is a NaN. Otherwise, the comparison is performed
4619 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4620 *----------------------------------------------------------------------------*/
4621
4622 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4623 {
4624
4625 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4626 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4627 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4628 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4629 ) {
4630 float_raise( float_flag_invalid STATUS_VAR);
4631 return 0;
4632 }
4633 return
4634 ( a.low == b.low )
4635 && ( ( a.high == b.high )
4636 || ( ( a.low == 0 )
4637 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4638 );
4639
4640 }
4641
4642 /*----------------------------------------------------------------------------
4643 | Returns 1 if the extended double-precision floating-point value `a' is
4644 | less than or equal to the corresponding value `b', and 0 otherwise. The
4645 | invalid exception is raised if either operand is a NaN. The comparison is
4646 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4647 | Arithmetic.
4648 *----------------------------------------------------------------------------*/
4649
4650 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4651 {
4652 flag aSign, bSign;
4653
4654 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4655 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4656 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4657 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4658 ) {
4659 float_raise( float_flag_invalid STATUS_VAR);
4660 return 0;
4661 }
4662 aSign = extractFloatx80Sign( a );
4663 bSign = extractFloatx80Sign( b );
4664 if ( aSign != bSign ) {
4665 return
4666 aSign
4667 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4668 == 0 );
4669 }
4670 return
4671 aSign ? le128( b.high, b.low, a.high, a.low )
4672 : le128( a.high, a.low, b.high, b.low );
4673
4674 }
4675
4676 /*----------------------------------------------------------------------------
4677 | Returns 1 if the extended double-precision floating-point value `a' is
4678 | less than the corresponding value `b', and 0 otherwise. The invalid
4679 | exception is raised if either operand is a NaN. The comparison is performed
4680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4681 *----------------------------------------------------------------------------*/
4682
4683 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4684 {
4685 flag aSign, bSign;
4686
4687 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4688 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4689 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4690 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4691 ) {
4692 float_raise( float_flag_invalid STATUS_VAR);
4693 return 0;
4694 }
4695 aSign = extractFloatx80Sign( a );
4696 bSign = extractFloatx80Sign( b );
4697 if ( aSign != bSign ) {
4698 return
4699 aSign
4700 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4701 != 0 );
4702 }
4703 return
4704 aSign ? lt128( b.high, b.low, a.high, a.low )
4705 : lt128( a.high, a.low, b.high, b.low );
4706
4707 }
4708
4709 /*----------------------------------------------------------------------------
4710 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4711 | cannot be compared, and 0 otherwise. The invalid exception is raised if
4712 | either operand is a NaN. The comparison is performed according to the
4713 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4714 *----------------------------------------------------------------------------*/
4715 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4716 {
4717 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4718 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4719 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4720 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4721 ) {
4722 float_raise( float_flag_invalid STATUS_VAR);
4723 return 1;
4724 }
4725 return 0;
4726 }
4727
4728 /*----------------------------------------------------------------------------
4729 | Returns 1 if the extended double-precision floating-point value `a' is
4730 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4731 | cause an exception. The comparison is performed according to the IEC/IEEE
4732 | Standard for Binary Floating-Point Arithmetic.
4733 *----------------------------------------------------------------------------*/
4734
4735 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4736 {
4737
4738 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4739 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4740 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4741 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4742 ) {
4743 if ( floatx80_is_signaling_nan( a )
4744 || floatx80_is_signaling_nan( b ) ) {
4745 float_raise( float_flag_invalid STATUS_VAR);
4746 }
4747 return 0;
4748 }
4749 return
4750 ( a.low == b.low )
4751 && ( ( a.high == b.high )
4752 || ( ( a.low == 0 )
4753 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4754 );
4755
4756 }
4757
4758 /*----------------------------------------------------------------------------
4759 | Returns 1 if the extended double-precision floating-point value `a' is less
4760 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4761 | do not cause an exception. Otherwise, the comparison is performed according
4762 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4763 *----------------------------------------------------------------------------*/
4764
4765 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4766 {
4767 flag aSign, bSign;
4768
4769 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4770 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4771 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4772 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4773 ) {
4774 if ( floatx80_is_signaling_nan( a )
4775 || floatx80_is_signaling_nan( b ) ) {
4776 float_raise( float_flag_invalid STATUS_VAR);
4777 }
4778 return 0;
4779 }
4780 aSign = extractFloatx80Sign( a );
4781 bSign = extractFloatx80Sign( b );
4782 if ( aSign != bSign ) {
4783 return
4784 aSign
4785 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4786 == 0 );
4787 }
4788 return
4789 aSign ? le128( b.high, b.low, a.high, a.low )
4790 : le128( a.high, a.low, b.high, b.low );
4791
4792 }
4793
4794 /*----------------------------------------------------------------------------
4795 | Returns 1 if the extended double-precision floating-point value `a' is less
4796 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4797 | an exception. Otherwise, the comparison is performed according to the
4798 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4799 *----------------------------------------------------------------------------*/
4800
4801 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4802 {
4803 flag aSign, bSign;
4804
4805 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4806 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4807 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4808 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4809 ) {
4810 if ( floatx80_is_signaling_nan( a )
4811 || floatx80_is_signaling_nan( b ) ) {
4812 float_raise( float_flag_invalid STATUS_VAR);
4813 }
4814 return 0;
4815 }
4816 aSign = extractFloatx80Sign( a );
4817 bSign = extractFloatx80Sign( b );
4818 if ( aSign != bSign ) {
4819 return
4820 aSign
4821 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4822 != 0 );
4823 }
4824 return
4825 aSign ? lt128( b.high, b.low, a.high, a.low )
4826 : lt128( a.high, a.low, b.high, b.low );
4827
4828 }
4829
4830 /*----------------------------------------------------------------------------
4831 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4832 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
4833 | The comparison is performed according to the IEC/IEEE Standard for Binary
4834 | Floating-Point Arithmetic.
4835 *----------------------------------------------------------------------------*/
4836 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4837 {
4838 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4839 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4840 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4841 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4842 ) {
4843 if ( floatx80_is_signaling_nan( a )
4844 || floatx80_is_signaling_nan( b ) ) {
4845 float_raise( float_flag_invalid STATUS_VAR);
4846 }
4847 return 1;
4848 }
4849 return 0;
4850 }
4851
4852 #endif
4853
4854 #ifdef FLOAT128
4855
4856 /*----------------------------------------------------------------------------
4857 | Returns the result of converting the quadruple-precision floating-point
4858 | value `a' to the 32-bit two's complement integer format. The conversion
4859 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4860 | Arithmetic---which means in particular that the conversion is rounded
4861 | according to the current rounding mode. If `a' is a NaN, the largest
4862 | positive integer is returned. Otherwise, if the conversion overflows, the
4863 | largest integer with the same sign as `a' is returned.
4864 *----------------------------------------------------------------------------*/
4865
4866 int32 float128_to_int32( float128 a STATUS_PARAM )
4867 {
4868 flag aSign;
4869 int32 aExp, shiftCount;
4870 uint64_t aSig0, aSig1;
4871
4872 aSig1 = extractFloat128Frac1( a );
4873 aSig0 = extractFloat128Frac0( a );
4874 aExp = extractFloat128Exp( a );
4875 aSign = extractFloat128Sign( a );
4876 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4877 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4878 aSig0 |= ( aSig1 != 0 );
4879 shiftCount = 0x4028 - aExp;
4880 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4881 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4882
4883 }
4884
4885 /*----------------------------------------------------------------------------
4886 | Returns the result of converting the quadruple-precision floating-point
4887 | value `a' to the 32-bit two's complement integer format. The conversion
4888 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4889 | Arithmetic, except that the conversion is always rounded toward zero. If
4890 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4891 | conversion overflows, the largest integer with the same sign as `a' is
4892 | returned.
4893 *----------------------------------------------------------------------------*/
4894
4895 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4896 {
4897 flag aSign;
4898 int32 aExp, shiftCount;
4899 uint64_t aSig0, aSig1, savedASig;
4900 int32 z;
4901
4902 aSig1 = extractFloat128Frac1( a );
4903 aSig0 = extractFloat128Frac0( a );
4904 aExp = extractFloat128Exp( a );
4905 aSign = extractFloat128Sign( a );
4906 aSig0 |= ( aSig1 != 0 );
4907 if ( 0x401E < aExp ) {
4908 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4909 goto invalid;
4910 }
4911 else if ( aExp < 0x3FFF ) {
4912 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4913 return 0;
4914 }
4915 aSig0 |= LIT64( 0x0001000000000000 );
4916 shiftCount = 0x402F - aExp;
4917 savedASig = aSig0;
4918 aSig0 >>= shiftCount;
4919 z = aSig0;
4920 if ( aSign ) z = - z;
4921 if ( ( z < 0 ) ^ aSign ) {
4922 invalid:
4923 float_raise( float_flag_invalid STATUS_VAR);
4924 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4925 }
4926 if ( ( aSig0<<shiftCount ) != savedASig ) {
4927 STATUS(float_exception_flags) |= float_flag_inexact;
4928 }
4929 return z;
4930
4931 }
4932
4933 /*----------------------------------------------------------------------------
4934 | Returns the result of converting the quadruple-precision floating-point
4935 | value `a' to the 64-bit two's complement integer format. The conversion
4936 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4937 | Arithmetic---which means in particular that the conversion is rounded
4938 | according to the current rounding mode. If `a' is a NaN, the largest
4939 | positive integer is returned. Otherwise, if the conversion overflows, the
4940 | largest integer with the same sign as `a' is returned.
4941 *----------------------------------------------------------------------------*/
4942
4943 int64 float128_to_int64( float128 a STATUS_PARAM )
4944 {
4945 flag aSign;
4946 int32 aExp, shiftCount;
4947 uint64_t aSig0, aSig1;
4948
4949 aSig1 = extractFloat128Frac1( a );
4950 aSig0 = extractFloat128Frac0( a );
4951 aExp = extractFloat128Exp( a );
4952 aSign = extractFloat128Sign( a );
4953 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4954 shiftCount = 0x402F - aExp;
4955 if ( shiftCount <= 0 ) {
4956 if ( 0x403E < aExp ) {
4957 float_raise( float_flag_invalid STATUS_VAR);
4958 if ( ! aSign
4959 || ( ( aExp == 0x7FFF )
4960 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4961 )
4962 ) {
4963 return LIT64( 0x7FFFFFFFFFFFFFFF );
4964 }
4965 return (int64_t) LIT64( 0x8000000000000000 );
4966 }
4967 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4968 }
4969 else {
4970 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4971 }
4972 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4973
4974 }
4975
4976 /*----------------------------------------------------------------------------
4977 | Returns the result of converting the quadruple-precision floating-point
4978 | value `a' to the 64-bit two's complement integer format. The conversion
4979 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4980 | Arithmetic, except that the conversion is always rounded toward zero.
4981 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4982 | the conversion overflows, the largest integer with the same sign as `a' is
4983 | returned.
4984 *----------------------------------------------------------------------------*/
4985
4986 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4987 {
4988 flag aSign;
4989 int32 aExp, shiftCount;
4990 uint64_t aSig0, aSig1;
4991 int64 z;
4992
4993 aSig1 = extractFloat128Frac1( a );
4994 aSig0 = extractFloat128Frac0( a );
4995 aExp = extractFloat128Exp( a );
4996 aSign = extractFloat128Sign( a );
4997 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4998 shiftCount = aExp - 0x402F;
4999 if ( 0 < shiftCount ) {
5000 if ( 0x403E <= aExp ) {
5001 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5002 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5003 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5004 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5005 }
5006 else {
5007 float_raise( float_flag_invalid STATUS_VAR);
5008 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5009 return LIT64( 0x7FFFFFFFFFFFFFFF );
5010 }
5011 }
5012 return (int64_t) LIT64( 0x8000000000000000 );
5013 }
5014 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5015 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5016 STATUS(float_exception_flags) |= float_flag_inexact;
5017 }
5018 }
5019 else {
5020 if ( aExp < 0x3FFF ) {
5021 if ( aExp | aSig0 | aSig1 ) {
5022 STATUS(float_exception_flags) |= float_flag_inexact;
5023 }
5024 return 0;
5025 }
5026 z = aSig0>>( - shiftCount );
5027 if ( aSig1
5028 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5029 STATUS(float_exception_flags) |= float_flag_inexact;
5030 }
5031 }
5032 if ( aSign ) z = - z;
5033 return z;
5034
5035 }
5036
5037 /*----------------------------------------------------------------------------
5038 | Returns the result of converting the quadruple-precision floating-point
5039 | value `a' to the single-precision floating-point format. The conversion
5040 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5041 | Arithmetic.
5042 *----------------------------------------------------------------------------*/
5043
5044 float32 float128_to_float32( float128 a STATUS_PARAM )
5045 {
5046 flag aSign;
5047 int32 aExp;
5048 uint64_t aSig0, aSig1;
5049 uint32_t zSig;
5050
5051 aSig1 = extractFloat128Frac1( a );
5052 aSig0 = extractFloat128Frac0( a );
5053 aExp = extractFloat128Exp( a );
5054 aSign = extractFloat128Sign( a );
5055 if ( aExp == 0x7FFF ) {
5056 if ( aSig0 | aSig1 ) {
5057 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5058 }
5059 return packFloat32( aSign, 0xFF, 0 );
5060 }
5061 aSig0 |= ( aSig1 != 0 );
5062 shift64RightJamming( aSig0, 18, &aSig0 );
5063 zSig = aSig0;
5064 if ( aExp || zSig ) {
5065 zSig |= 0x40000000;
5066 aExp -= 0x3F81;
5067 }
5068 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5069
5070 }
5071
5072 /*----------------------------------------------------------------------------
5073 | Returns the result of converting the quadruple-precision floating-point
5074 | value `a' to the double-precision floating-point format. The conversion
5075 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5076 | Arithmetic.
5077 *----------------------------------------------------------------------------*/
5078
5079 float64 float128_to_float64( float128 a STATUS_PARAM )
5080 {
5081 flag aSign;
5082 int32 aExp;
5083 uint64_t aSig0, aSig1;
5084
5085 aSig1 = extractFloat128Frac1( a );
5086 aSig0 = extractFloat128Frac0( a );
5087 aExp = extractFloat128Exp( a );
5088 aSign = extractFloat128Sign( a );
5089 if ( aExp == 0x7FFF ) {
5090 if ( aSig0 | aSig1 ) {
5091 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5092 }
5093 return packFloat64( aSign, 0x7FF, 0 );
5094 }
5095 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5096 aSig0 |= ( aSig1 != 0 );
5097 if ( aExp || aSig0 ) {
5098 aSig0 |= LIT64( 0x4000000000000000 );
5099 aExp -= 0x3C01;
5100 }
5101 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5102
5103 }
5104
5105 #ifdef FLOATX80
5106
5107 /*----------------------------------------------------------------------------
5108 | Returns the result of converting the quadruple-precision floating-point
5109 | value `a' to the extended double-precision floating-point format. The
5110 | conversion is performed according to the IEC/IEEE Standard for Binary
5111 | Floating-Point Arithmetic.
5112 *----------------------------------------------------------------------------*/
5113
5114 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5115 {
5116 flag aSign;
5117 int32 aExp;
5118 uint64_t aSig0, aSig1;
5119
5120 aSig1 = extractFloat128Frac1( a );
5121 aSig0 = extractFloat128Frac0( a );
5122 aExp = extractFloat128Exp( a );
5123 aSign = extractFloat128Sign( a );
5124 if ( aExp == 0x7FFF ) {
5125 if ( aSig0 | aSig1 ) {
5126 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5127 }
5128 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5129 }
5130 if ( aExp == 0 ) {
5131 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5132 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5133 }
5134 else {
5135 aSig0 |= LIT64( 0x0001000000000000 );
5136 }
5137 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5138 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5139
5140 }
5141
5142 #endif
5143
5144 /*----------------------------------------------------------------------------
5145 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5146 | returns the result as a quadruple-precision floating-point value. The
5147 | operation is performed according to the IEC/IEEE Standard for Binary
5148 | Floating-Point Arithmetic.
5149 *----------------------------------------------------------------------------*/
5150
5151 float128 float128_round_to_int( float128 a STATUS_PARAM )
5152 {
5153 flag aSign;
5154 int32 aExp;
5155 uint64_t lastBitMask, roundBitsMask;
5156 int8 roundingMode;
5157 float128 z;
5158
5159 aExp = extractFloat128Exp( a );
5160 if ( 0x402F <= aExp ) {
5161 if ( 0x406F <= aExp ) {
5162 if ( ( aExp == 0x7FFF )
5163 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5164 ) {
5165 return propagateFloat128NaN( a, a STATUS_VAR );
5166 }
5167 return a;
5168 }
5169 lastBitMask = 1;
5170 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5171 roundBitsMask = lastBitMask - 1;
5172 z = a;
5173 roundingMode = STATUS(float_rounding_mode);
5174 if ( roundingMode == float_round_nearest_even ) {
5175 if ( lastBitMask ) {
5176 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5177 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5178 }
5179 else {
5180 if ( (int64_t) z.low < 0 ) {
5181 ++z.high;
5182 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5183 }
5184 }
5185 }
5186 else if ( roundingMode != float_round_to_zero ) {
5187 if ( extractFloat128Sign( z )
5188 ^ ( roundingMode == float_round_up ) ) {
5189 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5190 }
5191 }
5192 z.low &= ~ roundBitsMask;
5193 }
5194 else {
5195 if ( aExp < 0x3FFF ) {
5196 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5197 STATUS(float_exception_flags) |= float_flag_inexact;
5198 aSign = extractFloat128Sign( a );
5199 switch ( STATUS(float_rounding_mode) ) {
5200 case float_round_nearest_even:
5201 if ( ( aExp == 0x3FFE )
5202 && ( extractFloat128Frac0( a )
5203 | extractFloat128Frac1( a ) )
5204 ) {
5205 return packFloat128( aSign, 0x3FFF, 0, 0 );
5206 }
5207 break;
5208 case float_round_down:
5209 return
5210 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5211 : packFloat128( 0, 0, 0, 0 );
5212 case float_round_up:
5213 return
5214 aSign ? packFloat128( 1, 0, 0, 0 )
5215 : packFloat128( 0, 0x3FFF, 0, 0 );
5216 }
5217 return packFloat128( aSign, 0, 0, 0 );
5218 }
5219 lastBitMask = 1;
5220 lastBitMask <<= 0x402F - aExp;
5221 roundBitsMask = lastBitMask - 1;
5222 z.low = 0;
5223 z.high = a.high;
5224 roundingMode = STATUS(float_rounding_mode);
5225 if ( roundingMode == float_round_nearest_even ) {
5226 z.high += lastBitMask>>1;
5227 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5228 z.high &= ~ lastBitMask;
5229 }
5230 }
5231 else if ( roundingMode != float_round_to_zero ) {
5232 if ( extractFloat128Sign( z )
5233 ^ ( roundingMode == float_round_up ) ) {
5234 z.high |= ( a.low != 0 );
5235 z.high += roundBitsMask;
5236 }
5237 }
5238 z.high &= ~ roundBitsMask;
5239 }
5240 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5241 STATUS(float_exception_flags) |= float_flag_inexact;
5242 }
5243 return z;
5244
5245 }
5246
5247 /*----------------------------------------------------------------------------
5248 | Returns the result of adding the absolute values of the quadruple-precision
5249 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5250 | before being returned. `zSign' is ignored if the result is a NaN.
5251 | The addition is performed according to the IEC/IEEE Standard for Binary
5252 | Floating-Point Arithmetic.
5253 *----------------------------------------------------------------------------*/
5254
5255 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5256 {
5257 int32 aExp, bExp, zExp;
5258 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5259 int32 expDiff;
5260
5261 aSig1 = extractFloat128Frac1( a );
5262 aSig0 = extractFloat128Frac0( a );
5263 aExp = extractFloat128Exp( a );
5264 bSig1 = extractFloat128Frac1( b );
5265 bSig0 = extractFloat128Frac0( b );
5266 bExp = extractFloat128Exp( b );
5267 expDiff = aExp - bExp;
5268 if ( 0 < expDiff ) {
5269 if ( aExp == 0x7FFF ) {
5270 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5271 return a;
5272 }
5273 if ( bExp == 0 ) {
5274 --expDiff;
5275 }
5276 else {
5277 bSig0 |= LIT64( 0x0001000000000000 );
5278 }
5279 shift128ExtraRightJamming(
5280 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5281 zExp = aExp;
5282 }
5283 else if ( expDiff < 0 ) {
5284 if ( bExp == 0x7FFF ) {
5285 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5286 return packFloat128( zSign, 0x7FFF, 0, 0 );
5287 }
5288 if ( aExp == 0 ) {
5289 ++expDiff;
5290 }
5291 else {
5292 aSig0 |= LIT64( 0x0001000000000000 );
5293 }
5294 shift128ExtraRightJamming(
5295 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5296 zExp = bExp;
5297 }
5298 else {
5299 if ( aExp == 0x7FFF ) {
5300 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5301 return propagateFloat128NaN( a, b STATUS_VAR );
5302 }
5303 return a;
5304 }
5305 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5306 if ( aExp == 0 ) {
5307 if (STATUS(flush_to_zero)) {
5308 if (zSig0 | zSig1) {
5309 float_raise(float_flag_output_denormal STATUS_VAR);
5310 }
5311 return packFloat128(zSign, 0, 0, 0);
5312 }
5313 return packFloat128( zSign, 0, zSig0, zSig1 );
5314 }
5315 zSig2 = 0;
5316 zSig0 |= LIT64( 0x0002000000000000 );
5317 zExp = aExp;
5318 goto shiftRight1;
5319 }
5320 aSig0 |= LIT64( 0x0001000000000000 );
5321 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5322 --zExp;
5323 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5324 ++zExp;
5325 shiftRight1:
5326 shift128ExtraRightJamming(
5327 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5328 roundAndPack:
5329 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5330
5331 }
5332
5333 /*----------------------------------------------------------------------------
5334 | Returns the result of subtracting the absolute values of the quadruple-
5335 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5336 | difference is negated before being returned. `zSign' is ignored if the
5337 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5338 | Standard for Binary Floating-Point Arithmetic.
5339 *----------------------------------------------------------------------------*/
5340
5341 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5342 {
5343 int32 aExp, bExp, zExp;
5344 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5345 int32 expDiff;
5346 float128 z;
5347
5348 aSig1 = extractFloat128Frac1( a );
5349 aSig0 = extractFloat128Frac0( a );
5350 aExp = extractFloat128Exp( a );
5351 bSig1 = extractFloat128Frac1( b );
5352 bSig0 = extractFloat128Frac0( b );
5353 bExp = extractFloat128Exp( b );
5354 expDiff = aExp - bExp;
5355 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5356 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5357 if ( 0 < expDiff ) goto aExpBigger;
5358 if ( expDiff < 0 ) goto bExpBigger;
5359 if ( aExp == 0x7FFF ) {
5360 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5361 return propagateFloat128NaN( a, b STATUS_VAR );
5362 }
5363 float_raise( float_flag_invalid STATUS_VAR);
5364 z.low = float128_default_nan_low;
5365 z.high = float128_default_nan_high;
5366 return z;
5367 }
5368 if ( aExp == 0 ) {
5369 aExp = 1;
5370 bExp = 1;
5371 }
5372 if ( bSig0 < aSig0 ) goto aBigger;
5373 if ( aSig0 < bSig0 ) goto bBigger;
5374 if ( bSig1 < aSig1 ) goto aBigger;
5375 if ( aSig1 < bSig1 ) goto bBigger;
5376 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5377 bExpBigger:
5378 if ( bExp == 0x7FFF ) {
5379 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5380 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5381 }
5382 if ( aExp == 0 ) {
5383 ++expDiff;
5384 }
5385 else {
5386 aSig0 |= LIT64( 0x4000000000000000 );
5387 }
5388 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5389 bSig0 |= LIT64( 0x4000000000000000 );
5390 bBigger:
5391 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5392 zExp = bExp;
5393 zSign ^= 1;
5394 goto normalizeRoundAndPack;
5395 aExpBigger:
5396 if ( aExp == 0x7FFF ) {
5397 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5398 return a;
5399 }
5400 if ( bExp == 0 ) {
5401 --expDiff;
5402 }
5403 else {
5404 bSig0 |= LIT64( 0x4000000000000000 );
5405 }
5406 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5407 aSig0 |= LIT64( 0x4000000000000000 );
5408 aBigger:
5409 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5410 zExp = aExp;
5411 normalizeRoundAndPack:
5412 --zExp;
5413 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5414
5415 }
5416
5417 /*----------------------------------------------------------------------------
5418 | Returns the result of adding the quadruple-precision floating-point values
5419 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5420 | for Binary Floating-Point Arithmetic.
5421 *----------------------------------------------------------------------------*/
5422
5423 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5424 {
5425 flag aSign, bSign;
5426
5427 aSign = extractFloat128Sign( a );
5428 bSign = extractFloat128Sign( b );
5429 if ( aSign == bSign ) {
5430 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5431 }
5432 else {
5433 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5434 }
5435
5436 }
5437
5438 /*----------------------------------------------------------------------------
5439 | Returns the result of subtracting the quadruple-precision floating-point
5440 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5441 | Standard for Binary Floating-Point Arithmetic.
5442 *----------------------------------------------------------------------------*/
5443
5444 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5445 {
5446 flag aSign, bSign;
5447
5448 aSign = extractFloat128Sign( a );
5449 bSign = extractFloat128Sign( b );
5450 if ( aSign == bSign ) {
5451 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5452 }
5453 else {
5454 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5455 }
5456
5457 }
5458
5459 /*----------------------------------------------------------------------------
5460 | Returns the result of multiplying the quadruple-precision floating-point
5461 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5462 | Standard for Binary Floating-Point Arithmetic.
5463 *----------------------------------------------------------------------------*/
5464
5465 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5466 {
5467 flag aSign, bSign, zSign;
5468 int32 aExp, bExp, zExp;
5469 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5470 float128 z;
5471
5472 aSig1 = extractFloat128Frac1( a );
5473 aSig0 = extractFloat128Frac0( a );
5474 aExp = extractFloat128Exp( a );
5475 aSign = extractFloat128Sign( a );
5476 bSig1 = extractFloat128Frac1( b );
5477 bSig0 = extractFloat128Frac0( b );
5478 bExp = extractFloat128Exp( b );
5479 bSign = extractFloat128Sign( b );
5480 zSign = aSign ^ bSign;
5481 if ( aExp == 0x7FFF ) {
5482 if ( ( aSig0 | aSig1 )
5483 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5484 return propagateFloat128NaN( a, b STATUS_VAR );
5485 }
5486 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5487 return packFloat128( zSign, 0x7FFF, 0, 0 );
5488 }
5489 if ( bExp == 0x7FFF ) {
5490 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5491 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5492 invalid:
5493 float_raise( float_flag_invalid STATUS_VAR);
5494 z.low = float128_default_nan_low;
5495 z.high = float128_default_nan_high;
5496 return z;
5497 }
5498 return packFloat128( zSign, 0x7FFF, 0, 0 );
5499 }
5500 if ( aExp == 0 ) {
5501 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5502 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5503 }
5504 if ( bExp == 0 ) {
5505 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5506 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5507 }
5508 zExp = aExp + bExp - 0x4000;
5509 aSig0 |= LIT64( 0x0001000000000000 );
5510 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5511 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5512 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5513 zSig2 |= ( zSig3 != 0 );
5514 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5515 shift128ExtraRightJamming(
5516 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5517 ++zExp;
5518 }
5519 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5520
5521 }
5522
5523 /*----------------------------------------------------------------------------
5524 | Returns the result of dividing the quadruple-precision floating-point value
5525 | `a' by the corresponding value `b'. The operation is performed according to
5526 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5527 *----------------------------------------------------------------------------*/
5528
5529 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5530 {
5531 flag aSign, bSign, zSign;
5532 int32 aExp, bExp, zExp;
5533 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5534 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5535 float128 z;
5536
5537 aSig1 = extractFloat128Frac1( a );
5538 aSig0 = extractFloat128Frac0( a );
5539 aExp = extractFloat128Exp( a );
5540 aSign = extractFloat128Sign( a );
5541 bSig1 = extractFloat128Frac1( b );
5542 bSig0 = extractFloat128Frac0( b );
5543 bExp = extractFloat128Exp( b );
5544 bSign = extractFloat128Sign( b );
5545 zSign = aSign ^ bSign;
5546 if ( aExp == 0x7FFF ) {
5547 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5548 if ( bExp == 0x7FFF ) {
5549 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5550 goto invalid;
5551 }
5552 return packFloat128( zSign, 0x7FFF, 0, 0 );
5553 }
5554 if ( bExp == 0x7FFF ) {
5555 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5556 return packFloat128( zSign, 0, 0, 0 );
5557 }
5558 if ( bExp == 0 ) {
5559 if ( ( bSig0 | bSig1 ) == 0 ) {
5560 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5561 invalid:
5562 float_raise( float_flag_invalid STATUS_VAR);
5563 z.low = float128_default_nan_low;
5564 z.high = float128_default_nan_high;
5565 return z;
5566 }
5567 float_raise( float_flag_divbyzero STATUS_VAR);
5568 return packFloat128( zSign, 0x7FFF, 0, 0 );
5569 }
5570 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5571 }
5572 if ( aExp == 0 ) {
5573 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5574 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5575 }
5576 zExp = aExp - bExp + 0x3FFD;
5577 shortShift128Left(
5578 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5579 shortShift128Left(
5580 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5581 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5582 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5583 ++zExp;
5584 }
5585 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5586 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5587 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5588 while ( (int64_t) rem0 < 0 ) {
5589 --zSig0;
5590 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5591 }
5592 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5593 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5594 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5595 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5596 while ( (int64_t) rem1 < 0 ) {
5597 --zSig1;
5598 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5599 }
5600 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5601 }
5602 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5603 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5604
5605 }
5606
5607 /*----------------------------------------------------------------------------
5608 | Returns the remainder of the quadruple-precision floating-point value `a'
5609 | with respect to the corresponding value `b'. The operation is performed
5610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5611 *----------------------------------------------------------------------------*/
5612
5613 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5614 {
5615 flag aSign, zSign;
5616 int32 aExp, bExp, expDiff;
5617 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5618 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5619 int64_t sigMean0;
5620 float128 z;
5621
5622 aSig1 = extractFloat128Frac1( a );
5623 aSig0 = extractFloat128Frac0( a );
5624 aExp = extractFloat128Exp( a );
5625 aSign = extractFloat128Sign( a );
5626 bSig1 = extractFloat128Frac1( b );
5627 bSig0 = extractFloat128Frac0( b );
5628 bExp = extractFloat128Exp( b );
5629 if ( aExp == 0x7FFF ) {
5630 if ( ( aSig0 | aSig1 )
5631 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5632 return propagateFloat128NaN( a, b STATUS_VAR );
5633 }
5634 goto invalid;
5635 }
5636 if ( bExp == 0x7FFF ) {
5637 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5638 return a;
5639 }
5640 if ( bExp == 0 ) {
5641 if ( ( bSig0 | bSig1 ) == 0 ) {
5642 invalid:
5643 float_raise( float_flag_invalid STATUS_VAR);
5644 z.low = float128_default_nan_low;
5645 z.high = float128_default_nan_high;
5646 return z;
5647 }
5648 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5649 }
5650 if ( aExp == 0 ) {
5651 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5652 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5653 }
5654 expDiff = aExp - bExp;
5655 if ( expDiff < -1 ) return a;
5656 shortShift128Left(
5657 aSig0 | LIT64( 0x0001000000000000 ),
5658 aSig1,
5659 15 - ( expDiff < 0 ),
5660 &aSig0,
5661 &aSig1
5662 );
5663 shortShift128Left(
5664 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5665 q = le128( bSig0, bSig1, aSig0, aSig1 );
5666 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5667 expDiff -= 64;
5668 while ( 0 < expDiff ) {
5669 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5670 q = ( 4 < q ) ? q - 4 : 0;
5671 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5672 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5673 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5674 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5675 expDiff -= 61;
5676 }
5677 if ( -64 < expDiff ) {
5678 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5679 q = ( 4 < q ) ? q - 4 : 0;
5680 q >>= - expDiff;
5681 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5682 expDiff += 52;
5683 if ( expDiff < 0 ) {
5684 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5685 }
5686 else {
5687 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5688 }
5689 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5690 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5691 }
5692 else {
5693 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5694 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5695 }
5696 do {
5697 alternateASig0 = aSig0;
5698 alternateASig1 = aSig1;
5699 ++q;
5700 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5701 } while ( 0 <= (int64_t) aSig0 );
5702 add128(
5703 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5704 if ( ( sigMean0 < 0 )
5705 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5706 aSig0 = alternateASig0;
5707 aSig1 = alternateASig1;
5708 }
5709 zSign = ( (int64_t) aSig0 < 0 );
5710 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5711 return
5712 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5713
5714 }
5715
5716 /*----------------------------------------------------------------------------
5717 | Returns the square root of the quadruple-precision floating-point value `a'.
5718 | The operation is performed according to the IEC/IEEE Standard for Binary
5719 | Floating-Point Arithmetic.
5720 *----------------------------------------------------------------------------*/
5721
5722 float128 float128_sqrt( float128 a STATUS_PARAM )
5723 {
5724 flag aSign;
5725 int32 aExp, zExp;
5726 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5727 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5728 float128 z;
5729
5730 aSig1 = extractFloat128Frac1( a );
5731 aSig0 = extractFloat128Frac0( a );
5732 aExp = extractFloat128Exp( a );
5733 aSign = extractFloat128Sign( a );
5734 if ( aExp == 0x7FFF ) {
5735 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5736 if ( ! aSign ) return a;
5737 goto invalid;
5738 }
5739 if ( aSign ) {
5740 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5741 invalid:
5742 float_raise( float_flag_invalid STATUS_VAR);
5743 z.low = float128_default_nan_low;
5744 z.high = float128_default_nan_high;
5745 return z;
5746 }
5747 if ( aExp == 0 ) {
5748 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5749 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5750 }
5751 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5752 aSig0 |= LIT64( 0x0001000000000000 );
5753 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5754 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5755 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5756 doubleZSig0 = zSig0<<1;
5757 mul64To128( zSig0, zSig0, &term0, &term1 );
5758 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5759 while ( (int64_t) rem0 < 0 ) {
5760 --zSig0;
5761 doubleZSig0 -= 2;
5762 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5763 }
5764 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5765 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5766 if ( zSig1 == 0 ) zSig1 = 1;
5767 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5768 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5769 mul64To128( zSig1, zSig1, &term2, &term3 );
5770 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5771 while ( (int64_t) rem1 < 0 ) {
5772 --zSig1;
5773 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5774 term3 |= 1;
5775 term2 |= doubleZSig0;
5776 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5777 }
5778 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5779 }
5780 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5781 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5782
5783 }
5784
5785 /*----------------------------------------------------------------------------
5786 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5787 | the corresponding value `b', and 0 otherwise. The invalid exception is
5788 | raised if either operand is a NaN. Otherwise, the comparison is performed
5789 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5790 *----------------------------------------------------------------------------*/
5791
5792 int float128_eq( float128 a, float128 b STATUS_PARAM )
5793 {
5794
5795 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5796 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5797 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5798 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5799 ) {
5800 float_raise( float_flag_invalid STATUS_VAR);
5801 return 0;
5802 }
5803 return
5804 ( a.low == b.low )
5805 && ( ( a.high == b.high )
5806 || ( ( a.low == 0 )
5807 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5808 );
5809
5810 }
5811
5812 /*----------------------------------------------------------------------------
5813 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5814 | or equal to the corresponding value `b', and 0 otherwise. The invalid
5815 | exception is raised if either operand is a NaN. The comparison is performed
5816 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5817 *----------------------------------------------------------------------------*/
5818
5819 int float128_le( float128 a, float128 b STATUS_PARAM )
5820 {
5821 flag aSign, bSign;
5822
5823 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5824 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5825 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5826 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5827 ) {
5828 float_raise( float_flag_invalid STATUS_VAR);
5829 return 0;
5830 }
5831 aSign = extractFloat128Sign( a );
5832 bSign = extractFloat128Sign( b );
5833 if ( aSign != bSign ) {
5834 return
5835 aSign
5836 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5837 == 0 );
5838 }
5839 return
5840 aSign ? le128( b.high, b.low, a.high, a.low )
5841 : le128( a.high, a.low, b.high, b.low );
5842
5843 }
5844
5845 /*----------------------------------------------------------------------------
5846 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5847 | the corresponding value `b', and 0 otherwise. The invalid exception is
5848 | raised if either operand is a NaN. The comparison is performed according
5849 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5850 *----------------------------------------------------------------------------*/
5851
5852 int float128_lt( float128 a, float128 b STATUS_PARAM )
5853 {
5854 flag aSign, bSign;
5855
5856 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5857 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5858 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5859 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5860 ) {
5861 float_raise( float_flag_invalid STATUS_VAR);
5862 return 0;
5863 }
5864 aSign = extractFloat128Sign( a );
5865 bSign = extractFloat128Sign( b );
5866 if ( aSign != bSign ) {
5867 return
5868 aSign
5869 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5870 != 0 );
5871 }
5872 return
5873 aSign ? lt128( b.high, b.low, a.high, a.low )
5874 : lt128( a.high, a.low, b.high, b.low );
5875
5876 }
5877
5878 /*----------------------------------------------------------------------------
5879 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5880 | be compared, and 0 otherwise. The invalid exception is raised if either
5881 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5882 | Standard for Binary Floating-Point Arithmetic.
5883 *----------------------------------------------------------------------------*/
5884
5885 int float128_unordered( float128 a, float128 b STATUS_PARAM )
5886 {
5887 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5888 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5889 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5890 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5891 ) {
5892 float_raise( float_flag_invalid STATUS_VAR);
5893 return 1;
5894 }
5895 return 0;
5896 }
5897
5898 /*----------------------------------------------------------------------------
5899 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5900 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5901 | exception. The comparison is performed according to the IEC/IEEE Standard
5902 | for Binary Floating-Point Arithmetic.
5903 *----------------------------------------------------------------------------*/
5904
5905 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5906 {
5907
5908 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5909 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5910 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5911 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5912 ) {
5913 if ( float128_is_signaling_nan( a )
5914 || float128_is_signaling_nan( b ) ) {
5915 float_raise( float_flag_invalid STATUS_VAR);
5916 }
5917 return 0;
5918 }
5919 return
5920 ( a.low == b.low )
5921 && ( ( a.high == b.high )
5922 || ( ( a.low == 0 )
5923 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5924 );
5925
5926 }
5927
5928 /*----------------------------------------------------------------------------
5929 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5930 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5931 | cause an exception. Otherwise, the comparison is performed according to the
5932 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5933 *----------------------------------------------------------------------------*/
5934
5935 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5936 {
5937 flag aSign, bSign;
5938
5939 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5940 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5941 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5942 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5943 ) {
5944 if ( float128_is_signaling_nan( a )
5945 || float128_is_signaling_nan( b ) ) {
5946 float_raise( float_flag_invalid STATUS_VAR);
5947 }
5948 return 0;
5949 }
5950 aSign = extractFloat128Sign( a );
5951 bSign = extractFloat128Sign( b );
5952 if ( aSign != bSign ) {
5953 return
5954 aSign
5955 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5956 == 0 );
5957 }
5958 return
5959 aSign ? le128( b.high, b.low, a.high, a.low )
5960 : le128( a.high, a.low, b.high, b.low );
5961
5962 }
5963
5964 /*----------------------------------------------------------------------------
5965 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5966 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5967 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5968 | Standard for Binary Floating-Point Arithmetic.
5969 *----------------------------------------------------------------------------*/
5970
5971 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5972 {
5973 flag aSign, bSign;
5974
5975 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5976 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5977 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5978 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5979 ) {
5980 if ( float128_is_signaling_nan( a )
5981 || float128_is_signaling_nan( b ) ) {
5982 float_raise( float_flag_invalid STATUS_VAR);
5983 }
5984 return 0;
5985 }
5986 aSign = extractFloat128Sign( a );
5987 bSign = extractFloat128Sign( b );
5988 if ( aSign != bSign ) {
5989 return
5990 aSign
5991 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5992 != 0 );
5993 }
5994 return
5995 aSign ? lt128( b.high, b.low, a.high, a.low )
5996 : lt128( a.high, a.low, b.high, b.low );
5997
5998 }
5999
6000 /*----------------------------------------------------------------------------
6001 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6002 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6003 | comparison is performed according to the IEC/IEEE Standard for Binary
6004 | Floating-Point Arithmetic.
6005 *----------------------------------------------------------------------------*/
6006
6007 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6008 {
6009 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6010 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6011 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6012 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6013 ) {
6014 if ( float128_is_signaling_nan( a )
6015 || float128_is_signaling_nan( b ) ) {
6016 float_raise( float_flag_invalid STATUS_VAR);
6017 }
6018 return 1;
6019 }
6020 return 0;
6021 }
6022
6023 #endif
6024
6025 /* misc functions */
6026 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
6027 {
6028 return int64_to_float32(a STATUS_VAR);
6029 }
6030
6031 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
6032 {
6033 return int64_to_float64(a STATUS_VAR);
6034 }
6035
6036 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
6037 {
6038 int64_t v;
6039 unsigned int res;
6040
6041 v = float32_to_int64(a STATUS_VAR);
6042 if (v < 0) {
6043 res = 0;
6044 float_raise( float_flag_invalid STATUS_VAR);
6045 } else if (v > 0xffffffff) {
6046 res = 0xffffffff;
6047 float_raise( float_flag_invalid STATUS_VAR);
6048 } else {
6049 res = v;
6050 }
6051 return res;
6052 }
6053
6054 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6055 {
6056 int64_t v;
6057 unsigned int res;
6058
6059 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6060 if (v < 0) {
6061 res = 0;
6062 float_raise( float_flag_invalid STATUS_VAR);
6063 } else if (v > 0xffffffff) {
6064 res = 0xffffffff;
6065 float_raise( float_flag_invalid STATUS_VAR);
6066 } else {
6067 res = v;
6068 }
6069 return res;
6070 }
6071
6072 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6073 {
6074 int64_t v;
6075 unsigned int res;
6076
6077 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6078 if (v < 0) {
6079 res = 0;
6080 float_raise( float_flag_invalid STATUS_VAR);
6081 } else if (v > 0xffff) {
6082 res = 0xffff;
6083 float_raise( float_flag_invalid STATUS_VAR);
6084 } else {
6085 res = v;
6086 }
6087 return res;
6088 }
6089
6090 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6091 {
6092 int64_t v;
6093 unsigned int res;
6094
6095 v = float64_to_int64(a STATUS_VAR);
6096 if (v < 0) {
6097 res = 0;
6098 float_raise( float_flag_invalid STATUS_VAR);
6099 } else if (v > 0xffffffff) {
6100 res = 0xffffffff;
6101 float_raise( float_flag_invalid STATUS_VAR);
6102 } else {
6103 res = v;
6104 }
6105 return res;
6106 }
6107
6108 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6109 {
6110 int64_t v;
6111 unsigned int res;
6112
6113 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6114 if (v < 0) {
6115 res = 0;
6116 float_raise( float_flag_invalid STATUS_VAR);
6117 } else if (v > 0xffffffff) {
6118 res = 0xffffffff;
6119 float_raise( float_flag_invalid STATUS_VAR);
6120 } else {
6121 res = v;
6122 }
6123 return res;
6124 }
6125
6126 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6127 {
6128 int64_t v;
6129 unsigned int res;
6130
6131 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6132 if (v < 0) {
6133 res = 0;
6134 float_raise( float_flag_invalid STATUS_VAR);
6135 } else if (v > 0xffff) {
6136 res = 0xffff;
6137 float_raise( float_flag_invalid STATUS_VAR);
6138 } else {
6139 res = v;
6140 }
6141 return res;
6142 }
6143
6144 /* FIXME: This looks broken. */
6145 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6146 {
6147 int64_t v;
6148
6149 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6150 v += float64_val(a);
6151 v = float64_to_int64(make_float64(v) STATUS_VAR);
6152
6153 return v - INT64_MIN;
6154 }
6155
6156 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6157 {
6158 int64_t v;
6159
6160 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6161 v += float64_val(a);
6162 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6163
6164 return v - INT64_MIN;
6165 }
6166
6167 #define COMPARE(s, nan_exp) \
6168 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6169 int is_quiet STATUS_PARAM ) \
6170 { \
6171 flag aSign, bSign; \
6172 uint ## s ## _t av, bv; \
6173 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6174 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6175 \
6176 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6177 extractFloat ## s ## Frac( a ) ) || \
6178 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6179 extractFloat ## s ## Frac( b ) )) { \
6180 if (!is_quiet || \
6181 float ## s ## _is_signaling_nan( a ) || \
6182 float ## s ## _is_signaling_nan( b ) ) { \
6183 float_raise( float_flag_invalid STATUS_VAR); \
6184 } \
6185 return float_relation_unordered; \
6186 } \
6187 aSign = extractFloat ## s ## Sign( a ); \
6188 bSign = extractFloat ## s ## Sign( b ); \
6189 av = float ## s ## _val(a); \
6190 bv = float ## s ## _val(b); \
6191 if ( aSign != bSign ) { \
6192 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6193 /* zero case */ \
6194 return float_relation_equal; \
6195 } else { \
6196 return 1 - (2 * aSign); \
6197 } \
6198 } else { \
6199 if (av == bv) { \
6200 return float_relation_equal; \
6201 } else { \
6202 return 1 - 2 * (aSign ^ ( av < bv )); \
6203 } \
6204 } \
6205 } \
6206 \
6207 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6208 { \
6209 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6210 } \
6211 \
6212 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6213 { \
6214 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6215 }
6216
6217 COMPARE(32, 0xff)
6218 COMPARE(64, 0x7ff)
6219
6220 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6221 int is_quiet STATUS_PARAM )
6222 {
6223 flag aSign, bSign;
6224
6225 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6226 ( extractFloatx80Frac( a )<<1 ) ) ||
6227 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6228 ( extractFloatx80Frac( b )<<1 ) )) {
6229 if (!is_quiet ||
6230 floatx80_is_signaling_nan( a ) ||
6231 floatx80_is_signaling_nan( b ) ) {
6232 float_raise( float_flag_invalid STATUS_VAR);
6233 }
6234 return float_relation_unordered;
6235 }
6236 aSign = extractFloatx80Sign( a );
6237 bSign = extractFloatx80Sign( b );
6238 if ( aSign != bSign ) {
6239
6240 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6241 ( ( a.low | b.low ) == 0 ) ) {
6242 /* zero case */
6243 return float_relation_equal;
6244 } else {
6245 return 1 - (2 * aSign);
6246 }
6247 } else {
6248 if (a.low == b.low && a.high == b.high) {
6249 return float_relation_equal;
6250 } else {
6251 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6252 }
6253 }
6254 }
6255
6256 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6257 {
6258 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6259 }
6260
6261 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6262 {
6263 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6264 }
6265
6266 INLINE int float128_compare_internal( float128 a, float128 b,
6267 int is_quiet STATUS_PARAM )
6268 {
6269 flag aSign, bSign;
6270
6271 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6272 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6273 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6274 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6275 if (!is_quiet ||
6276 float128_is_signaling_nan( a ) ||
6277 float128_is_signaling_nan( b ) ) {
6278 float_raise( float_flag_invalid STATUS_VAR);
6279 }
6280 return float_relation_unordered;
6281 }
6282 aSign = extractFloat128Sign( a );
6283 bSign = extractFloat128Sign( b );
6284 if ( aSign != bSign ) {
6285 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6286 /* zero case */
6287 return float_relation_equal;
6288 } else {
6289 return 1 - (2 * aSign);
6290 }
6291 } else {
6292 if (a.low == b.low && a.high == b.high) {
6293 return float_relation_equal;
6294 } else {
6295 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6296 }
6297 }
6298 }
6299
6300 int float128_compare( float128 a, float128 b STATUS_PARAM )
6301 {
6302 return float128_compare_internal(a, b, 0 STATUS_VAR);
6303 }
6304
6305 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6306 {
6307 return float128_compare_internal(a, b, 1 STATUS_VAR);
6308 }
6309
6310 /* min() and max() functions. These can't be implemented as
6311 * 'compare and pick one input' because that would mishandle
6312 * NaNs and +0 vs -0.
6313 */
6314 #define MINMAX(s, nan_exp) \
6315 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6316 int ismin STATUS_PARAM ) \
6317 { \
6318 flag aSign, bSign; \
6319 uint ## s ## _t av, bv; \
6320 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6321 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6322 if (float ## s ## _is_any_nan(a) || \
6323 float ## s ## _is_any_nan(b)) { \
6324 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6325 } \
6326 aSign = extractFloat ## s ## Sign(a); \
6327 bSign = extractFloat ## s ## Sign(b); \
6328 av = float ## s ## _val(a); \
6329 bv = float ## s ## _val(b); \
6330 if (aSign != bSign) { \
6331 if (ismin) { \
6332 return aSign ? a : b; \
6333 } else { \
6334 return aSign ? b : a; \
6335 } \
6336 } else { \
6337 if (ismin) { \
6338 return (aSign ^ (av < bv)) ? a : b; \
6339 } else { \
6340 return (aSign ^ (av < bv)) ? b : a; \
6341 } \
6342 } \
6343 } \
6344 \
6345 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6346 { \
6347 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6348 } \
6349 \
6350 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6351 { \
6352 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6353 }
6354
6355 MINMAX(32, 0xff)
6356 MINMAX(64, 0x7ff)
6357
6358
6359 /* Multiply A by 2 raised to the power N. */
6360 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6361 {
6362 flag aSign;
6363 int16_t aExp;
6364 uint32_t aSig;
6365
6366 a = float32_squash_input_denormal(a STATUS_VAR);
6367 aSig = extractFloat32Frac( a );
6368 aExp = extractFloat32Exp( a );
6369 aSign = extractFloat32Sign( a );
6370
6371 if ( aExp == 0xFF ) {
6372 if ( aSig ) {
6373 return propagateFloat32NaN( a, a STATUS_VAR );
6374 }
6375 return a;
6376 }
6377 if ( aExp != 0 )
6378 aSig |= 0x00800000;
6379 else if ( aSig == 0 )
6380 return a;
6381
6382 if (n > 0x200) {
6383 n = 0x200;
6384 } else if (n < -0x200) {
6385 n = -0x200;
6386 }
6387
6388 aExp += n - 1;
6389 aSig <<= 7;
6390 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6391 }
6392
6393 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6394 {
6395 flag aSign;
6396 int16_t aExp;
6397 uint64_t aSig;
6398
6399 a = float64_squash_input_denormal(a STATUS_VAR);
6400 aSig = extractFloat64Frac( a );
6401 aExp = extractFloat64Exp( a );
6402 aSign = extractFloat64Sign( a );
6403
6404 if ( aExp == 0x7FF ) {
6405 if ( aSig ) {
6406 return propagateFloat64NaN( a, a STATUS_VAR );
6407 }
6408 return a;
6409 }
6410 if ( aExp != 0 )
6411 aSig |= LIT64( 0x0010000000000000 );
6412 else if ( aSig == 0 )
6413 return a;
6414
6415 if (n > 0x1000) {
6416 n = 0x1000;
6417 } else if (n < -0x1000) {
6418 n = -0x1000;
6419 }
6420
6421 aExp += n - 1;
6422 aSig <<= 10;
6423 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6424 }
6425
6426 #ifdef FLOATX80
6427 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6428 {
6429 flag aSign;
6430 int32_t aExp;
6431 uint64_t aSig;
6432
6433 aSig = extractFloatx80Frac( a );
6434 aExp = extractFloatx80Exp( a );
6435 aSign = extractFloatx80Sign( a );
6436
6437 if ( aExp == 0x7FFF ) {
6438 if ( aSig<<1 ) {
6439 return propagateFloatx80NaN( a, a STATUS_VAR );
6440 }
6441 return a;
6442 }
6443
6444 if (aExp == 0 && aSig == 0)
6445 return a;
6446
6447 if (n > 0x10000) {
6448 n = 0x10000;
6449 } else if (n < -0x10000) {
6450 n = -0x10000;
6451 }
6452
6453 aExp += n;
6454 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6455 aSign, aExp, aSig, 0 STATUS_VAR );
6456 }
6457 #endif
6458
6459 #ifdef FLOAT128
6460 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6461 {
6462 flag aSign;
6463 int32_t aExp;
6464 uint64_t aSig0, aSig1;
6465
6466 aSig1 = extractFloat128Frac1( a );
6467 aSig0 = extractFloat128Frac0( a );
6468 aExp = extractFloat128Exp( a );
6469 aSign = extractFloat128Sign( a );
6470 if ( aExp == 0x7FFF ) {
6471 if ( aSig0 | aSig1 ) {
6472 return propagateFloat128NaN( a, a STATUS_VAR );
6473 }
6474 return a;
6475 }
6476 if ( aExp != 0 )
6477 aSig0 |= LIT64( 0x0001000000000000 );
6478 else if ( aSig0 == 0 && aSig1 == 0 )
6479 return a;
6480
6481 if (n > 0x10000) {
6482 n = 0x10000;
6483 } else if (n < -0x10000) {
6484 n = -0x10000;
6485 }
6486
6487 aExp += n - 1;
6488 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6489 STATUS_VAR );
6490
6491 }
6492 #endif