]> git.proxmox.com Git - qemu.git/blob - fpu/softfloat.c
softfloat: move float*_eq and float*_eq_quiet
[qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
6
7 /*============================================================================
8
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
11
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
21
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
35
36 =============================================================================*/
37
38 #include "softfloat.h"
39
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target if
43 | desired.)
44 *----------------------------------------------------------------------------*/
45 #include "softfloat-macros.h"
46
47 /*----------------------------------------------------------------------------
48 | Functions and definitions to determine: (1) whether tininess for underflow
49 | is detected before or after rounding by default, (2) what (if anything)
50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 | are propagated from function inputs to output. These details are target-
53 | specific.
54 *----------------------------------------------------------------------------*/
55 #include "softfloat-specialize.h"
56
57 void set_float_rounding_mode(int val STATUS_PARAM)
58 {
59 STATUS(float_rounding_mode) = val;
60 }
61
62 void set_float_exception_flags(int val STATUS_PARAM)
63 {
64 STATUS(float_exception_flags) = val;
65 }
66
67 #ifdef FLOATX80
68 void set_floatx80_rounding_precision(int val STATUS_PARAM)
69 {
70 STATUS(floatx80_rounding_precision) = val;
71 }
72 #endif
73
74 /*----------------------------------------------------------------------------
75 | Returns the fraction bits of the half-precision floating-point value `a'.
76 *----------------------------------------------------------------------------*/
77
78 INLINE uint32_t extractFloat16Frac(float16 a)
79 {
80 return float16_val(a) & 0x3ff;
81 }
82
83 /*----------------------------------------------------------------------------
84 | Returns the exponent bits of the half-precision floating-point value `a'.
85 *----------------------------------------------------------------------------*/
86
87 INLINE int16 extractFloat16Exp(float16 a)
88 {
89 return (float16_val(a) >> 10) & 0x1f;
90 }
91
92 /*----------------------------------------------------------------------------
93 | Returns the sign bit of the single-precision floating-point value `a'.
94 *----------------------------------------------------------------------------*/
95
96 INLINE flag extractFloat16Sign(float16 a)
97 {
98 return float16_val(a)>>15;
99 }
100
101 /*----------------------------------------------------------------------------
102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
104 | input. If `zSign' is 1, the input is negated before being converted to an
105 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
106 | is simply rounded to an integer, with the inexact exception raised if the
107 | input cannot be represented exactly as an integer. However, if the fixed-
108 | point input is too large, the invalid exception is raised and the largest
109 | positive or negative integer is returned.
110 *----------------------------------------------------------------------------*/
111
112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
113 {
114 int8 roundingMode;
115 flag roundNearestEven;
116 int8 roundIncrement, roundBits;
117 int32 z;
118
119 roundingMode = STATUS(float_rounding_mode);
120 roundNearestEven = ( roundingMode == float_round_nearest_even );
121 roundIncrement = 0x40;
122 if ( ! roundNearestEven ) {
123 if ( roundingMode == float_round_to_zero ) {
124 roundIncrement = 0;
125 }
126 else {
127 roundIncrement = 0x7F;
128 if ( zSign ) {
129 if ( roundingMode == float_round_up ) roundIncrement = 0;
130 }
131 else {
132 if ( roundingMode == float_round_down ) roundIncrement = 0;
133 }
134 }
135 }
136 roundBits = absZ & 0x7F;
137 absZ = ( absZ + roundIncrement )>>7;
138 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
139 z = absZ;
140 if ( zSign ) z = - z;
141 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
142 float_raise( float_flag_invalid STATUS_VAR);
143 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
144 }
145 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
146 return z;
147
148 }
149
150 /*----------------------------------------------------------------------------
151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
153 | and returns the properly rounded 64-bit integer corresponding to the input.
154 | If `zSign' is 1, the input is negated before being converted to an integer.
155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
156 | the inexact exception raised if the input cannot be represented exactly as
157 | an integer. However, if the fixed-point input is too large, the invalid
158 | exception is raised and the largest positive or negative integer is
159 | returned.
160 *----------------------------------------------------------------------------*/
161
162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
163 {
164 int8 roundingMode;
165 flag roundNearestEven, increment;
166 int64 z;
167
168 roundingMode = STATUS(float_rounding_mode);
169 roundNearestEven = ( roundingMode == float_round_nearest_even );
170 increment = ( (int64_t) absZ1 < 0 );
171 if ( ! roundNearestEven ) {
172 if ( roundingMode == float_round_to_zero ) {
173 increment = 0;
174 }
175 else {
176 if ( zSign ) {
177 increment = ( roundingMode == float_round_down ) && absZ1;
178 }
179 else {
180 increment = ( roundingMode == float_round_up ) && absZ1;
181 }
182 }
183 }
184 if ( increment ) {
185 ++absZ0;
186 if ( absZ0 == 0 ) goto overflow;
187 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
188 }
189 z = absZ0;
190 if ( zSign ) z = - z;
191 if ( z && ( ( z < 0 ) ^ zSign ) ) {
192 overflow:
193 float_raise( float_flag_invalid STATUS_VAR);
194 return
195 zSign ? (int64_t) LIT64( 0x8000000000000000 )
196 : LIT64( 0x7FFFFFFFFFFFFFFF );
197 }
198 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
199 return z;
200
201 }
202
203 /*----------------------------------------------------------------------------
204 | Returns the fraction bits of the single-precision floating-point value `a'.
205 *----------------------------------------------------------------------------*/
206
207 INLINE uint32_t extractFloat32Frac( float32 a )
208 {
209
210 return float32_val(a) & 0x007FFFFF;
211
212 }
213
214 /*----------------------------------------------------------------------------
215 | Returns the exponent bits of the single-precision floating-point value `a'.
216 *----------------------------------------------------------------------------*/
217
218 INLINE int16 extractFloat32Exp( float32 a )
219 {
220
221 return ( float32_val(a)>>23 ) & 0xFF;
222
223 }
224
225 /*----------------------------------------------------------------------------
226 | Returns the sign bit of the single-precision floating-point value `a'.
227 *----------------------------------------------------------------------------*/
228
229 INLINE flag extractFloat32Sign( float32 a )
230 {
231
232 return float32_val(a)>>31;
233
234 }
235
236 /*----------------------------------------------------------------------------
237 | If `a' is denormal and we are in flush-to-zero mode then set the
238 | input-denormal exception and return zero. Otherwise just return the value.
239 *----------------------------------------------------------------------------*/
240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
241 {
242 if (STATUS(flush_inputs_to_zero)) {
243 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
244 float_raise(float_flag_input_denormal STATUS_VAR);
245 return make_float32(float32_val(a) & 0x80000000);
246 }
247 }
248 return a;
249 }
250
251 /*----------------------------------------------------------------------------
252 | Normalizes the subnormal single-precision floating-point value represented
253 | by the denormalized significand `aSig'. The normalized exponent and
254 | significand are stored at the locations pointed to by `zExpPtr' and
255 | `zSigPtr', respectively.
256 *----------------------------------------------------------------------------*/
257
258 static void
259 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
260 {
261 int8 shiftCount;
262
263 shiftCount = countLeadingZeros32( aSig ) - 8;
264 *zSigPtr = aSig<<shiftCount;
265 *zExpPtr = 1 - shiftCount;
266
267 }
268
269 /*----------------------------------------------------------------------------
270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271 | single-precision floating-point value, returning the result. After being
272 | shifted into the proper positions, the three fields are simply added
273 | together to form the result. This means that any integer portion of `zSig'
274 | will be added into the exponent. Since a properly normalized significand
275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
276 | than the desired result exponent whenever `zSig' is a complete, normalized
277 | significand.
278 *----------------------------------------------------------------------------*/
279
280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
281 {
282
283 return make_float32(
284 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
285
286 }
287
288 /*----------------------------------------------------------------------------
289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290 | and significand `zSig', and returns the proper single-precision floating-
291 | point value corresponding to the abstract input. Ordinarily, the abstract
292 | value is simply rounded and packed into the single-precision format, with
293 | the inexact exception raised if the abstract input cannot be represented
294 | exactly. However, if the abstract value is too large, the overflow and
295 | inexact exceptions are raised and an infinity or maximal finite value is
296 | returned. If the abstract value is too small, the input value is rounded to
297 | a subnormal number, and the underflow and inexact exceptions are raised if
298 | the abstract input cannot be represented exactly as a subnormal single-
299 | precision floating-point number.
300 | The input significand `zSig' has its binary point between bits 30
301 | and 29, which is 7 bits to the left of the usual location. This shifted
302 | significand must be normalized or smaller. If `zSig' is not normalized,
303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
304 | and it must not require rounding. In the usual case that `zSig' is
305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
307 | Binary Floating-Point Arithmetic.
308 *----------------------------------------------------------------------------*/
309
310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
311 {
312 int8 roundingMode;
313 flag roundNearestEven;
314 int8 roundIncrement, roundBits;
315 flag isTiny;
316
317 roundingMode = STATUS(float_rounding_mode);
318 roundNearestEven = ( roundingMode == float_round_nearest_even );
319 roundIncrement = 0x40;
320 if ( ! roundNearestEven ) {
321 if ( roundingMode == float_round_to_zero ) {
322 roundIncrement = 0;
323 }
324 else {
325 roundIncrement = 0x7F;
326 if ( zSign ) {
327 if ( roundingMode == float_round_up ) roundIncrement = 0;
328 }
329 else {
330 if ( roundingMode == float_round_down ) roundIncrement = 0;
331 }
332 }
333 }
334 roundBits = zSig & 0x7F;
335 if ( 0xFD <= (uint16_t) zExp ) {
336 if ( ( 0xFD < zExp )
337 || ( ( zExp == 0xFD )
338 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
339 ) {
340 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
341 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
342 }
343 if ( zExp < 0 ) {
344 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
345 isTiny =
346 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
347 || ( zExp < -1 )
348 || ( zSig + roundIncrement < 0x80000000 );
349 shift32RightJamming( zSig, - zExp, &zSig );
350 zExp = 0;
351 roundBits = zSig & 0x7F;
352 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
353 }
354 }
355 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
356 zSig = ( zSig + roundIncrement )>>7;
357 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
358 if ( zSig == 0 ) zExp = 0;
359 return packFloat32( zSign, zExp, zSig );
360
361 }
362
363 /*----------------------------------------------------------------------------
364 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
365 | and significand `zSig', and returns the proper single-precision floating-
366 | point value corresponding to the abstract input. This routine is just like
367 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
368 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
369 | floating-point exponent.
370 *----------------------------------------------------------------------------*/
371
372 static float32
373 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
374 {
375 int8 shiftCount;
376
377 shiftCount = countLeadingZeros32( zSig ) - 1;
378 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
379
380 }
381
382 /*----------------------------------------------------------------------------
383 | Returns the fraction bits of the double-precision floating-point value `a'.
384 *----------------------------------------------------------------------------*/
385
386 INLINE uint64_t extractFloat64Frac( float64 a )
387 {
388
389 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
390
391 }
392
393 /*----------------------------------------------------------------------------
394 | Returns the exponent bits of the double-precision floating-point value `a'.
395 *----------------------------------------------------------------------------*/
396
397 INLINE int16 extractFloat64Exp( float64 a )
398 {
399
400 return ( float64_val(a)>>52 ) & 0x7FF;
401
402 }
403
404 /*----------------------------------------------------------------------------
405 | Returns the sign bit of the double-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407
408 INLINE flag extractFloat64Sign( float64 a )
409 {
410
411 return float64_val(a)>>63;
412
413 }
414
415 /*----------------------------------------------------------------------------
416 | If `a' is denormal and we are in flush-to-zero mode then set the
417 | input-denormal exception and return zero. Otherwise just return the value.
418 *----------------------------------------------------------------------------*/
419 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
420 {
421 if (STATUS(flush_inputs_to_zero)) {
422 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
423 float_raise(float_flag_input_denormal STATUS_VAR);
424 return make_float64(float64_val(a) & (1ULL << 63));
425 }
426 }
427 return a;
428 }
429
430 /*----------------------------------------------------------------------------
431 | Normalizes the subnormal double-precision floating-point value represented
432 | by the denormalized significand `aSig'. The normalized exponent and
433 | significand are stored at the locations pointed to by `zExpPtr' and
434 | `zSigPtr', respectively.
435 *----------------------------------------------------------------------------*/
436
437 static void
438 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
439 {
440 int8 shiftCount;
441
442 shiftCount = countLeadingZeros64( aSig ) - 11;
443 *zSigPtr = aSig<<shiftCount;
444 *zExpPtr = 1 - shiftCount;
445
446 }
447
448 /*----------------------------------------------------------------------------
449 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
450 | double-precision floating-point value, returning the result. After being
451 | shifted into the proper positions, the three fields are simply added
452 | together to form the result. This means that any integer portion of `zSig'
453 | will be added into the exponent. Since a properly normalized significand
454 | will have an integer portion equal to 1, the `zExp' input should be 1 less
455 | than the desired result exponent whenever `zSig' is a complete, normalized
456 | significand.
457 *----------------------------------------------------------------------------*/
458
459 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
460 {
461
462 return make_float64(
463 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
464
465 }
466
467 /*----------------------------------------------------------------------------
468 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
469 | and significand `zSig', and returns the proper double-precision floating-
470 | point value corresponding to the abstract input. Ordinarily, the abstract
471 | value is simply rounded and packed into the double-precision format, with
472 | the inexact exception raised if the abstract input cannot be represented
473 | exactly. However, if the abstract value is too large, the overflow and
474 | inexact exceptions are raised and an infinity or maximal finite value is
475 | returned. If the abstract value is too small, the input value is rounded
476 | to a subnormal number, and the underflow and inexact exceptions are raised
477 | if the abstract input cannot be represented exactly as a subnormal double-
478 | precision floating-point number.
479 | The input significand `zSig' has its binary point between bits 62
480 | and 61, which is 10 bits to the left of the usual location. This shifted
481 | significand must be normalized or smaller. If `zSig' is not normalized,
482 | `zExp' must be 0; in that case, the result returned is a subnormal number,
483 | and it must not require rounding. In the usual case that `zSig' is
484 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
485 | The handling of underflow and overflow follows the IEC/IEEE Standard for
486 | Binary Floating-Point Arithmetic.
487 *----------------------------------------------------------------------------*/
488
489 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
490 {
491 int8 roundingMode;
492 flag roundNearestEven;
493 int16 roundIncrement, roundBits;
494 flag isTiny;
495
496 roundingMode = STATUS(float_rounding_mode);
497 roundNearestEven = ( roundingMode == float_round_nearest_even );
498 roundIncrement = 0x200;
499 if ( ! roundNearestEven ) {
500 if ( roundingMode == float_round_to_zero ) {
501 roundIncrement = 0;
502 }
503 else {
504 roundIncrement = 0x3FF;
505 if ( zSign ) {
506 if ( roundingMode == float_round_up ) roundIncrement = 0;
507 }
508 else {
509 if ( roundingMode == float_round_down ) roundIncrement = 0;
510 }
511 }
512 }
513 roundBits = zSig & 0x3FF;
514 if ( 0x7FD <= (uint16_t) zExp ) {
515 if ( ( 0x7FD < zExp )
516 || ( ( zExp == 0x7FD )
517 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
518 ) {
519 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
520 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
521 }
522 if ( zExp < 0 ) {
523 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
524 isTiny =
525 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
526 || ( zExp < -1 )
527 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
528 shift64RightJamming( zSig, - zExp, &zSig );
529 zExp = 0;
530 roundBits = zSig & 0x3FF;
531 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
532 }
533 }
534 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
535 zSig = ( zSig + roundIncrement )>>10;
536 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
537 if ( zSig == 0 ) zExp = 0;
538 return packFloat64( zSign, zExp, zSig );
539
540 }
541
542 /*----------------------------------------------------------------------------
543 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
544 | and significand `zSig', and returns the proper double-precision floating-
545 | point value corresponding to the abstract input. This routine is just like
546 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
547 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
548 | floating-point exponent.
549 *----------------------------------------------------------------------------*/
550
551 static float64
552 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
553 {
554 int8 shiftCount;
555
556 shiftCount = countLeadingZeros64( zSig ) - 1;
557 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
558
559 }
560
561 #ifdef FLOATX80
562
563 /*----------------------------------------------------------------------------
564 | Returns the fraction bits of the extended double-precision floating-point
565 | value `a'.
566 *----------------------------------------------------------------------------*/
567
568 INLINE uint64_t extractFloatx80Frac( floatx80 a )
569 {
570
571 return a.low;
572
573 }
574
575 /*----------------------------------------------------------------------------
576 | Returns the exponent bits of the extended double-precision floating-point
577 | value `a'.
578 *----------------------------------------------------------------------------*/
579
580 INLINE int32 extractFloatx80Exp( floatx80 a )
581 {
582
583 return a.high & 0x7FFF;
584
585 }
586
587 /*----------------------------------------------------------------------------
588 | Returns the sign bit of the extended double-precision floating-point value
589 | `a'.
590 *----------------------------------------------------------------------------*/
591
592 INLINE flag extractFloatx80Sign( floatx80 a )
593 {
594
595 return a.high>>15;
596
597 }
598
599 /*----------------------------------------------------------------------------
600 | Normalizes the subnormal extended double-precision floating-point value
601 | represented by the denormalized significand `aSig'. The normalized exponent
602 | and significand are stored at the locations pointed to by `zExpPtr' and
603 | `zSigPtr', respectively.
604 *----------------------------------------------------------------------------*/
605
606 static void
607 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
608 {
609 int8 shiftCount;
610
611 shiftCount = countLeadingZeros64( aSig );
612 *zSigPtr = aSig<<shiftCount;
613 *zExpPtr = 1 - shiftCount;
614
615 }
616
617 /*----------------------------------------------------------------------------
618 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
619 | extended double-precision floating-point value, returning the result.
620 *----------------------------------------------------------------------------*/
621
622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
623 {
624 floatx80 z;
625
626 z.low = zSig;
627 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
628 return z;
629
630 }
631
632 /*----------------------------------------------------------------------------
633 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
634 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
635 | and returns the proper extended double-precision floating-point value
636 | corresponding to the abstract input. Ordinarily, the abstract value is
637 | rounded and packed into the extended double-precision format, with the
638 | inexact exception raised if the abstract input cannot be represented
639 | exactly. However, if the abstract value is too large, the overflow and
640 | inexact exceptions are raised and an infinity or maximal finite value is
641 | returned. If the abstract value is too small, the input value is rounded to
642 | a subnormal number, and the underflow and inexact exceptions are raised if
643 | the abstract input cannot be represented exactly as a subnormal extended
644 | double-precision floating-point number.
645 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
646 | number of bits as single or double precision, respectively. Otherwise, the
647 | result is rounded to the full precision of the extended double-precision
648 | format.
649 | The input significand must be normalized or smaller. If the input
650 | significand is not normalized, `zExp' must be 0; in that case, the result
651 | returned is a subnormal number, and it must not require rounding. The
652 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
653 | Floating-Point Arithmetic.
654 *----------------------------------------------------------------------------*/
655
656 static floatx80
657 roundAndPackFloatx80(
658 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
659 STATUS_PARAM)
660 {
661 int8 roundingMode;
662 flag roundNearestEven, increment, isTiny;
663 int64 roundIncrement, roundMask, roundBits;
664
665 roundingMode = STATUS(float_rounding_mode);
666 roundNearestEven = ( roundingMode == float_round_nearest_even );
667 if ( roundingPrecision == 80 ) goto precision80;
668 if ( roundingPrecision == 64 ) {
669 roundIncrement = LIT64( 0x0000000000000400 );
670 roundMask = LIT64( 0x00000000000007FF );
671 }
672 else if ( roundingPrecision == 32 ) {
673 roundIncrement = LIT64( 0x0000008000000000 );
674 roundMask = LIT64( 0x000000FFFFFFFFFF );
675 }
676 else {
677 goto precision80;
678 }
679 zSig0 |= ( zSig1 != 0 );
680 if ( ! roundNearestEven ) {
681 if ( roundingMode == float_round_to_zero ) {
682 roundIncrement = 0;
683 }
684 else {
685 roundIncrement = roundMask;
686 if ( zSign ) {
687 if ( roundingMode == float_round_up ) roundIncrement = 0;
688 }
689 else {
690 if ( roundingMode == float_round_down ) roundIncrement = 0;
691 }
692 }
693 }
694 roundBits = zSig0 & roundMask;
695 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
696 if ( ( 0x7FFE < zExp )
697 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
698 ) {
699 goto overflow;
700 }
701 if ( zExp <= 0 ) {
702 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
703 isTiny =
704 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
705 || ( zExp < 0 )
706 || ( zSig0 <= zSig0 + roundIncrement );
707 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
708 zExp = 0;
709 roundBits = zSig0 & roundMask;
710 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
711 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
712 zSig0 += roundIncrement;
713 if ( (int64_t) zSig0 < 0 ) zExp = 1;
714 roundIncrement = roundMask + 1;
715 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
716 roundMask |= roundIncrement;
717 }
718 zSig0 &= ~ roundMask;
719 return packFloatx80( zSign, zExp, zSig0 );
720 }
721 }
722 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
723 zSig0 += roundIncrement;
724 if ( zSig0 < roundIncrement ) {
725 ++zExp;
726 zSig0 = LIT64( 0x8000000000000000 );
727 }
728 roundIncrement = roundMask + 1;
729 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
730 roundMask |= roundIncrement;
731 }
732 zSig0 &= ~ roundMask;
733 if ( zSig0 == 0 ) zExp = 0;
734 return packFloatx80( zSign, zExp, zSig0 );
735 precision80:
736 increment = ( (int64_t) zSig1 < 0 );
737 if ( ! roundNearestEven ) {
738 if ( roundingMode == float_round_to_zero ) {
739 increment = 0;
740 }
741 else {
742 if ( zSign ) {
743 increment = ( roundingMode == float_round_down ) && zSig1;
744 }
745 else {
746 increment = ( roundingMode == float_round_up ) && zSig1;
747 }
748 }
749 }
750 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
751 if ( ( 0x7FFE < zExp )
752 || ( ( zExp == 0x7FFE )
753 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
754 && increment
755 )
756 ) {
757 roundMask = 0;
758 overflow:
759 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
760 if ( ( roundingMode == float_round_to_zero )
761 || ( zSign && ( roundingMode == float_round_up ) )
762 || ( ! zSign && ( roundingMode == float_round_down ) )
763 ) {
764 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
765 }
766 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
767 }
768 if ( zExp <= 0 ) {
769 isTiny =
770 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
771 || ( zExp < 0 )
772 || ! increment
773 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
774 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
775 zExp = 0;
776 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
777 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
778 if ( roundNearestEven ) {
779 increment = ( (int64_t) zSig1 < 0 );
780 }
781 else {
782 if ( zSign ) {
783 increment = ( roundingMode == float_round_down ) && zSig1;
784 }
785 else {
786 increment = ( roundingMode == float_round_up ) && zSig1;
787 }
788 }
789 if ( increment ) {
790 ++zSig0;
791 zSig0 &=
792 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
793 if ( (int64_t) zSig0 < 0 ) zExp = 1;
794 }
795 return packFloatx80( zSign, zExp, zSig0 );
796 }
797 }
798 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
799 if ( increment ) {
800 ++zSig0;
801 if ( zSig0 == 0 ) {
802 ++zExp;
803 zSig0 = LIT64( 0x8000000000000000 );
804 }
805 else {
806 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
807 }
808 }
809 else {
810 if ( zSig0 == 0 ) zExp = 0;
811 }
812 return packFloatx80( zSign, zExp, zSig0 );
813
814 }
815
816 /*----------------------------------------------------------------------------
817 | Takes an abstract floating-point value having sign `zSign', exponent
818 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
819 | and returns the proper extended double-precision floating-point value
820 | corresponding to the abstract input. This routine is just like
821 | `roundAndPackFloatx80' except that the input significand does not have to be
822 | normalized.
823 *----------------------------------------------------------------------------*/
824
825 static floatx80
826 normalizeRoundAndPackFloatx80(
827 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
828 STATUS_PARAM)
829 {
830 int8 shiftCount;
831
832 if ( zSig0 == 0 ) {
833 zSig0 = zSig1;
834 zSig1 = 0;
835 zExp -= 64;
836 }
837 shiftCount = countLeadingZeros64( zSig0 );
838 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
839 zExp -= shiftCount;
840 return
841 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
842
843 }
844
845 #endif
846
847 #ifdef FLOAT128
848
849 /*----------------------------------------------------------------------------
850 | Returns the least-significant 64 fraction bits of the quadruple-precision
851 | floating-point value `a'.
852 *----------------------------------------------------------------------------*/
853
854 INLINE uint64_t extractFloat128Frac1( float128 a )
855 {
856
857 return a.low;
858
859 }
860
861 /*----------------------------------------------------------------------------
862 | Returns the most-significant 48 fraction bits of the quadruple-precision
863 | floating-point value `a'.
864 *----------------------------------------------------------------------------*/
865
866 INLINE uint64_t extractFloat128Frac0( float128 a )
867 {
868
869 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
870
871 }
872
873 /*----------------------------------------------------------------------------
874 | Returns the exponent bits of the quadruple-precision floating-point value
875 | `a'.
876 *----------------------------------------------------------------------------*/
877
878 INLINE int32 extractFloat128Exp( float128 a )
879 {
880
881 return ( a.high>>48 ) & 0x7FFF;
882
883 }
884
885 /*----------------------------------------------------------------------------
886 | Returns the sign bit of the quadruple-precision floating-point value `a'.
887 *----------------------------------------------------------------------------*/
888
889 INLINE flag extractFloat128Sign( float128 a )
890 {
891
892 return a.high>>63;
893
894 }
895
896 /*----------------------------------------------------------------------------
897 | Normalizes the subnormal quadruple-precision floating-point value
898 | represented by the denormalized significand formed by the concatenation of
899 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
900 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
901 | significand are stored at the location pointed to by `zSig0Ptr', and the
902 | least significant 64 bits of the normalized significand are stored at the
903 | location pointed to by `zSig1Ptr'.
904 *----------------------------------------------------------------------------*/
905
906 static void
907 normalizeFloat128Subnormal(
908 uint64_t aSig0,
909 uint64_t aSig1,
910 int32 *zExpPtr,
911 uint64_t *zSig0Ptr,
912 uint64_t *zSig1Ptr
913 )
914 {
915 int8 shiftCount;
916
917 if ( aSig0 == 0 ) {
918 shiftCount = countLeadingZeros64( aSig1 ) - 15;
919 if ( shiftCount < 0 ) {
920 *zSig0Ptr = aSig1>>( - shiftCount );
921 *zSig1Ptr = aSig1<<( shiftCount & 63 );
922 }
923 else {
924 *zSig0Ptr = aSig1<<shiftCount;
925 *zSig1Ptr = 0;
926 }
927 *zExpPtr = - shiftCount - 63;
928 }
929 else {
930 shiftCount = countLeadingZeros64( aSig0 ) - 15;
931 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932 *zExpPtr = 1 - shiftCount;
933 }
934
935 }
936
937 /*----------------------------------------------------------------------------
938 | Packs the sign `zSign', the exponent `zExp', and the significand formed
939 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
940 | floating-point value, returning the result. After being shifted into the
941 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
942 | added together to form the most significant 32 bits of the result. This
943 | means that any integer portion of `zSig0' will be added into the exponent.
944 | Since a properly normalized significand will have an integer portion equal
945 | to 1, the `zExp' input should be 1 less than the desired result exponent
946 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
947 | significand.
948 *----------------------------------------------------------------------------*/
949
950 INLINE float128
951 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
952 {
953 float128 z;
954
955 z.low = zSig1;
956 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
957 return z;
958
959 }
960
961 /*----------------------------------------------------------------------------
962 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963 | and extended significand formed by the concatenation of `zSig0', `zSig1',
964 | and `zSig2', and returns the proper quadruple-precision floating-point value
965 | corresponding to the abstract input. Ordinarily, the abstract value is
966 | simply rounded and packed into the quadruple-precision format, with the
967 | inexact exception raised if the abstract input cannot be represented
968 | exactly. However, if the abstract value is too large, the overflow and
969 | inexact exceptions are raised and an infinity or maximal finite value is
970 | returned. If the abstract value is too small, the input value is rounded to
971 | a subnormal number, and the underflow and inexact exceptions are raised if
972 | the abstract input cannot be represented exactly as a subnormal quadruple-
973 | precision floating-point number.
974 | The input significand must be normalized or smaller. If the input
975 | significand is not normalized, `zExp' must be 0; in that case, the result
976 | returned is a subnormal number, and it must not require rounding. In the
977 | usual case that the input significand is normalized, `zExp' must be 1 less
978 | than the ``true'' floating-point exponent. The handling of underflow and
979 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980 *----------------------------------------------------------------------------*/
981
982 static float128
983 roundAndPackFloat128(
984 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
985 {
986 int8 roundingMode;
987 flag roundNearestEven, increment, isTiny;
988
989 roundingMode = STATUS(float_rounding_mode);
990 roundNearestEven = ( roundingMode == float_round_nearest_even );
991 increment = ( (int64_t) zSig2 < 0 );
992 if ( ! roundNearestEven ) {
993 if ( roundingMode == float_round_to_zero ) {
994 increment = 0;
995 }
996 else {
997 if ( zSign ) {
998 increment = ( roundingMode == float_round_down ) && zSig2;
999 }
1000 else {
1001 increment = ( roundingMode == float_round_up ) && zSig2;
1002 }
1003 }
1004 }
1005 if ( 0x7FFD <= (uint32_t) zExp ) {
1006 if ( ( 0x7FFD < zExp )
1007 || ( ( zExp == 0x7FFD )
1008 && eq128(
1009 LIT64( 0x0001FFFFFFFFFFFF ),
1010 LIT64( 0xFFFFFFFFFFFFFFFF ),
1011 zSig0,
1012 zSig1
1013 )
1014 && increment
1015 )
1016 ) {
1017 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1018 if ( ( roundingMode == float_round_to_zero )
1019 || ( zSign && ( roundingMode == float_round_up ) )
1020 || ( ! zSign && ( roundingMode == float_round_down ) )
1021 ) {
1022 return
1023 packFloat128(
1024 zSign,
1025 0x7FFE,
1026 LIT64( 0x0000FFFFFFFFFFFF ),
1027 LIT64( 0xFFFFFFFFFFFFFFFF )
1028 );
1029 }
1030 return packFloat128( zSign, 0x7FFF, 0, 0 );
1031 }
1032 if ( zExp < 0 ) {
1033 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1034 isTiny =
1035 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1036 || ( zExp < -1 )
1037 || ! increment
1038 || lt128(
1039 zSig0,
1040 zSig1,
1041 LIT64( 0x0001FFFFFFFFFFFF ),
1042 LIT64( 0xFFFFFFFFFFFFFFFF )
1043 );
1044 shift128ExtraRightJamming(
1045 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046 zExp = 0;
1047 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1048 if ( roundNearestEven ) {
1049 increment = ( (int64_t) zSig2 < 0 );
1050 }
1051 else {
1052 if ( zSign ) {
1053 increment = ( roundingMode == float_round_down ) && zSig2;
1054 }
1055 else {
1056 increment = ( roundingMode == float_round_up ) && zSig2;
1057 }
1058 }
1059 }
1060 }
1061 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1062 if ( increment ) {
1063 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1064 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1065 }
1066 else {
1067 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1068 }
1069 return packFloat128( zSign, zExp, zSig0, zSig1 );
1070
1071 }
1072
1073 /*----------------------------------------------------------------------------
1074 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 | returns the proper quadruple-precision floating-point value corresponding
1077 | to the abstract input. This routine is just like `roundAndPackFloat128'
1078 | except that the input significand has fewer bits and does not have to be
1079 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1080 | point exponent.
1081 *----------------------------------------------------------------------------*/
1082
1083 static float128
1084 normalizeRoundAndPackFloat128(
1085 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1086 {
1087 int8 shiftCount;
1088 uint64_t zSig2;
1089
1090 if ( zSig0 == 0 ) {
1091 zSig0 = zSig1;
1092 zSig1 = 0;
1093 zExp -= 64;
1094 }
1095 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096 if ( 0 <= shiftCount ) {
1097 zSig2 = 0;
1098 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099 }
1100 else {
1101 shift128ExtraRightJamming(
1102 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103 }
1104 zExp -= shiftCount;
1105 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1106
1107 }
1108
1109 #endif
1110
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a'
1113 | to the single-precision floating-point format. The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1116
1117 float32 int32_to_float32( int32 a STATUS_PARAM )
1118 {
1119 flag zSign;
1120
1121 if ( a == 0 ) return float32_zero;
1122 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1123 zSign = ( a < 0 );
1124 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1125
1126 }
1127
1128 /*----------------------------------------------------------------------------
1129 | Returns the result of converting the 32-bit two's complement integer `a'
1130 | to the double-precision floating-point format. The conversion is performed
1131 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132 *----------------------------------------------------------------------------*/
1133
1134 float64 int32_to_float64( int32 a STATUS_PARAM )
1135 {
1136 flag zSign;
1137 uint32 absA;
1138 int8 shiftCount;
1139 uint64_t zSig;
1140
1141 if ( a == 0 ) return float64_zero;
1142 zSign = ( a < 0 );
1143 absA = zSign ? - a : a;
1144 shiftCount = countLeadingZeros32( absA ) + 21;
1145 zSig = absA;
1146 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1147
1148 }
1149
1150 #ifdef FLOATX80
1151
1152 /*----------------------------------------------------------------------------
1153 | Returns the result of converting the 32-bit two's complement integer `a'
1154 | to the extended double-precision floating-point format. The conversion
1155 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 | Arithmetic.
1157 *----------------------------------------------------------------------------*/
1158
1159 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160 {
1161 flag zSign;
1162 uint32 absA;
1163 int8 shiftCount;
1164 uint64_t zSig;
1165
1166 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167 zSign = ( a < 0 );
1168 absA = zSign ? - a : a;
1169 shiftCount = countLeadingZeros32( absA ) + 32;
1170 zSig = absA;
1171 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172
1173 }
1174
1175 #endif
1176
1177 #ifdef FLOAT128
1178
1179 /*----------------------------------------------------------------------------
1180 | Returns the result of converting the 32-bit two's complement integer `a' to
1181 | the quadruple-precision floating-point format. The conversion is performed
1182 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1183 *----------------------------------------------------------------------------*/
1184
1185 float128 int32_to_float128( int32 a STATUS_PARAM )
1186 {
1187 flag zSign;
1188 uint32 absA;
1189 int8 shiftCount;
1190 uint64_t zSig0;
1191
1192 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1193 zSign = ( a < 0 );
1194 absA = zSign ? - a : a;
1195 shiftCount = countLeadingZeros32( absA ) + 17;
1196 zSig0 = absA;
1197 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1198
1199 }
1200
1201 #endif
1202
1203 /*----------------------------------------------------------------------------
1204 | Returns the result of converting the 64-bit two's complement integer `a'
1205 | to the single-precision floating-point format. The conversion is performed
1206 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1207 *----------------------------------------------------------------------------*/
1208
1209 float32 int64_to_float32( int64 a STATUS_PARAM )
1210 {
1211 flag zSign;
1212 uint64 absA;
1213 int8 shiftCount;
1214
1215 if ( a == 0 ) return float32_zero;
1216 zSign = ( a < 0 );
1217 absA = zSign ? - a : a;
1218 shiftCount = countLeadingZeros64( absA ) - 40;
1219 if ( 0 <= shiftCount ) {
1220 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1221 }
1222 else {
1223 shiftCount += 7;
1224 if ( shiftCount < 0 ) {
1225 shift64RightJamming( absA, - shiftCount, &absA );
1226 }
1227 else {
1228 absA <<= shiftCount;
1229 }
1230 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1231 }
1232
1233 }
1234
1235 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1236 {
1237 int8 shiftCount;
1238
1239 if ( a == 0 ) return float32_zero;
1240 shiftCount = countLeadingZeros64( a ) - 40;
1241 if ( 0 <= shiftCount ) {
1242 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1243 }
1244 else {
1245 shiftCount += 7;
1246 if ( shiftCount < 0 ) {
1247 shift64RightJamming( a, - shiftCount, &a );
1248 }
1249 else {
1250 a <<= shiftCount;
1251 }
1252 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1253 }
1254 }
1255
1256 /*----------------------------------------------------------------------------
1257 | Returns the result of converting the 64-bit two's complement integer `a'
1258 | to the double-precision floating-point format. The conversion is performed
1259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260 *----------------------------------------------------------------------------*/
1261
1262 float64 int64_to_float64( int64 a STATUS_PARAM )
1263 {
1264 flag zSign;
1265
1266 if ( a == 0 ) return float64_zero;
1267 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1268 return packFloat64( 1, 0x43E, 0 );
1269 }
1270 zSign = ( a < 0 );
1271 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1272
1273 }
1274
1275 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1276 {
1277 if ( a == 0 ) return float64_zero;
1278 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1279
1280 }
1281
1282 #ifdef FLOATX80
1283
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the 64-bit two's complement integer `a'
1286 | to the extended double-precision floating-point format. The conversion
1287 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic.
1289 *----------------------------------------------------------------------------*/
1290
1291 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1292 {
1293 flag zSign;
1294 uint64 absA;
1295 int8 shiftCount;
1296
1297 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1298 zSign = ( a < 0 );
1299 absA = zSign ? - a : a;
1300 shiftCount = countLeadingZeros64( absA );
1301 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1302
1303 }
1304
1305 #endif
1306
1307 #ifdef FLOAT128
1308
1309 /*----------------------------------------------------------------------------
1310 | Returns the result of converting the 64-bit two's complement integer `a' to
1311 | the quadruple-precision floating-point format. The conversion is performed
1312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313 *----------------------------------------------------------------------------*/
1314
1315 float128 int64_to_float128( int64 a STATUS_PARAM )
1316 {
1317 flag zSign;
1318 uint64 absA;
1319 int8 shiftCount;
1320 int32 zExp;
1321 uint64_t zSig0, zSig1;
1322
1323 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324 zSign = ( a < 0 );
1325 absA = zSign ? - a : a;
1326 shiftCount = countLeadingZeros64( absA ) + 49;
1327 zExp = 0x406E - shiftCount;
1328 if ( 64 <= shiftCount ) {
1329 zSig1 = 0;
1330 zSig0 = absA;
1331 shiftCount -= 64;
1332 }
1333 else {
1334 zSig1 = absA;
1335 zSig0 = 0;
1336 }
1337 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338 return packFloat128( zSign, zExp, zSig0, zSig1 );
1339
1340 }
1341
1342 #endif
1343
1344 /*----------------------------------------------------------------------------
1345 | Returns the result of converting the single-precision floating-point value
1346 | `a' to the 32-bit two's complement integer format. The conversion is
1347 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1348 | Arithmetic---which means in particular that the conversion is rounded
1349 | according to the current rounding mode. If `a' is a NaN, the largest
1350 | positive integer is returned. Otherwise, if the conversion overflows, the
1351 | largest integer with the same sign as `a' is returned.
1352 *----------------------------------------------------------------------------*/
1353
1354 int32 float32_to_int32( float32 a STATUS_PARAM )
1355 {
1356 flag aSign;
1357 int16 aExp, shiftCount;
1358 uint32_t aSig;
1359 uint64_t aSig64;
1360
1361 a = float32_squash_input_denormal(a STATUS_VAR);
1362 aSig = extractFloat32Frac( a );
1363 aExp = extractFloat32Exp( a );
1364 aSign = extractFloat32Sign( a );
1365 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1366 if ( aExp ) aSig |= 0x00800000;
1367 shiftCount = 0xAF - aExp;
1368 aSig64 = aSig;
1369 aSig64 <<= 32;
1370 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1371 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1372
1373 }
1374
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the single-precision floating-point value
1377 | `a' to the 32-bit two's complement integer format. The conversion is
1378 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1379 | Arithmetic, except that the conversion is always rounded toward zero.
1380 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1381 | the conversion overflows, the largest integer with the same sign as `a' is
1382 | returned.
1383 *----------------------------------------------------------------------------*/
1384
1385 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1386 {
1387 flag aSign;
1388 int16 aExp, shiftCount;
1389 uint32_t aSig;
1390 int32 z;
1391 a = float32_squash_input_denormal(a STATUS_VAR);
1392
1393 aSig = extractFloat32Frac( a );
1394 aExp = extractFloat32Exp( a );
1395 aSign = extractFloat32Sign( a );
1396 shiftCount = aExp - 0x9E;
1397 if ( 0 <= shiftCount ) {
1398 if ( float32_val(a) != 0xCF000000 ) {
1399 float_raise( float_flag_invalid STATUS_VAR);
1400 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1401 }
1402 return (int32_t) 0x80000000;
1403 }
1404 else if ( aExp <= 0x7E ) {
1405 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1406 return 0;
1407 }
1408 aSig = ( aSig | 0x00800000 )<<8;
1409 z = aSig>>( - shiftCount );
1410 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1411 STATUS(float_exception_flags) |= float_flag_inexact;
1412 }
1413 if ( aSign ) z = - z;
1414 return z;
1415
1416 }
1417
1418 /*----------------------------------------------------------------------------
1419 | Returns the result of converting the single-precision floating-point value
1420 | `a' to the 16-bit two's complement integer format. The conversion is
1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1422 | Arithmetic, except that the conversion is always rounded toward zero.
1423 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1424 | the conversion overflows, the largest integer with the same sign as `a' is
1425 | returned.
1426 *----------------------------------------------------------------------------*/
1427
1428 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1429 {
1430 flag aSign;
1431 int16 aExp, shiftCount;
1432 uint32_t aSig;
1433 int32 z;
1434
1435 aSig = extractFloat32Frac( a );
1436 aExp = extractFloat32Exp( a );
1437 aSign = extractFloat32Sign( a );
1438 shiftCount = aExp - 0x8E;
1439 if ( 0 <= shiftCount ) {
1440 if ( float32_val(a) != 0xC7000000 ) {
1441 float_raise( float_flag_invalid STATUS_VAR);
1442 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1443 return 0x7FFF;
1444 }
1445 }
1446 return (int32_t) 0xffff8000;
1447 }
1448 else if ( aExp <= 0x7E ) {
1449 if ( aExp | aSig ) {
1450 STATUS(float_exception_flags) |= float_flag_inexact;
1451 }
1452 return 0;
1453 }
1454 shiftCount -= 0x10;
1455 aSig = ( aSig | 0x00800000 )<<8;
1456 z = aSig>>( - shiftCount );
1457 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1458 STATUS(float_exception_flags) |= float_flag_inexact;
1459 }
1460 if ( aSign ) {
1461 z = - z;
1462 }
1463 return z;
1464
1465 }
1466
1467 /*----------------------------------------------------------------------------
1468 | Returns the result of converting the single-precision floating-point value
1469 | `a' to the 64-bit two's complement integer format. The conversion is
1470 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1471 | Arithmetic---which means in particular that the conversion is rounded
1472 | according to the current rounding mode. If `a' is a NaN, the largest
1473 | positive integer is returned. Otherwise, if the conversion overflows, the
1474 | largest integer with the same sign as `a' is returned.
1475 *----------------------------------------------------------------------------*/
1476
1477 int64 float32_to_int64( float32 a STATUS_PARAM )
1478 {
1479 flag aSign;
1480 int16 aExp, shiftCount;
1481 uint32_t aSig;
1482 uint64_t aSig64, aSigExtra;
1483 a = float32_squash_input_denormal(a STATUS_VAR);
1484
1485 aSig = extractFloat32Frac( a );
1486 aExp = extractFloat32Exp( a );
1487 aSign = extractFloat32Sign( a );
1488 shiftCount = 0xBE - aExp;
1489 if ( shiftCount < 0 ) {
1490 float_raise( float_flag_invalid STATUS_VAR);
1491 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1492 return LIT64( 0x7FFFFFFFFFFFFFFF );
1493 }
1494 return (int64_t) LIT64( 0x8000000000000000 );
1495 }
1496 if ( aExp ) aSig |= 0x00800000;
1497 aSig64 = aSig;
1498 aSig64 <<= 40;
1499 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1500 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1501
1502 }
1503
1504 /*----------------------------------------------------------------------------
1505 | Returns the result of converting the single-precision floating-point value
1506 | `a' to the 64-bit two's complement integer format. The conversion is
1507 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1508 | Arithmetic, except that the conversion is always rounded toward zero. If
1509 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1510 | conversion overflows, the largest integer with the same sign as `a' is
1511 | returned.
1512 *----------------------------------------------------------------------------*/
1513
1514 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1515 {
1516 flag aSign;
1517 int16 aExp, shiftCount;
1518 uint32_t aSig;
1519 uint64_t aSig64;
1520 int64 z;
1521 a = float32_squash_input_denormal(a STATUS_VAR);
1522
1523 aSig = extractFloat32Frac( a );
1524 aExp = extractFloat32Exp( a );
1525 aSign = extractFloat32Sign( a );
1526 shiftCount = aExp - 0xBE;
1527 if ( 0 <= shiftCount ) {
1528 if ( float32_val(a) != 0xDF000000 ) {
1529 float_raise( float_flag_invalid STATUS_VAR);
1530 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1531 return LIT64( 0x7FFFFFFFFFFFFFFF );
1532 }
1533 }
1534 return (int64_t) LIT64( 0x8000000000000000 );
1535 }
1536 else if ( aExp <= 0x7E ) {
1537 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1538 return 0;
1539 }
1540 aSig64 = aSig | 0x00800000;
1541 aSig64 <<= 40;
1542 z = aSig64>>( - shiftCount );
1543 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1544 STATUS(float_exception_flags) |= float_flag_inexact;
1545 }
1546 if ( aSign ) z = - z;
1547 return z;
1548
1549 }
1550
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the double-precision floating-point format. The conversion is
1554 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1557
1558 float64 float32_to_float64( float32 a STATUS_PARAM )
1559 {
1560 flag aSign;
1561 int16 aExp;
1562 uint32_t aSig;
1563 a = float32_squash_input_denormal(a STATUS_VAR);
1564
1565 aSig = extractFloat32Frac( a );
1566 aExp = extractFloat32Exp( a );
1567 aSign = extractFloat32Sign( a );
1568 if ( aExp == 0xFF ) {
1569 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1570 return packFloat64( aSign, 0x7FF, 0 );
1571 }
1572 if ( aExp == 0 ) {
1573 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1574 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575 --aExp;
1576 }
1577 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1578
1579 }
1580
1581 #ifdef FLOATX80
1582
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the extended double-precision floating-point format. The conversion
1586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic.
1588 *----------------------------------------------------------------------------*/
1589
1590 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1591 {
1592 flag aSign;
1593 int16 aExp;
1594 uint32_t aSig;
1595
1596 a = float32_squash_input_denormal(a STATUS_VAR);
1597 aSig = extractFloat32Frac( a );
1598 aExp = extractFloat32Exp( a );
1599 aSign = extractFloat32Sign( a );
1600 if ( aExp == 0xFF ) {
1601 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1602 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1603 }
1604 if ( aExp == 0 ) {
1605 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1606 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607 }
1608 aSig |= 0x00800000;
1609 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1610
1611 }
1612
1613 #endif
1614
1615 #ifdef FLOAT128
1616
1617 /*----------------------------------------------------------------------------
1618 | Returns the result of converting the single-precision floating-point value
1619 | `a' to the double-precision floating-point format. The conversion is
1620 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1621 | Arithmetic.
1622 *----------------------------------------------------------------------------*/
1623
1624 float128 float32_to_float128( float32 a STATUS_PARAM )
1625 {
1626 flag aSign;
1627 int16 aExp;
1628 uint32_t aSig;
1629
1630 a = float32_squash_input_denormal(a STATUS_VAR);
1631 aSig = extractFloat32Frac( a );
1632 aExp = extractFloat32Exp( a );
1633 aSign = extractFloat32Sign( a );
1634 if ( aExp == 0xFF ) {
1635 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636 return packFloat128( aSign, 0x7FFF, 0, 0 );
1637 }
1638 if ( aExp == 0 ) {
1639 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641 --aExp;
1642 }
1643 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1644
1645 }
1646
1647 #endif
1648
1649 /*----------------------------------------------------------------------------
1650 | Rounds the single-precision floating-point value `a' to an integer, and
1651 | returns the result as a single-precision floating-point value. The
1652 | operation is performed according to the IEC/IEEE Standard for Binary
1653 | Floating-Point Arithmetic.
1654 *----------------------------------------------------------------------------*/
1655
1656 float32 float32_round_to_int( float32 a STATUS_PARAM)
1657 {
1658 flag aSign;
1659 int16 aExp;
1660 uint32_t lastBitMask, roundBitsMask;
1661 int8 roundingMode;
1662 uint32_t z;
1663 a = float32_squash_input_denormal(a STATUS_VAR);
1664
1665 aExp = extractFloat32Exp( a );
1666 if ( 0x96 <= aExp ) {
1667 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1668 return propagateFloat32NaN( a, a STATUS_VAR );
1669 }
1670 return a;
1671 }
1672 if ( aExp <= 0x7E ) {
1673 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1674 STATUS(float_exception_flags) |= float_flag_inexact;
1675 aSign = extractFloat32Sign( a );
1676 switch ( STATUS(float_rounding_mode) ) {
1677 case float_round_nearest_even:
1678 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1679 return packFloat32( aSign, 0x7F, 0 );
1680 }
1681 break;
1682 case float_round_down:
1683 return make_float32(aSign ? 0xBF800000 : 0);
1684 case float_round_up:
1685 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1686 }
1687 return packFloat32( aSign, 0, 0 );
1688 }
1689 lastBitMask = 1;
1690 lastBitMask <<= 0x96 - aExp;
1691 roundBitsMask = lastBitMask - 1;
1692 z = float32_val(a);
1693 roundingMode = STATUS(float_rounding_mode);
1694 if ( roundingMode == float_round_nearest_even ) {
1695 z += lastBitMask>>1;
1696 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1697 }
1698 else if ( roundingMode != float_round_to_zero ) {
1699 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1700 z += roundBitsMask;
1701 }
1702 }
1703 z &= ~ roundBitsMask;
1704 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1705 return make_float32(z);
1706
1707 }
1708
1709 /*----------------------------------------------------------------------------
1710 | Returns the result of adding the absolute values of the single-precision
1711 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1712 | before being returned. `zSign' is ignored if the result is a NaN.
1713 | The addition is performed according to the IEC/IEEE Standard for Binary
1714 | Floating-Point Arithmetic.
1715 *----------------------------------------------------------------------------*/
1716
1717 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1718 {
1719 int16 aExp, bExp, zExp;
1720 uint32_t aSig, bSig, zSig;
1721 int16 expDiff;
1722
1723 aSig = extractFloat32Frac( a );
1724 aExp = extractFloat32Exp( a );
1725 bSig = extractFloat32Frac( b );
1726 bExp = extractFloat32Exp( b );
1727 expDiff = aExp - bExp;
1728 aSig <<= 6;
1729 bSig <<= 6;
1730 if ( 0 < expDiff ) {
1731 if ( aExp == 0xFF ) {
1732 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1733 return a;
1734 }
1735 if ( bExp == 0 ) {
1736 --expDiff;
1737 }
1738 else {
1739 bSig |= 0x20000000;
1740 }
1741 shift32RightJamming( bSig, expDiff, &bSig );
1742 zExp = aExp;
1743 }
1744 else if ( expDiff < 0 ) {
1745 if ( bExp == 0xFF ) {
1746 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1747 return packFloat32( zSign, 0xFF, 0 );
1748 }
1749 if ( aExp == 0 ) {
1750 ++expDiff;
1751 }
1752 else {
1753 aSig |= 0x20000000;
1754 }
1755 shift32RightJamming( aSig, - expDiff, &aSig );
1756 zExp = bExp;
1757 }
1758 else {
1759 if ( aExp == 0xFF ) {
1760 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1761 return a;
1762 }
1763 if ( aExp == 0 ) {
1764 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1765 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1766 }
1767 zSig = 0x40000000 + aSig + bSig;
1768 zExp = aExp;
1769 goto roundAndPack;
1770 }
1771 aSig |= 0x20000000;
1772 zSig = ( aSig + bSig )<<1;
1773 --zExp;
1774 if ( (int32_t) zSig < 0 ) {
1775 zSig = aSig + bSig;
1776 ++zExp;
1777 }
1778 roundAndPack:
1779 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1780
1781 }
1782
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of subtracting the absolute values of the single-
1785 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1786 | difference is negated before being returned. `zSign' is ignored if the
1787 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1788 | Standard for Binary Floating-Point Arithmetic.
1789 *----------------------------------------------------------------------------*/
1790
1791 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1792 {
1793 int16 aExp, bExp, zExp;
1794 uint32_t aSig, bSig, zSig;
1795 int16 expDiff;
1796
1797 aSig = extractFloat32Frac( a );
1798 aExp = extractFloat32Exp( a );
1799 bSig = extractFloat32Frac( b );
1800 bExp = extractFloat32Exp( b );
1801 expDiff = aExp - bExp;
1802 aSig <<= 7;
1803 bSig <<= 7;
1804 if ( 0 < expDiff ) goto aExpBigger;
1805 if ( expDiff < 0 ) goto bExpBigger;
1806 if ( aExp == 0xFF ) {
1807 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1808 float_raise( float_flag_invalid STATUS_VAR);
1809 return float32_default_nan;
1810 }
1811 if ( aExp == 0 ) {
1812 aExp = 1;
1813 bExp = 1;
1814 }
1815 if ( bSig < aSig ) goto aBigger;
1816 if ( aSig < bSig ) goto bBigger;
1817 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1818 bExpBigger:
1819 if ( bExp == 0xFF ) {
1820 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1821 return packFloat32( zSign ^ 1, 0xFF, 0 );
1822 }
1823 if ( aExp == 0 ) {
1824 ++expDiff;
1825 }
1826 else {
1827 aSig |= 0x40000000;
1828 }
1829 shift32RightJamming( aSig, - expDiff, &aSig );
1830 bSig |= 0x40000000;
1831 bBigger:
1832 zSig = bSig - aSig;
1833 zExp = bExp;
1834 zSign ^= 1;
1835 goto normalizeRoundAndPack;
1836 aExpBigger:
1837 if ( aExp == 0xFF ) {
1838 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1839 return a;
1840 }
1841 if ( bExp == 0 ) {
1842 --expDiff;
1843 }
1844 else {
1845 bSig |= 0x40000000;
1846 }
1847 shift32RightJamming( bSig, expDiff, &bSig );
1848 aSig |= 0x40000000;
1849 aBigger:
1850 zSig = aSig - bSig;
1851 zExp = aExp;
1852 normalizeRoundAndPack:
1853 --zExp;
1854 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1855
1856 }
1857
1858 /*----------------------------------------------------------------------------
1859 | Returns the result of adding the single-precision floating-point values `a'
1860 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1861 | Binary Floating-Point Arithmetic.
1862 *----------------------------------------------------------------------------*/
1863
1864 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1865 {
1866 flag aSign, bSign;
1867 a = float32_squash_input_denormal(a STATUS_VAR);
1868 b = float32_squash_input_denormal(b STATUS_VAR);
1869
1870 aSign = extractFloat32Sign( a );
1871 bSign = extractFloat32Sign( b );
1872 if ( aSign == bSign ) {
1873 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1874 }
1875 else {
1876 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1877 }
1878
1879 }
1880
1881 /*----------------------------------------------------------------------------
1882 | Returns the result of subtracting the single-precision floating-point values
1883 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1884 | for Binary Floating-Point Arithmetic.
1885 *----------------------------------------------------------------------------*/
1886
1887 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1888 {
1889 flag aSign, bSign;
1890 a = float32_squash_input_denormal(a STATUS_VAR);
1891 b = float32_squash_input_denormal(b STATUS_VAR);
1892
1893 aSign = extractFloat32Sign( a );
1894 bSign = extractFloat32Sign( b );
1895 if ( aSign == bSign ) {
1896 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1897 }
1898 else {
1899 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1900 }
1901
1902 }
1903
1904 /*----------------------------------------------------------------------------
1905 | Returns the result of multiplying the single-precision floating-point values
1906 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1907 | for Binary Floating-Point Arithmetic.
1908 *----------------------------------------------------------------------------*/
1909
1910 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1911 {
1912 flag aSign, bSign, zSign;
1913 int16 aExp, bExp, zExp;
1914 uint32_t aSig, bSig;
1915 uint64_t zSig64;
1916 uint32_t zSig;
1917
1918 a = float32_squash_input_denormal(a STATUS_VAR);
1919 b = float32_squash_input_denormal(b STATUS_VAR);
1920
1921 aSig = extractFloat32Frac( a );
1922 aExp = extractFloat32Exp( a );
1923 aSign = extractFloat32Sign( a );
1924 bSig = extractFloat32Frac( b );
1925 bExp = extractFloat32Exp( b );
1926 bSign = extractFloat32Sign( b );
1927 zSign = aSign ^ bSign;
1928 if ( aExp == 0xFF ) {
1929 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1930 return propagateFloat32NaN( a, b STATUS_VAR );
1931 }
1932 if ( ( bExp | bSig ) == 0 ) {
1933 float_raise( float_flag_invalid STATUS_VAR);
1934 return float32_default_nan;
1935 }
1936 return packFloat32( zSign, 0xFF, 0 );
1937 }
1938 if ( bExp == 0xFF ) {
1939 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1940 if ( ( aExp | aSig ) == 0 ) {
1941 float_raise( float_flag_invalid STATUS_VAR);
1942 return float32_default_nan;
1943 }
1944 return packFloat32( zSign, 0xFF, 0 );
1945 }
1946 if ( aExp == 0 ) {
1947 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1948 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1949 }
1950 if ( bExp == 0 ) {
1951 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1952 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1953 }
1954 zExp = aExp + bExp - 0x7F;
1955 aSig = ( aSig | 0x00800000 )<<7;
1956 bSig = ( bSig | 0x00800000 )<<8;
1957 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1958 zSig = zSig64;
1959 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1960 zSig <<= 1;
1961 --zExp;
1962 }
1963 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1964
1965 }
1966
1967 /*----------------------------------------------------------------------------
1968 | Returns the result of dividing the single-precision floating-point value `a'
1969 | by the corresponding value `b'. The operation is performed according to the
1970 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971 *----------------------------------------------------------------------------*/
1972
1973 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1974 {
1975 flag aSign, bSign, zSign;
1976 int16 aExp, bExp, zExp;
1977 uint32_t aSig, bSig, zSig;
1978 a = float32_squash_input_denormal(a STATUS_VAR);
1979 b = float32_squash_input_denormal(b STATUS_VAR);
1980
1981 aSig = extractFloat32Frac( a );
1982 aExp = extractFloat32Exp( a );
1983 aSign = extractFloat32Sign( a );
1984 bSig = extractFloat32Frac( b );
1985 bExp = extractFloat32Exp( b );
1986 bSign = extractFloat32Sign( b );
1987 zSign = aSign ^ bSign;
1988 if ( aExp == 0xFF ) {
1989 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1990 if ( bExp == 0xFF ) {
1991 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1992 float_raise( float_flag_invalid STATUS_VAR);
1993 return float32_default_nan;
1994 }
1995 return packFloat32( zSign, 0xFF, 0 );
1996 }
1997 if ( bExp == 0xFF ) {
1998 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1999 return packFloat32( zSign, 0, 0 );
2000 }
2001 if ( bExp == 0 ) {
2002 if ( bSig == 0 ) {
2003 if ( ( aExp | aSig ) == 0 ) {
2004 float_raise( float_flag_invalid STATUS_VAR);
2005 return float32_default_nan;
2006 }
2007 float_raise( float_flag_divbyzero STATUS_VAR);
2008 return packFloat32( zSign, 0xFF, 0 );
2009 }
2010 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2011 }
2012 if ( aExp == 0 ) {
2013 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2014 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2015 }
2016 zExp = aExp - bExp + 0x7D;
2017 aSig = ( aSig | 0x00800000 )<<7;
2018 bSig = ( bSig | 0x00800000 )<<8;
2019 if ( bSig <= ( aSig + aSig ) ) {
2020 aSig >>= 1;
2021 ++zExp;
2022 }
2023 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2024 if ( ( zSig & 0x3F ) == 0 ) {
2025 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2026 }
2027 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2028
2029 }
2030
2031 /*----------------------------------------------------------------------------
2032 | Returns the remainder of the single-precision floating-point value `a'
2033 | with respect to the corresponding value `b'. The operation is performed
2034 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2035 *----------------------------------------------------------------------------*/
2036
2037 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2038 {
2039 flag aSign, zSign;
2040 int16 aExp, bExp, expDiff;
2041 uint32_t aSig, bSig;
2042 uint32_t q;
2043 uint64_t aSig64, bSig64, q64;
2044 uint32_t alternateASig;
2045 int32_t sigMean;
2046 a = float32_squash_input_denormal(a STATUS_VAR);
2047 b = float32_squash_input_denormal(b STATUS_VAR);
2048
2049 aSig = extractFloat32Frac( a );
2050 aExp = extractFloat32Exp( a );
2051 aSign = extractFloat32Sign( a );
2052 bSig = extractFloat32Frac( b );
2053 bExp = extractFloat32Exp( b );
2054 if ( aExp == 0xFF ) {
2055 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2056 return propagateFloat32NaN( a, b STATUS_VAR );
2057 }
2058 float_raise( float_flag_invalid STATUS_VAR);
2059 return float32_default_nan;
2060 }
2061 if ( bExp == 0xFF ) {
2062 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063 return a;
2064 }
2065 if ( bExp == 0 ) {
2066 if ( bSig == 0 ) {
2067 float_raise( float_flag_invalid STATUS_VAR);
2068 return float32_default_nan;
2069 }
2070 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2071 }
2072 if ( aExp == 0 ) {
2073 if ( aSig == 0 ) return a;
2074 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2075 }
2076 expDiff = aExp - bExp;
2077 aSig |= 0x00800000;
2078 bSig |= 0x00800000;
2079 if ( expDiff < 32 ) {
2080 aSig <<= 8;
2081 bSig <<= 8;
2082 if ( expDiff < 0 ) {
2083 if ( expDiff < -1 ) return a;
2084 aSig >>= 1;
2085 }
2086 q = ( bSig <= aSig );
2087 if ( q ) aSig -= bSig;
2088 if ( 0 < expDiff ) {
2089 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2090 q >>= 32 - expDiff;
2091 bSig >>= 2;
2092 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2093 }
2094 else {
2095 aSig >>= 2;
2096 bSig >>= 2;
2097 }
2098 }
2099 else {
2100 if ( bSig <= aSig ) aSig -= bSig;
2101 aSig64 = ( (uint64_t) aSig )<<40;
2102 bSig64 = ( (uint64_t) bSig )<<40;
2103 expDiff -= 64;
2104 while ( 0 < expDiff ) {
2105 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2106 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2107 aSig64 = - ( ( bSig * q64 )<<38 );
2108 expDiff -= 62;
2109 }
2110 expDiff += 64;
2111 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2112 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2113 q = q64>>( 64 - expDiff );
2114 bSig <<= 6;
2115 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2116 }
2117 do {
2118 alternateASig = aSig;
2119 ++q;
2120 aSig -= bSig;
2121 } while ( 0 <= (int32_t) aSig );
2122 sigMean = aSig + alternateASig;
2123 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2124 aSig = alternateASig;
2125 }
2126 zSign = ( (int32_t) aSig < 0 );
2127 if ( zSign ) aSig = - aSig;
2128 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2129
2130 }
2131
2132 /*----------------------------------------------------------------------------
2133 | Returns the square root of the single-precision floating-point value `a'.
2134 | The operation is performed according to the IEC/IEEE Standard for Binary
2135 | Floating-Point Arithmetic.
2136 *----------------------------------------------------------------------------*/
2137
2138 float32 float32_sqrt( float32 a STATUS_PARAM )
2139 {
2140 flag aSign;
2141 int16 aExp, zExp;
2142 uint32_t aSig, zSig;
2143 uint64_t rem, term;
2144 a = float32_squash_input_denormal(a STATUS_VAR);
2145
2146 aSig = extractFloat32Frac( a );
2147 aExp = extractFloat32Exp( a );
2148 aSign = extractFloat32Sign( a );
2149 if ( aExp == 0xFF ) {
2150 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2151 if ( ! aSign ) return a;
2152 float_raise( float_flag_invalid STATUS_VAR);
2153 return float32_default_nan;
2154 }
2155 if ( aSign ) {
2156 if ( ( aExp | aSig ) == 0 ) return a;
2157 float_raise( float_flag_invalid STATUS_VAR);
2158 return float32_default_nan;
2159 }
2160 if ( aExp == 0 ) {
2161 if ( aSig == 0 ) return float32_zero;
2162 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2163 }
2164 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2165 aSig = ( aSig | 0x00800000 )<<8;
2166 zSig = estimateSqrt32( aExp, aSig ) + 2;
2167 if ( ( zSig & 0x7F ) <= 5 ) {
2168 if ( zSig < 2 ) {
2169 zSig = 0x7FFFFFFF;
2170 goto roundAndPack;
2171 }
2172 aSig >>= aExp & 1;
2173 term = ( (uint64_t) zSig ) * zSig;
2174 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2175 while ( (int64_t) rem < 0 ) {
2176 --zSig;
2177 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2178 }
2179 zSig |= ( rem != 0 );
2180 }
2181 shift32RightJamming( zSig, 1, &zSig );
2182 roundAndPack:
2183 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2184
2185 }
2186
2187 /*----------------------------------------------------------------------------
2188 | Returns the binary exponential of the single-precision floating-point value
2189 | `a'. The operation is performed according to the IEC/IEEE Standard for
2190 | Binary Floating-Point Arithmetic.
2191 |
2192 | Uses the following identities:
2193 |
2194 | 1. -------------------------------------------------------------------------
2195 | x x*ln(2)
2196 | 2 = e
2197 |
2198 | 2. -------------------------------------------------------------------------
2199 | 2 3 4 5 n
2200 | x x x x x x x
2201 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2202 | 1! 2! 3! 4! 5! n!
2203 *----------------------------------------------------------------------------*/
2204
2205 static const float64 float32_exp2_coefficients[15] =
2206 {
2207 const_float64( 0x3ff0000000000000ll ), /* 1 */
2208 const_float64( 0x3fe0000000000000ll ), /* 2 */
2209 const_float64( 0x3fc5555555555555ll ), /* 3 */
2210 const_float64( 0x3fa5555555555555ll ), /* 4 */
2211 const_float64( 0x3f81111111111111ll ), /* 5 */
2212 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2213 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2214 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2215 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2216 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2217 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2218 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2219 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2220 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2221 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2222 };
2223
2224 float32 float32_exp2( float32 a STATUS_PARAM )
2225 {
2226 flag aSign;
2227 int16 aExp;
2228 uint32_t aSig;
2229 float64 r, x, xn;
2230 int i;
2231 a = float32_squash_input_denormal(a STATUS_VAR);
2232
2233 aSig = extractFloat32Frac( a );
2234 aExp = extractFloat32Exp( a );
2235 aSign = extractFloat32Sign( a );
2236
2237 if ( aExp == 0xFF) {
2238 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2239 return (aSign) ? float32_zero : a;
2240 }
2241 if (aExp == 0) {
2242 if (aSig == 0) return float32_one;
2243 }
2244
2245 float_raise( float_flag_inexact STATUS_VAR);
2246
2247 /* ******************************* */
2248 /* using float64 for approximation */
2249 /* ******************************* */
2250 x = float32_to_float64(a STATUS_VAR);
2251 x = float64_mul(x, float64_ln2 STATUS_VAR);
2252
2253 xn = x;
2254 r = float64_one;
2255 for (i = 0 ; i < 15 ; i++) {
2256 float64 f;
2257
2258 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2259 r = float64_add(r, f STATUS_VAR);
2260
2261 xn = float64_mul(xn, x STATUS_VAR);
2262 }
2263
2264 return float64_to_float32(r, status);
2265 }
2266
2267 /*----------------------------------------------------------------------------
2268 | Returns the binary log of the single-precision floating-point value `a'.
2269 | The operation is performed according to the IEC/IEEE Standard for Binary
2270 | Floating-Point Arithmetic.
2271 *----------------------------------------------------------------------------*/
2272 float32 float32_log2( float32 a STATUS_PARAM )
2273 {
2274 flag aSign, zSign;
2275 int16 aExp;
2276 uint32_t aSig, zSig, i;
2277
2278 a = float32_squash_input_denormal(a STATUS_VAR);
2279 aSig = extractFloat32Frac( a );
2280 aExp = extractFloat32Exp( a );
2281 aSign = extractFloat32Sign( a );
2282
2283 if ( aExp == 0 ) {
2284 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2285 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2286 }
2287 if ( aSign ) {
2288 float_raise( float_flag_invalid STATUS_VAR);
2289 return float32_default_nan;
2290 }
2291 if ( aExp == 0xFF ) {
2292 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2293 return a;
2294 }
2295
2296 aExp -= 0x7F;
2297 aSig |= 0x00800000;
2298 zSign = aExp < 0;
2299 zSig = aExp << 23;
2300
2301 for (i = 1 << 22; i > 0; i >>= 1) {
2302 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2303 if ( aSig & 0x01000000 ) {
2304 aSig >>= 1;
2305 zSig |= i;
2306 }
2307 }
2308
2309 if ( zSign )
2310 zSig = -zSig;
2311
2312 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2313 }
2314
2315 /*----------------------------------------------------------------------------
2316 | Returns 1 if the single-precision floating-point value `a' is equal to
2317 | the corresponding value `b', and 0 otherwise. The invalid exception is
2318 | raised if either operand is a NaN. Otherwise, the comparison is performed
2319 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2320 *----------------------------------------------------------------------------*/
2321
2322 int float32_eq( float32 a, float32 b STATUS_PARAM )
2323 {
2324 uint32_t av, bv;
2325 a = float32_squash_input_denormal(a STATUS_VAR);
2326 b = float32_squash_input_denormal(b STATUS_VAR);
2327
2328 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2329 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2330 ) {
2331 float_raise( float_flag_invalid STATUS_VAR);
2332 return 0;
2333 }
2334 av = float32_val(a);
2335 bv = float32_val(b);
2336 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2337 }
2338
2339 /*----------------------------------------------------------------------------
2340 | Returns 1 if the single-precision floating-point value `a' is less than
2341 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2343 | Arithmetic.
2344 *----------------------------------------------------------------------------*/
2345
2346 int float32_le( float32 a, float32 b STATUS_PARAM )
2347 {
2348 flag aSign, bSign;
2349 uint32_t av, bv;
2350 a = float32_squash_input_denormal(a STATUS_VAR);
2351 b = float32_squash_input_denormal(b STATUS_VAR);
2352
2353 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2354 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2355 ) {
2356 float_raise( float_flag_invalid STATUS_VAR);
2357 return 0;
2358 }
2359 aSign = extractFloat32Sign( a );
2360 bSign = extractFloat32Sign( b );
2361 av = float32_val(a);
2362 bv = float32_val(b);
2363 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2364 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2365
2366 }
2367
2368 /*----------------------------------------------------------------------------
2369 | Returns 1 if the single-precision floating-point value `a' is less than
2370 | the corresponding value `b', and 0 otherwise. The comparison is performed
2371 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2372 *----------------------------------------------------------------------------*/
2373
2374 int float32_lt( float32 a, float32 b STATUS_PARAM )
2375 {
2376 flag aSign, bSign;
2377 uint32_t av, bv;
2378 a = float32_squash_input_denormal(a STATUS_VAR);
2379 b = float32_squash_input_denormal(b STATUS_VAR);
2380
2381 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2382 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2383 ) {
2384 float_raise( float_flag_invalid STATUS_VAR);
2385 return 0;
2386 }
2387 aSign = extractFloat32Sign( a );
2388 bSign = extractFloat32Sign( b );
2389 av = float32_val(a);
2390 bv = float32_val(b);
2391 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2392 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2393
2394 }
2395
2396 /*----------------------------------------------------------------------------
2397 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2398 | be compared, and 0 otherwise. The comparison is performed according to the
2399 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2400 *----------------------------------------------------------------------------*/
2401
2402 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2403 {
2404 a = float32_squash_input_denormal(a STATUS_VAR);
2405 b = float32_squash_input_denormal(b STATUS_VAR);
2406
2407 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2408 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2409 ) {
2410 float_raise( float_flag_invalid STATUS_VAR);
2411 return 1;
2412 }
2413 return 0;
2414 }
2415
2416 /*----------------------------------------------------------------------------
2417 | Returns 1 if the single-precision floating-point value `a' is equal to
2418 | the corresponding value `b', and 0 otherwise. The comparison is performed
2419 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2420 *----------------------------------------------------------------------------*/
2421
2422 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2423 {
2424 a = float32_squash_input_denormal(a STATUS_VAR);
2425 b = float32_squash_input_denormal(b STATUS_VAR);
2426
2427 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2428 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2429 ) {
2430 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2431 float_raise( float_flag_invalid STATUS_VAR);
2432 }
2433 return 0;
2434 }
2435 return ( float32_val(a) == float32_val(b) ) ||
2436 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2437 }
2438
2439 /*----------------------------------------------------------------------------
2440 | Returns 1 if the single-precision floating-point value `a' is less than or
2441 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2442 | cause an exception. Otherwise, the comparison is performed according to the
2443 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2444 *----------------------------------------------------------------------------*/
2445
2446 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2447 {
2448 flag aSign, bSign;
2449 uint32_t av, bv;
2450 a = float32_squash_input_denormal(a STATUS_VAR);
2451 b = float32_squash_input_denormal(b STATUS_VAR);
2452
2453 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2454 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2455 ) {
2456 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2457 float_raise( float_flag_invalid STATUS_VAR);
2458 }
2459 return 0;
2460 }
2461 aSign = extractFloat32Sign( a );
2462 bSign = extractFloat32Sign( b );
2463 av = float32_val(a);
2464 bv = float32_val(b);
2465 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2466 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2467
2468 }
2469
2470 /*----------------------------------------------------------------------------
2471 | Returns 1 if the single-precision floating-point value `a' is less than
2472 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2473 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2474 | Standard for Binary Floating-Point Arithmetic.
2475 *----------------------------------------------------------------------------*/
2476
2477 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2478 {
2479 flag aSign, bSign;
2480 uint32_t av, bv;
2481 a = float32_squash_input_denormal(a STATUS_VAR);
2482 b = float32_squash_input_denormal(b STATUS_VAR);
2483
2484 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2485 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2486 ) {
2487 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2488 float_raise( float_flag_invalid STATUS_VAR);
2489 }
2490 return 0;
2491 }
2492 aSign = extractFloat32Sign( a );
2493 bSign = extractFloat32Sign( b );
2494 av = float32_val(a);
2495 bv = float32_val(b);
2496 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2497 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2498
2499 }
2500
2501 /*----------------------------------------------------------------------------
2502 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2503 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2504 | comparison is performed according to the IEC/IEEE Standard for Binary
2505 | Floating-Point Arithmetic.
2506 *----------------------------------------------------------------------------*/
2507
2508 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2509 {
2510 a = float32_squash_input_denormal(a STATUS_VAR);
2511 b = float32_squash_input_denormal(b STATUS_VAR);
2512
2513 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2514 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2515 ) {
2516 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2517 float_raise( float_flag_invalid STATUS_VAR);
2518 }
2519 return 1;
2520 }
2521 return 0;
2522 }
2523
2524 /*----------------------------------------------------------------------------
2525 | Returns the result of converting the double-precision floating-point value
2526 | `a' to the 32-bit two's complement integer format. The conversion is
2527 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2528 | Arithmetic---which means in particular that the conversion is rounded
2529 | according to the current rounding mode. If `a' is a NaN, the largest
2530 | positive integer is returned. Otherwise, if the conversion overflows, the
2531 | largest integer with the same sign as `a' is returned.
2532 *----------------------------------------------------------------------------*/
2533
2534 int32 float64_to_int32( float64 a STATUS_PARAM )
2535 {
2536 flag aSign;
2537 int16 aExp, shiftCount;
2538 uint64_t aSig;
2539 a = float64_squash_input_denormal(a STATUS_VAR);
2540
2541 aSig = extractFloat64Frac( a );
2542 aExp = extractFloat64Exp( a );
2543 aSign = extractFloat64Sign( a );
2544 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2545 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2546 shiftCount = 0x42C - aExp;
2547 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2548 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2549
2550 }
2551
2552 /*----------------------------------------------------------------------------
2553 | Returns the result of converting the double-precision floating-point value
2554 | `a' to the 32-bit two's complement integer format. The conversion is
2555 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2556 | Arithmetic, except that the conversion is always rounded toward zero.
2557 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2558 | the conversion overflows, the largest integer with the same sign as `a' is
2559 | returned.
2560 *----------------------------------------------------------------------------*/
2561
2562 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2563 {
2564 flag aSign;
2565 int16 aExp, shiftCount;
2566 uint64_t aSig, savedASig;
2567 int32 z;
2568 a = float64_squash_input_denormal(a STATUS_VAR);
2569
2570 aSig = extractFloat64Frac( a );
2571 aExp = extractFloat64Exp( a );
2572 aSign = extractFloat64Sign( a );
2573 if ( 0x41E < aExp ) {
2574 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2575 goto invalid;
2576 }
2577 else if ( aExp < 0x3FF ) {
2578 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2579 return 0;
2580 }
2581 aSig |= LIT64( 0x0010000000000000 );
2582 shiftCount = 0x433 - aExp;
2583 savedASig = aSig;
2584 aSig >>= shiftCount;
2585 z = aSig;
2586 if ( aSign ) z = - z;
2587 if ( ( z < 0 ) ^ aSign ) {
2588 invalid:
2589 float_raise( float_flag_invalid STATUS_VAR);
2590 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2591 }
2592 if ( ( aSig<<shiftCount ) != savedASig ) {
2593 STATUS(float_exception_flags) |= float_flag_inexact;
2594 }
2595 return z;
2596
2597 }
2598
2599 /*----------------------------------------------------------------------------
2600 | Returns the result of converting the double-precision floating-point value
2601 | `a' to the 16-bit two's complement integer format. The conversion is
2602 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2603 | Arithmetic, except that the conversion is always rounded toward zero.
2604 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2605 | the conversion overflows, the largest integer with the same sign as `a' is
2606 | returned.
2607 *----------------------------------------------------------------------------*/
2608
2609 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2610 {
2611 flag aSign;
2612 int16 aExp, shiftCount;
2613 uint64_t aSig, savedASig;
2614 int32 z;
2615
2616 aSig = extractFloat64Frac( a );
2617 aExp = extractFloat64Exp( a );
2618 aSign = extractFloat64Sign( a );
2619 if ( 0x40E < aExp ) {
2620 if ( ( aExp == 0x7FF ) && aSig ) {
2621 aSign = 0;
2622 }
2623 goto invalid;
2624 }
2625 else if ( aExp < 0x3FF ) {
2626 if ( aExp || aSig ) {
2627 STATUS(float_exception_flags) |= float_flag_inexact;
2628 }
2629 return 0;
2630 }
2631 aSig |= LIT64( 0x0010000000000000 );
2632 shiftCount = 0x433 - aExp;
2633 savedASig = aSig;
2634 aSig >>= shiftCount;
2635 z = aSig;
2636 if ( aSign ) {
2637 z = - z;
2638 }
2639 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2640 invalid:
2641 float_raise( float_flag_invalid STATUS_VAR);
2642 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2643 }
2644 if ( ( aSig<<shiftCount ) != savedASig ) {
2645 STATUS(float_exception_flags) |= float_flag_inexact;
2646 }
2647 return z;
2648 }
2649
2650 /*----------------------------------------------------------------------------
2651 | Returns the result of converting the double-precision floating-point value
2652 | `a' to the 64-bit two's complement integer format. The conversion is
2653 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2654 | Arithmetic---which means in particular that the conversion is rounded
2655 | according to the current rounding mode. If `a' is a NaN, the largest
2656 | positive integer is returned. Otherwise, if the conversion overflows, the
2657 | largest integer with the same sign as `a' is returned.
2658 *----------------------------------------------------------------------------*/
2659
2660 int64 float64_to_int64( float64 a STATUS_PARAM )
2661 {
2662 flag aSign;
2663 int16 aExp, shiftCount;
2664 uint64_t aSig, aSigExtra;
2665 a = float64_squash_input_denormal(a STATUS_VAR);
2666
2667 aSig = extractFloat64Frac( a );
2668 aExp = extractFloat64Exp( a );
2669 aSign = extractFloat64Sign( a );
2670 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2671 shiftCount = 0x433 - aExp;
2672 if ( shiftCount <= 0 ) {
2673 if ( 0x43E < aExp ) {
2674 float_raise( float_flag_invalid STATUS_VAR);
2675 if ( ! aSign
2676 || ( ( aExp == 0x7FF )
2677 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2678 ) {
2679 return LIT64( 0x7FFFFFFFFFFFFFFF );
2680 }
2681 return (int64_t) LIT64( 0x8000000000000000 );
2682 }
2683 aSigExtra = 0;
2684 aSig <<= - shiftCount;
2685 }
2686 else {
2687 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2688 }
2689 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2690
2691 }
2692
2693 /*----------------------------------------------------------------------------
2694 | Returns the result of converting the double-precision floating-point value
2695 | `a' to the 64-bit two's complement integer format. The conversion is
2696 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2697 | Arithmetic, except that the conversion is always rounded toward zero.
2698 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2699 | the conversion overflows, the largest integer with the same sign as `a' is
2700 | returned.
2701 *----------------------------------------------------------------------------*/
2702
2703 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2704 {
2705 flag aSign;
2706 int16 aExp, shiftCount;
2707 uint64_t aSig;
2708 int64 z;
2709 a = float64_squash_input_denormal(a STATUS_VAR);
2710
2711 aSig = extractFloat64Frac( a );
2712 aExp = extractFloat64Exp( a );
2713 aSign = extractFloat64Sign( a );
2714 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2715 shiftCount = aExp - 0x433;
2716 if ( 0 <= shiftCount ) {
2717 if ( 0x43E <= aExp ) {
2718 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2719 float_raise( float_flag_invalid STATUS_VAR);
2720 if ( ! aSign
2721 || ( ( aExp == 0x7FF )
2722 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2723 ) {
2724 return LIT64( 0x7FFFFFFFFFFFFFFF );
2725 }
2726 }
2727 return (int64_t) LIT64( 0x8000000000000000 );
2728 }
2729 z = aSig<<shiftCount;
2730 }
2731 else {
2732 if ( aExp < 0x3FE ) {
2733 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2734 return 0;
2735 }
2736 z = aSig>>( - shiftCount );
2737 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2738 STATUS(float_exception_flags) |= float_flag_inexact;
2739 }
2740 }
2741 if ( aSign ) z = - z;
2742 return z;
2743
2744 }
2745
2746 /*----------------------------------------------------------------------------
2747 | Returns the result of converting the double-precision floating-point value
2748 | `a' to the single-precision floating-point format. The conversion is
2749 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2750 | Arithmetic.
2751 *----------------------------------------------------------------------------*/
2752
2753 float32 float64_to_float32( float64 a STATUS_PARAM )
2754 {
2755 flag aSign;
2756 int16 aExp;
2757 uint64_t aSig;
2758 uint32_t zSig;
2759 a = float64_squash_input_denormal(a STATUS_VAR);
2760
2761 aSig = extractFloat64Frac( a );
2762 aExp = extractFloat64Exp( a );
2763 aSign = extractFloat64Sign( a );
2764 if ( aExp == 0x7FF ) {
2765 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2766 return packFloat32( aSign, 0xFF, 0 );
2767 }
2768 shift64RightJamming( aSig, 22, &aSig );
2769 zSig = aSig;
2770 if ( aExp || zSig ) {
2771 zSig |= 0x40000000;
2772 aExp -= 0x381;
2773 }
2774 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2775
2776 }
2777
2778
2779 /*----------------------------------------------------------------------------
2780 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2781 | half-precision floating-point value, returning the result. After being
2782 | shifted into the proper positions, the three fields are simply added
2783 | together to form the result. This means that any integer portion of `zSig'
2784 | will be added into the exponent. Since a properly normalized significand
2785 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2786 | than the desired result exponent whenever `zSig' is a complete, normalized
2787 | significand.
2788 *----------------------------------------------------------------------------*/
2789 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2790 {
2791 return make_float16(
2792 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2793 }
2794
2795 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2796 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2797
2798 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2799 {
2800 flag aSign;
2801 int16 aExp;
2802 uint32_t aSig;
2803
2804 aSign = extractFloat16Sign(a);
2805 aExp = extractFloat16Exp(a);
2806 aSig = extractFloat16Frac(a);
2807
2808 if (aExp == 0x1f && ieee) {
2809 if (aSig) {
2810 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2811 }
2812 return packFloat32(aSign, 0xff, aSig << 13);
2813 }
2814 if (aExp == 0) {
2815 int8 shiftCount;
2816
2817 if (aSig == 0) {
2818 return packFloat32(aSign, 0, 0);
2819 }
2820
2821 shiftCount = countLeadingZeros32( aSig ) - 21;
2822 aSig = aSig << shiftCount;
2823 aExp = -shiftCount;
2824 }
2825 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2826 }
2827
2828 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2829 {
2830 flag aSign;
2831 int16 aExp;
2832 uint32_t aSig;
2833 uint32_t mask;
2834 uint32_t increment;
2835 int8 roundingMode;
2836 a = float32_squash_input_denormal(a STATUS_VAR);
2837
2838 aSig = extractFloat32Frac( a );
2839 aExp = extractFloat32Exp( a );
2840 aSign = extractFloat32Sign( a );
2841 if ( aExp == 0xFF ) {
2842 if (aSig) {
2843 /* Input is a NaN */
2844 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2845 if (!ieee) {
2846 return packFloat16(aSign, 0, 0);
2847 }
2848 return r;
2849 }
2850 /* Infinity */
2851 if (!ieee) {
2852 float_raise(float_flag_invalid STATUS_VAR);
2853 return packFloat16(aSign, 0x1f, 0x3ff);
2854 }
2855 return packFloat16(aSign, 0x1f, 0);
2856 }
2857 if (aExp == 0 && aSig == 0) {
2858 return packFloat16(aSign, 0, 0);
2859 }
2860 /* Decimal point between bits 22 and 23. */
2861 aSig |= 0x00800000;
2862 aExp -= 0x7f;
2863 if (aExp < -14) {
2864 mask = 0x00ffffff;
2865 if (aExp >= -24) {
2866 mask >>= 25 + aExp;
2867 }
2868 } else {
2869 mask = 0x00001fff;
2870 }
2871 if (aSig & mask) {
2872 float_raise( float_flag_underflow STATUS_VAR );
2873 roundingMode = STATUS(float_rounding_mode);
2874 switch (roundingMode) {
2875 case float_round_nearest_even:
2876 increment = (mask + 1) >> 1;
2877 if ((aSig & mask) == increment) {
2878 increment = aSig & (increment << 1);
2879 }
2880 break;
2881 case float_round_up:
2882 increment = aSign ? 0 : mask;
2883 break;
2884 case float_round_down:
2885 increment = aSign ? mask : 0;
2886 break;
2887 default: /* round_to_zero */
2888 increment = 0;
2889 break;
2890 }
2891 aSig += increment;
2892 if (aSig >= 0x01000000) {
2893 aSig >>= 1;
2894 aExp++;
2895 }
2896 } else if (aExp < -14
2897 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2898 float_raise( float_flag_underflow STATUS_VAR);
2899 }
2900
2901 if (ieee) {
2902 if (aExp > 15) {
2903 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2904 return packFloat16(aSign, 0x1f, 0);
2905 }
2906 } else {
2907 if (aExp > 16) {
2908 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2909 return packFloat16(aSign, 0x1f, 0x3ff);
2910 }
2911 }
2912 if (aExp < -24) {
2913 return packFloat16(aSign, 0, 0);
2914 }
2915 if (aExp < -14) {
2916 aSig >>= -14 - aExp;
2917 aExp = -14;
2918 }
2919 return packFloat16(aSign, aExp + 14, aSig >> 13);
2920 }
2921
2922 #ifdef FLOATX80
2923
2924 /*----------------------------------------------------------------------------
2925 | Returns the result of converting the double-precision floating-point value
2926 | `a' to the extended double-precision floating-point format. The conversion
2927 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2928 | Arithmetic.
2929 *----------------------------------------------------------------------------*/
2930
2931 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2932 {
2933 flag aSign;
2934 int16 aExp;
2935 uint64_t aSig;
2936
2937 a = float64_squash_input_denormal(a STATUS_VAR);
2938 aSig = extractFloat64Frac( a );
2939 aExp = extractFloat64Exp( a );
2940 aSign = extractFloat64Sign( a );
2941 if ( aExp == 0x7FF ) {
2942 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2943 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2944 }
2945 if ( aExp == 0 ) {
2946 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2947 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2948 }
2949 return
2950 packFloatx80(
2951 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2952
2953 }
2954
2955 #endif
2956
2957 #ifdef FLOAT128
2958
2959 /*----------------------------------------------------------------------------
2960 | Returns the result of converting the double-precision floating-point value
2961 | `a' to the quadruple-precision floating-point format. The conversion is
2962 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2963 | Arithmetic.
2964 *----------------------------------------------------------------------------*/
2965
2966 float128 float64_to_float128( float64 a STATUS_PARAM )
2967 {
2968 flag aSign;
2969 int16 aExp;
2970 uint64_t aSig, zSig0, zSig1;
2971
2972 a = float64_squash_input_denormal(a STATUS_VAR);
2973 aSig = extractFloat64Frac( a );
2974 aExp = extractFloat64Exp( a );
2975 aSign = extractFloat64Sign( a );
2976 if ( aExp == 0x7FF ) {
2977 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2978 return packFloat128( aSign, 0x7FFF, 0, 0 );
2979 }
2980 if ( aExp == 0 ) {
2981 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2982 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2983 --aExp;
2984 }
2985 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2986 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2987
2988 }
2989
2990 #endif
2991
2992 /*----------------------------------------------------------------------------
2993 | Rounds the double-precision floating-point value `a' to an integer, and
2994 | returns the result as a double-precision floating-point value. The
2995 | operation is performed according to the IEC/IEEE Standard for Binary
2996 | Floating-Point Arithmetic.
2997 *----------------------------------------------------------------------------*/
2998
2999 float64 float64_round_to_int( float64 a STATUS_PARAM )
3000 {
3001 flag aSign;
3002 int16 aExp;
3003 uint64_t lastBitMask, roundBitsMask;
3004 int8 roundingMode;
3005 uint64_t z;
3006 a = float64_squash_input_denormal(a STATUS_VAR);
3007
3008 aExp = extractFloat64Exp( a );
3009 if ( 0x433 <= aExp ) {
3010 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3011 return propagateFloat64NaN( a, a STATUS_VAR );
3012 }
3013 return a;
3014 }
3015 if ( aExp < 0x3FF ) {
3016 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3017 STATUS(float_exception_flags) |= float_flag_inexact;
3018 aSign = extractFloat64Sign( a );
3019 switch ( STATUS(float_rounding_mode) ) {
3020 case float_round_nearest_even:
3021 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3022 return packFloat64( aSign, 0x3FF, 0 );
3023 }
3024 break;
3025 case float_round_down:
3026 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3027 case float_round_up:
3028 return make_float64(
3029 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3030 }
3031 return packFloat64( aSign, 0, 0 );
3032 }
3033 lastBitMask = 1;
3034 lastBitMask <<= 0x433 - aExp;
3035 roundBitsMask = lastBitMask - 1;
3036 z = float64_val(a);
3037 roundingMode = STATUS(float_rounding_mode);
3038 if ( roundingMode == float_round_nearest_even ) {
3039 z += lastBitMask>>1;
3040 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3041 }
3042 else if ( roundingMode != float_round_to_zero ) {
3043 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3044 z += roundBitsMask;
3045 }
3046 }
3047 z &= ~ roundBitsMask;
3048 if ( z != float64_val(a) )
3049 STATUS(float_exception_flags) |= float_flag_inexact;
3050 return make_float64(z);
3051
3052 }
3053
3054 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3055 {
3056 int oldmode;
3057 float64 res;
3058 oldmode = STATUS(float_rounding_mode);
3059 STATUS(float_rounding_mode) = float_round_to_zero;
3060 res = float64_round_to_int(a STATUS_VAR);
3061 STATUS(float_rounding_mode) = oldmode;
3062 return res;
3063 }
3064
3065 /*----------------------------------------------------------------------------
3066 | Returns the result of adding the absolute values of the double-precision
3067 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3068 | before being returned. `zSign' is ignored if the result is a NaN.
3069 | The addition is performed according to the IEC/IEEE Standard for Binary
3070 | Floating-Point Arithmetic.
3071 *----------------------------------------------------------------------------*/
3072
3073 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3074 {
3075 int16 aExp, bExp, zExp;
3076 uint64_t aSig, bSig, zSig;
3077 int16 expDiff;
3078
3079 aSig = extractFloat64Frac( a );
3080 aExp = extractFloat64Exp( a );
3081 bSig = extractFloat64Frac( b );
3082 bExp = extractFloat64Exp( b );
3083 expDiff = aExp - bExp;
3084 aSig <<= 9;
3085 bSig <<= 9;
3086 if ( 0 < expDiff ) {
3087 if ( aExp == 0x7FF ) {
3088 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3089 return a;
3090 }
3091 if ( bExp == 0 ) {
3092 --expDiff;
3093 }
3094 else {
3095 bSig |= LIT64( 0x2000000000000000 );
3096 }
3097 shift64RightJamming( bSig, expDiff, &bSig );
3098 zExp = aExp;
3099 }
3100 else if ( expDiff < 0 ) {
3101 if ( bExp == 0x7FF ) {
3102 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3103 return packFloat64( zSign, 0x7FF, 0 );
3104 }
3105 if ( aExp == 0 ) {
3106 ++expDiff;
3107 }
3108 else {
3109 aSig |= LIT64( 0x2000000000000000 );
3110 }
3111 shift64RightJamming( aSig, - expDiff, &aSig );
3112 zExp = bExp;
3113 }
3114 else {
3115 if ( aExp == 0x7FF ) {
3116 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3117 return a;
3118 }
3119 if ( aExp == 0 ) {
3120 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3121 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3122 }
3123 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3124 zExp = aExp;
3125 goto roundAndPack;
3126 }
3127 aSig |= LIT64( 0x2000000000000000 );
3128 zSig = ( aSig + bSig )<<1;
3129 --zExp;
3130 if ( (int64_t) zSig < 0 ) {
3131 zSig = aSig + bSig;
3132 ++zExp;
3133 }
3134 roundAndPack:
3135 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3136
3137 }
3138
3139 /*----------------------------------------------------------------------------
3140 | Returns the result of subtracting the absolute values of the double-
3141 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3142 | difference is negated before being returned. `zSign' is ignored if the
3143 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3144 | Standard for Binary Floating-Point Arithmetic.
3145 *----------------------------------------------------------------------------*/
3146
3147 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3148 {
3149 int16 aExp, bExp, zExp;
3150 uint64_t aSig, bSig, zSig;
3151 int16 expDiff;
3152
3153 aSig = extractFloat64Frac( a );
3154 aExp = extractFloat64Exp( a );
3155 bSig = extractFloat64Frac( b );
3156 bExp = extractFloat64Exp( b );
3157 expDiff = aExp - bExp;
3158 aSig <<= 10;
3159 bSig <<= 10;
3160 if ( 0 < expDiff ) goto aExpBigger;
3161 if ( expDiff < 0 ) goto bExpBigger;
3162 if ( aExp == 0x7FF ) {
3163 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3164 float_raise( float_flag_invalid STATUS_VAR);
3165 return float64_default_nan;
3166 }
3167 if ( aExp == 0 ) {
3168 aExp = 1;
3169 bExp = 1;
3170 }
3171 if ( bSig < aSig ) goto aBigger;
3172 if ( aSig < bSig ) goto bBigger;
3173 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3174 bExpBigger:
3175 if ( bExp == 0x7FF ) {
3176 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3177 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3178 }
3179 if ( aExp == 0 ) {
3180 ++expDiff;
3181 }
3182 else {
3183 aSig |= LIT64( 0x4000000000000000 );
3184 }
3185 shift64RightJamming( aSig, - expDiff, &aSig );
3186 bSig |= LIT64( 0x4000000000000000 );
3187 bBigger:
3188 zSig = bSig - aSig;
3189 zExp = bExp;
3190 zSign ^= 1;
3191 goto normalizeRoundAndPack;
3192 aExpBigger:
3193 if ( aExp == 0x7FF ) {
3194 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3195 return a;
3196 }
3197 if ( bExp == 0 ) {
3198 --expDiff;
3199 }
3200 else {
3201 bSig |= LIT64( 0x4000000000000000 );
3202 }
3203 shift64RightJamming( bSig, expDiff, &bSig );
3204 aSig |= LIT64( 0x4000000000000000 );
3205 aBigger:
3206 zSig = aSig - bSig;
3207 zExp = aExp;
3208 normalizeRoundAndPack:
3209 --zExp;
3210 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3211
3212 }
3213
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of adding the double-precision floating-point values `a'
3216 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3217 | Binary Floating-Point Arithmetic.
3218 *----------------------------------------------------------------------------*/
3219
3220 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3221 {
3222 flag aSign, bSign;
3223 a = float64_squash_input_denormal(a STATUS_VAR);
3224 b = float64_squash_input_denormal(b STATUS_VAR);
3225
3226 aSign = extractFloat64Sign( a );
3227 bSign = extractFloat64Sign( b );
3228 if ( aSign == bSign ) {
3229 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3230 }
3231 else {
3232 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3233 }
3234
3235 }
3236
3237 /*----------------------------------------------------------------------------
3238 | Returns the result of subtracting the double-precision floating-point values
3239 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3240 | for Binary Floating-Point Arithmetic.
3241 *----------------------------------------------------------------------------*/
3242
3243 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3244 {
3245 flag aSign, bSign;
3246 a = float64_squash_input_denormal(a STATUS_VAR);
3247 b = float64_squash_input_denormal(b STATUS_VAR);
3248
3249 aSign = extractFloat64Sign( a );
3250 bSign = extractFloat64Sign( b );
3251 if ( aSign == bSign ) {
3252 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3253 }
3254 else {
3255 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3256 }
3257
3258 }
3259
3260 /*----------------------------------------------------------------------------
3261 | Returns the result of multiplying the double-precision floating-point values
3262 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3263 | for Binary Floating-Point Arithmetic.
3264 *----------------------------------------------------------------------------*/
3265
3266 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3267 {
3268 flag aSign, bSign, zSign;
3269 int16 aExp, bExp, zExp;
3270 uint64_t aSig, bSig, zSig0, zSig1;
3271
3272 a = float64_squash_input_denormal(a STATUS_VAR);
3273 b = float64_squash_input_denormal(b STATUS_VAR);
3274
3275 aSig = extractFloat64Frac( a );
3276 aExp = extractFloat64Exp( a );
3277 aSign = extractFloat64Sign( a );
3278 bSig = extractFloat64Frac( b );
3279 bExp = extractFloat64Exp( b );
3280 bSign = extractFloat64Sign( b );
3281 zSign = aSign ^ bSign;
3282 if ( aExp == 0x7FF ) {
3283 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3284 return propagateFloat64NaN( a, b STATUS_VAR );
3285 }
3286 if ( ( bExp | bSig ) == 0 ) {
3287 float_raise( float_flag_invalid STATUS_VAR);
3288 return float64_default_nan;
3289 }
3290 return packFloat64( zSign, 0x7FF, 0 );
3291 }
3292 if ( bExp == 0x7FF ) {
3293 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3294 if ( ( aExp | aSig ) == 0 ) {
3295 float_raise( float_flag_invalid STATUS_VAR);
3296 return float64_default_nan;
3297 }
3298 return packFloat64( zSign, 0x7FF, 0 );
3299 }
3300 if ( aExp == 0 ) {
3301 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3302 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3303 }
3304 if ( bExp == 0 ) {
3305 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3306 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3307 }
3308 zExp = aExp + bExp - 0x3FF;
3309 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3310 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3311 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3312 zSig0 |= ( zSig1 != 0 );
3313 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3314 zSig0 <<= 1;
3315 --zExp;
3316 }
3317 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3318
3319 }
3320
3321 /*----------------------------------------------------------------------------
3322 | Returns the result of dividing the double-precision floating-point value `a'
3323 | by the corresponding value `b'. The operation is performed according to
3324 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3325 *----------------------------------------------------------------------------*/
3326
3327 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3328 {
3329 flag aSign, bSign, zSign;
3330 int16 aExp, bExp, zExp;
3331 uint64_t aSig, bSig, zSig;
3332 uint64_t rem0, rem1;
3333 uint64_t term0, term1;
3334 a = float64_squash_input_denormal(a STATUS_VAR);
3335 b = float64_squash_input_denormal(b STATUS_VAR);
3336
3337 aSig = extractFloat64Frac( a );
3338 aExp = extractFloat64Exp( a );
3339 aSign = extractFloat64Sign( a );
3340 bSig = extractFloat64Frac( b );
3341 bExp = extractFloat64Exp( b );
3342 bSign = extractFloat64Sign( b );
3343 zSign = aSign ^ bSign;
3344 if ( aExp == 0x7FF ) {
3345 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3346 if ( bExp == 0x7FF ) {
3347 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3348 float_raise( float_flag_invalid STATUS_VAR);
3349 return float64_default_nan;
3350 }
3351 return packFloat64( zSign, 0x7FF, 0 );
3352 }
3353 if ( bExp == 0x7FF ) {
3354 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3355 return packFloat64( zSign, 0, 0 );
3356 }
3357 if ( bExp == 0 ) {
3358 if ( bSig == 0 ) {
3359 if ( ( aExp | aSig ) == 0 ) {
3360 float_raise( float_flag_invalid STATUS_VAR);
3361 return float64_default_nan;
3362 }
3363 float_raise( float_flag_divbyzero STATUS_VAR);
3364 return packFloat64( zSign, 0x7FF, 0 );
3365 }
3366 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3367 }
3368 if ( aExp == 0 ) {
3369 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3370 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3371 }
3372 zExp = aExp - bExp + 0x3FD;
3373 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3374 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3375 if ( bSig <= ( aSig + aSig ) ) {
3376 aSig >>= 1;
3377 ++zExp;
3378 }
3379 zSig = estimateDiv128To64( aSig, 0, bSig );
3380 if ( ( zSig & 0x1FF ) <= 2 ) {
3381 mul64To128( bSig, zSig, &term0, &term1 );
3382 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3383 while ( (int64_t) rem0 < 0 ) {
3384 --zSig;
3385 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3386 }
3387 zSig |= ( rem1 != 0 );
3388 }
3389 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3390
3391 }
3392
3393 /*----------------------------------------------------------------------------
3394 | Returns the remainder of the double-precision floating-point value `a'
3395 | with respect to the corresponding value `b'. The operation is performed
3396 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3397 *----------------------------------------------------------------------------*/
3398
3399 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3400 {
3401 flag aSign, zSign;
3402 int16 aExp, bExp, expDiff;
3403 uint64_t aSig, bSig;
3404 uint64_t q, alternateASig;
3405 int64_t sigMean;
3406
3407 a = float64_squash_input_denormal(a STATUS_VAR);
3408 b = float64_squash_input_denormal(b STATUS_VAR);
3409 aSig = extractFloat64Frac( a );
3410 aExp = extractFloat64Exp( a );
3411 aSign = extractFloat64Sign( a );
3412 bSig = extractFloat64Frac( b );
3413 bExp = extractFloat64Exp( b );
3414 if ( aExp == 0x7FF ) {
3415 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3416 return propagateFloat64NaN( a, b STATUS_VAR );
3417 }
3418 float_raise( float_flag_invalid STATUS_VAR);
3419 return float64_default_nan;
3420 }
3421 if ( bExp == 0x7FF ) {
3422 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3423 return a;
3424 }
3425 if ( bExp == 0 ) {
3426 if ( bSig == 0 ) {
3427 float_raise( float_flag_invalid STATUS_VAR);
3428 return float64_default_nan;
3429 }
3430 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3431 }
3432 if ( aExp == 0 ) {
3433 if ( aSig == 0 ) return a;
3434 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3435 }
3436 expDiff = aExp - bExp;
3437 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3438 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3439 if ( expDiff < 0 ) {
3440 if ( expDiff < -1 ) return a;
3441 aSig >>= 1;
3442 }
3443 q = ( bSig <= aSig );
3444 if ( q ) aSig -= bSig;
3445 expDiff -= 64;
3446 while ( 0 < expDiff ) {
3447 q = estimateDiv128To64( aSig, 0, bSig );
3448 q = ( 2 < q ) ? q - 2 : 0;
3449 aSig = - ( ( bSig>>2 ) * q );
3450 expDiff -= 62;
3451 }
3452 expDiff += 64;
3453 if ( 0 < expDiff ) {
3454 q = estimateDiv128To64( aSig, 0, bSig );
3455 q = ( 2 < q ) ? q - 2 : 0;
3456 q >>= 64 - expDiff;
3457 bSig >>= 2;
3458 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3459 }
3460 else {
3461 aSig >>= 2;
3462 bSig >>= 2;
3463 }
3464 do {
3465 alternateASig = aSig;
3466 ++q;
3467 aSig -= bSig;
3468 } while ( 0 <= (int64_t) aSig );
3469 sigMean = aSig + alternateASig;
3470 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3471 aSig = alternateASig;
3472 }
3473 zSign = ( (int64_t) aSig < 0 );
3474 if ( zSign ) aSig = - aSig;
3475 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3476
3477 }
3478
3479 /*----------------------------------------------------------------------------
3480 | Returns the square root of the double-precision floating-point value `a'.
3481 | The operation is performed according to the IEC/IEEE Standard for Binary
3482 | Floating-Point Arithmetic.
3483 *----------------------------------------------------------------------------*/
3484
3485 float64 float64_sqrt( float64 a STATUS_PARAM )
3486 {
3487 flag aSign;
3488 int16 aExp, zExp;
3489 uint64_t aSig, zSig, doubleZSig;
3490 uint64_t rem0, rem1, term0, term1;
3491 a = float64_squash_input_denormal(a STATUS_VAR);
3492
3493 aSig = extractFloat64Frac( a );
3494 aExp = extractFloat64Exp( a );
3495 aSign = extractFloat64Sign( a );
3496 if ( aExp == 0x7FF ) {
3497 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3498 if ( ! aSign ) return a;
3499 float_raise( float_flag_invalid STATUS_VAR);
3500 return float64_default_nan;
3501 }
3502 if ( aSign ) {
3503 if ( ( aExp | aSig ) == 0 ) return a;
3504 float_raise( float_flag_invalid STATUS_VAR);
3505 return float64_default_nan;
3506 }
3507 if ( aExp == 0 ) {
3508 if ( aSig == 0 ) return float64_zero;
3509 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3510 }
3511 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3512 aSig |= LIT64( 0x0010000000000000 );
3513 zSig = estimateSqrt32( aExp, aSig>>21 );
3514 aSig <<= 9 - ( aExp & 1 );
3515 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3516 if ( ( zSig & 0x1FF ) <= 5 ) {
3517 doubleZSig = zSig<<1;
3518 mul64To128( zSig, zSig, &term0, &term1 );
3519 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3520 while ( (int64_t) rem0 < 0 ) {
3521 --zSig;
3522 doubleZSig -= 2;
3523 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3524 }
3525 zSig |= ( ( rem0 | rem1 ) != 0 );
3526 }
3527 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3528
3529 }
3530
3531 /*----------------------------------------------------------------------------
3532 | Returns the binary log of the double-precision floating-point value `a'.
3533 | The operation is performed according to the IEC/IEEE Standard for Binary
3534 | Floating-Point Arithmetic.
3535 *----------------------------------------------------------------------------*/
3536 float64 float64_log2( float64 a STATUS_PARAM )
3537 {
3538 flag aSign, zSign;
3539 int16 aExp;
3540 uint64_t aSig, aSig0, aSig1, zSig, i;
3541 a = float64_squash_input_denormal(a STATUS_VAR);
3542
3543 aSig = extractFloat64Frac( a );
3544 aExp = extractFloat64Exp( a );
3545 aSign = extractFloat64Sign( a );
3546
3547 if ( aExp == 0 ) {
3548 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3549 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3550 }
3551 if ( aSign ) {
3552 float_raise( float_flag_invalid STATUS_VAR);
3553 return float64_default_nan;
3554 }
3555 if ( aExp == 0x7FF ) {
3556 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3557 return a;
3558 }
3559
3560 aExp -= 0x3FF;
3561 aSig |= LIT64( 0x0010000000000000 );
3562 zSign = aExp < 0;
3563 zSig = (uint64_t)aExp << 52;
3564 for (i = 1LL << 51; i > 0; i >>= 1) {
3565 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3566 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3567 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3568 aSig >>= 1;
3569 zSig |= i;
3570 }
3571 }
3572
3573 if ( zSign )
3574 zSig = -zSig;
3575 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3576 }
3577
3578 /*----------------------------------------------------------------------------
3579 | Returns 1 if the double-precision floating-point value `a' is equal to the
3580 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3581 | if either operand is a NaN. Otherwise, the comparison is performed
3582 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3583 *----------------------------------------------------------------------------*/
3584
3585 int float64_eq( float64 a, float64 b STATUS_PARAM )
3586 {
3587 uint64_t av, bv;
3588 a = float64_squash_input_denormal(a STATUS_VAR);
3589 b = float64_squash_input_denormal(b STATUS_VAR);
3590
3591 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3592 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3593 ) {
3594 float_raise( float_flag_invalid STATUS_VAR);
3595 return 0;
3596 }
3597 av = float64_val(a);
3598 bv = float64_val(b);
3599 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3600
3601 }
3602
3603 /*----------------------------------------------------------------------------
3604 | Returns 1 if the double-precision floating-point value `a' is less than or
3605 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3606 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3607 | Arithmetic.
3608 *----------------------------------------------------------------------------*/
3609
3610 int float64_le( float64 a, float64 b STATUS_PARAM )
3611 {
3612 flag aSign, bSign;
3613 uint64_t av, bv;
3614 a = float64_squash_input_denormal(a STATUS_VAR);
3615 b = float64_squash_input_denormal(b STATUS_VAR);
3616
3617 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3618 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3619 ) {
3620 float_raise( float_flag_invalid STATUS_VAR);
3621 return 0;
3622 }
3623 aSign = extractFloat64Sign( a );
3624 bSign = extractFloat64Sign( b );
3625 av = float64_val(a);
3626 bv = float64_val(b);
3627 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3628 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3629
3630 }
3631
3632 /*----------------------------------------------------------------------------
3633 | Returns 1 if the double-precision floating-point value `a' is less than
3634 | the corresponding value `b', and 0 otherwise. The comparison is performed
3635 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3636 *----------------------------------------------------------------------------*/
3637
3638 int float64_lt( float64 a, float64 b STATUS_PARAM )
3639 {
3640 flag aSign, bSign;
3641 uint64_t av, bv;
3642
3643 a = float64_squash_input_denormal(a STATUS_VAR);
3644 b = float64_squash_input_denormal(b STATUS_VAR);
3645 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3646 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3647 ) {
3648 float_raise( float_flag_invalid STATUS_VAR);
3649 return 0;
3650 }
3651 aSign = extractFloat64Sign( a );
3652 bSign = extractFloat64Sign( b );
3653 av = float64_val(a);
3654 bv = float64_val(b);
3655 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3656 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3657
3658 }
3659
3660 /*----------------------------------------------------------------------------
3661 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3662 | be compared, and 0 otherwise. The comparison is performed according to the
3663 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3664 *----------------------------------------------------------------------------*/
3665
3666 int float64_unordered( float64 a, float64 b STATUS_PARAM )
3667 {
3668 a = float64_squash_input_denormal(a STATUS_VAR);
3669 b = float64_squash_input_denormal(b STATUS_VAR);
3670
3671 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3672 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3673 ) {
3674 float_raise( float_flag_invalid STATUS_VAR);
3675 return 1;
3676 }
3677 return 0;
3678 }
3679
3680 /*----------------------------------------------------------------------------
3681 | Returns 1 if the double-precision floating-point value `a' is equal to the
3682 | corresponding value `b', and 0 otherwise. The comparison is performed
3683 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3684 *----------------------------------------------------------------------------*/
3685
3686 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3687 {
3688 uint64_t av, bv;
3689 a = float64_squash_input_denormal(a STATUS_VAR);
3690 b = float64_squash_input_denormal(b STATUS_VAR);
3691
3692 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3693 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3694 ) {
3695 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3696 float_raise( float_flag_invalid STATUS_VAR);
3697 }
3698 return 0;
3699 }
3700 av = float64_val(a);
3701 bv = float64_val(b);
3702 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3703
3704 }
3705
3706 /*----------------------------------------------------------------------------
3707 | Returns 1 if the double-precision floating-point value `a' is less than or
3708 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3709 | cause an exception. Otherwise, the comparison is performed according to the
3710 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3711 *----------------------------------------------------------------------------*/
3712
3713 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3714 {
3715 flag aSign, bSign;
3716 uint64_t av, bv;
3717 a = float64_squash_input_denormal(a STATUS_VAR);
3718 b = float64_squash_input_denormal(b STATUS_VAR);
3719
3720 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3721 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3722 ) {
3723 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3724 float_raise( float_flag_invalid STATUS_VAR);
3725 }
3726 return 0;
3727 }
3728 aSign = extractFloat64Sign( a );
3729 bSign = extractFloat64Sign( b );
3730 av = float64_val(a);
3731 bv = float64_val(b);
3732 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3733 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3734
3735 }
3736
3737 /*----------------------------------------------------------------------------
3738 | Returns 1 if the double-precision floating-point value `a' is less than
3739 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3740 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3741 | Standard for Binary Floating-Point Arithmetic.
3742 *----------------------------------------------------------------------------*/
3743
3744 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3745 {
3746 flag aSign, bSign;
3747 uint64_t av, bv;
3748 a = float64_squash_input_denormal(a STATUS_VAR);
3749 b = float64_squash_input_denormal(b STATUS_VAR);
3750
3751 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3752 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3753 ) {
3754 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3755 float_raise( float_flag_invalid STATUS_VAR);
3756 }
3757 return 0;
3758 }
3759 aSign = extractFloat64Sign( a );
3760 bSign = extractFloat64Sign( b );
3761 av = float64_val(a);
3762 bv = float64_val(b);
3763 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3764 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3765
3766 }
3767
3768 /*----------------------------------------------------------------------------
3769 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3770 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3771 | comparison is performed according to the IEC/IEEE Standard for Binary
3772 | Floating-Point Arithmetic.
3773 *----------------------------------------------------------------------------*/
3774
3775 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3776 {
3777 a = float64_squash_input_denormal(a STATUS_VAR);
3778 b = float64_squash_input_denormal(b STATUS_VAR);
3779
3780 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3781 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3782 ) {
3783 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3784 float_raise( float_flag_invalid STATUS_VAR);
3785 }
3786 return 1;
3787 }
3788 return 0;
3789 }
3790
3791 #ifdef FLOATX80
3792
3793 /*----------------------------------------------------------------------------
3794 | Returns the result of converting the extended double-precision floating-
3795 | point value `a' to the 32-bit two's complement integer format. The
3796 | conversion is performed according to the IEC/IEEE Standard for Binary
3797 | Floating-Point Arithmetic---which means in particular that the conversion
3798 | is rounded according to the current rounding mode. If `a' is a NaN, the
3799 | largest positive integer is returned. Otherwise, if the conversion
3800 | overflows, the largest integer with the same sign as `a' is returned.
3801 *----------------------------------------------------------------------------*/
3802
3803 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3804 {
3805 flag aSign;
3806 int32 aExp, shiftCount;
3807 uint64_t aSig;
3808
3809 aSig = extractFloatx80Frac( a );
3810 aExp = extractFloatx80Exp( a );
3811 aSign = extractFloatx80Sign( a );
3812 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3813 shiftCount = 0x4037 - aExp;
3814 if ( shiftCount <= 0 ) shiftCount = 1;
3815 shift64RightJamming( aSig, shiftCount, &aSig );
3816 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3817
3818 }
3819
3820 /*----------------------------------------------------------------------------
3821 | Returns the result of converting the extended double-precision floating-
3822 | point value `a' to the 32-bit two's complement integer format. The
3823 | conversion is performed according to the IEC/IEEE Standard for Binary
3824 | Floating-Point Arithmetic, except that the conversion is always rounded
3825 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3826 | Otherwise, if the conversion overflows, the largest integer with the same
3827 | sign as `a' is returned.
3828 *----------------------------------------------------------------------------*/
3829
3830 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3831 {
3832 flag aSign;
3833 int32 aExp, shiftCount;
3834 uint64_t aSig, savedASig;
3835 int32 z;
3836
3837 aSig = extractFloatx80Frac( a );
3838 aExp = extractFloatx80Exp( a );
3839 aSign = extractFloatx80Sign( a );
3840 if ( 0x401E < aExp ) {
3841 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3842 goto invalid;
3843 }
3844 else if ( aExp < 0x3FFF ) {
3845 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3846 return 0;
3847 }
3848 shiftCount = 0x403E - aExp;
3849 savedASig = aSig;
3850 aSig >>= shiftCount;
3851 z = aSig;
3852 if ( aSign ) z = - z;
3853 if ( ( z < 0 ) ^ aSign ) {
3854 invalid:
3855 float_raise( float_flag_invalid STATUS_VAR);
3856 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3857 }
3858 if ( ( aSig<<shiftCount ) != savedASig ) {
3859 STATUS(float_exception_flags) |= float_flag_inexact;
3860 }
3861 return z;
3862
3863 }
3864
3865 /*----------------------------------------------------------------------------
3866 | Returns the result of converting the extended double-precision floating-
3867 | point value `a' to the 64-bit two's complement integer format. The
3868 | conversion is performed according to the IEC/IEEE Standard for Binary
3869 | Floating-Point Arithmetic---which means in particular that the conversion
3870 | is rounded according to the current rounding mode. If `a' is a NaN,
3871 | the largest positive integer is returned. Otherwise, if the conversion
3872 | overflows, the largest integer with the same sign as `a' is returned.
3873 *----------------------------------------------------------------------------*/
3874
3875 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3876 {
3877 flag aSign;
3878 int32 aExp, shiftCount;
3879 uint64_t aSig, aSigExtra;
3880
3881 aSig = extractFloatx80Frac( a );
3882 aExp = extractFloatx80Exp( a );
3883 aSign = extractFloatx80Sign( a );
3884 shiftCount = 0x403E - aExp;
3885 if ( shiftCount <= 0 ) {
3886 if ( shiftCount ) {
3887 float_raise( float_flag_invalid STATUS_VAR);
3888 if ( ! aSign
3889 || ( ( aExp == 0x7FFF )
3890 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3891 ) {
3892 return LIT64( 0x7FFFFFFFFFFFFFFF );
3893 }
3894 return (int64_t) LIT64( 0x8000000000000000 );
3895 }
3896 aSigExtra = 0;
3897 }
3898 else {
3899 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3900 }
3901 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3902
3903 }
3904
3905 /*----------------------------------------------------------------------------
3906 | Returns the result of converting the extended double-precision floating-
3907 | point value `a' to the 64-bit two's complement integer format. The
3908 | conversion is performed according to the IEC/IEEE Standard for Binary
3909 | Floating-Point Arithmetic, except that the conversion is always rounded
3910 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3911 | Otherwise, if the conversion overflows, the largest integer with the same
3912 | sign as `a' is returned.
3913 *----------------------------------------------------------------------------*/
3914
3915 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3916 {
3917 flag aSign;
3918 int32 aExp, shiftCount;
3919 uint64_t aSig;
3920 int64 z;
3921
3922 aSig = extractFloatx80Frac( a );
3923 aExp = extractFloatx80Exp( a );
3924 aSign = extractFloatx80Sign( a );
3925 shiftCount = aExp - 0x403E;
3926 if ( 0 <= shiftCount ) {
3927 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3928 if ( ( a.high != 0xC03E ) || aSig ) {
3929 float_raise( float_flag_invalid STATUS_VAR);
3930 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3931 return LIT64( 0x7FFFFFFFFFFFFFFF );
3932 }
3933 }
3934 return (int64_t) LIT64( 0x8000000000000000 );
3935 }
3936 else if ( aExp < 0x3FFF ) {
3937 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3938 return 0;
3939 }
3940 z = aSig>>( - shiftCount );
3941 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3942 STATUS(float_exception_flags) |= float_flag_inexact;
3943 }
3944 if ( aSign ) z = - z;
3945 return z;
3946
3947 }
3948
3949 /*----------------------------------------------------------------------------
3950 | Returns the result of converting the extended double-precision floating-
3951 | point value `a' to the single-precision floating-point format. The
3952 | conversion is performed according to the IEC/IEEE Standard for Binary
3953 | Floating-Point Arithmetic.
3954 *----------------------------------------------------------------------------*/
3955
3956 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3957 {
3958 flag aSign;
3959 int32 aExp;
3960 uint64_t aSig;
3961
3962 aSig = extractFloatx80Frac( a );
3963 aExp = extractFloatx80Exp( a );
3964 aSign = extractFloatx80Sign( a );
3965 if ( aExp == 0x7FFF ) {
3966 if ( (uint64_t) ( aSig<<1 ) ) {
3967 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3968 }
3969 return packFloat32( aSign, 0xFF, 0 );
3970 }
3971 shift64RightJamming( aSig, 33, &aSig );
3972 if ( aExp || aSig ) aExp -= 0x3F81;
3973 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3974
3975 }
3976
3977 /*----------------------------------------------------------------------------
3978 | Returns the result of converting the extended double-precision floating-
3979 | point value `a' to the double-precision floating-point format. The
3980 | conversion is performed according to the IEC/IEEE Standard for Binary
3981 | Floating-Point Arithmetic.
3982 *----------------------------------------------------------------------------*/
3983
3984 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3985 {
3986 flag aSign;
3987 int32 aExp;
3988 uint64_t aSig, zSig;
3989
3990 aSig = extractFloatx80Frac( a );
3991 aExp = extractFloatx80Exp( a );
3992 aSign = extractFloatx80Sign( a );
3993 if ( aExp == 0x7FFF ) {
3994 if ( (uint64_t) ( aSig<<1 ) ) {
3995 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3996 }
3997 return packFloat64( aSign, 0x7FF, 0 );
3998 }
3999 shift64RightJamming( aSig, 1, &zSig );
4000 if ( aExp || aSig ) aExp -= 0x3C01;
4001 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4002
4003 }
4004
4005 #ifdef FLOAT128
4006
4007 /*----------------------------------------------------------------------------
4008 | Returns the result of converting the extended double-precision floating-
4009 | point value `a' to the quadruple-precision floating-point format. The
4010 | conversion is performed according to the IEC/IEEE Standard for Binary
4011 | Floating-Point Arithmetic.
4012 *----------------------------------------------------------------------------*/
4013
4014 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4015 {
4016 flag aSign;
4017 int16 aExp;
4018 uint64_t aSig, zSig0, zSig1;
4019
4020 aSig = extractFloatx80Frac( a );
4021 aExp = extractFloatx80Exp( a );
4022 aSign = extractFloatx80Sign( a );
4023 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4024 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4025 }
4026 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4027 return packFloat128( aSign, aExp, zSig0, zSig1 );
4028
4029 }
4030
4031 #endif
4032
4033 /*----------------------------------------------------------------------------
4034 | Rounds the extended double-precision floating-point value `a' to an integer,
4035 | and returns the result as an extended quadruple-precision floating-point
4036 | value. The operation is performed according to the IEC/IEEE Standard for
4037 | Binary Floating-Point Arithmetic.
4038 *----------------------------------------------------------------------------*/
4039
4040 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4041 {
4042 flag aSign;
4043 int32 aExp;
4044 uint64_t lastBitMask, roundBitsMask;
4045 int8 roundingMode;
4046 floatx80 z;
4047
4048 aExp = extractFloatx80Exp( a );
4049 if ( 0x403E <= aExp ) {
4050 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4051 return propagateFloatx80NaN( a, a STATUS_VAR );
4052 }
4053 return a;
4054 }
4055 if ( aExp < 0x3FFF ) {
4056 if ( ( aExp == 0 )
4057 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4058 return a;
4059 }
4060 STATUS(float_exception_flags) |= float_flag_inexact;
4061 aSign = extractFloatx80Sign( a );
4062 switch ( STATUS(float_rounding_mode) ) {
4063 case float_round_nearest_even:
4064 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4065 ) {
4066 return
4067 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4068 }
4069 break;
4070 case float_round_down:
4071 return
4072 aSign ?
4073 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4074 : packFloatx80( 0, 0, 0 );
4075 case float_round_up:
4076 return
4077 aSign ? packFloatx80( 1, 0, 0 )
4078 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4079 }
4080 return packFloatx80( aSign, 0, 0 );
4081 }
4082 lastBitMask = 1;
4083 lastBitMask <<= 0x403E - aExp;
4084 roundBitsMask = lastBitMask - 1;
4085 z = a;
4086 roundingMode = STATUS(float_rounding_mode);
4087 if ( roundingMode == float_round_nearest_even ) {
4088 z.low += lastBitMask>>1;
4089 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4090 }
4091 else if ( roundingMode != float_round_to_zero ) {
4092 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4093 z.low += roundBitsMask;
4094 }
4095 }
4096 z.low &= ~ roundBitsMask;
4097 if ( z.low == 0 ) {
4098 ++z.high;
4099 z.low = LIT64( 0x8000000000000000 );
4100 }
4101 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4102 return z;
4103
4104 }
4105
4106 /*----------------------------------------------------------------------------
4107 | Returns the result of adding the absolute values of the extended double-
4108 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4109 | negated before being returned. `zSign' is ignored if the result is a NaN.
4110 | The addition is performed according to the IEC/IEEE Standard for Binary
4111 | Floating-Point Arithmetic.
4112 *----------------------------------------------------------------------------*/
4113
4114 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4115 {
4116 int32 aExp, bExp, zExp;
4117 uint64_t aSig, bSig, zSig0, zSig1;
4118 int32 expDiff;
4119
4120 aSig = extractFloatx80Frac( a );
4121 aExp = extractFloatx80Exp( a );
4122 bSig = extractFloatx80Frac( b );
4123 bExp = extractFloatx80Exp( b );
4124 expDiff = aExp - bExp;
4125 if ( 0 < expDiff ) {
4126 if ( aExp == 0x7FFF ) {
4127 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4128 return a;
4129 }
4130 if ( bExp == 0 ) --expDiff;
4131 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4132 zExp = aExp;
4133 }
4134 else if ( expDiff < 0 ) {
4135 if ( bExp == 0x7FFF ) {
4136 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4137 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4138 }
4139 if ( aExp == 0 ) ++expDiff;
4140 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4141 zExp = bExp;
4142 }
4143 else {
4144 if ( aExp == 0x7FFF ) {
4145 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4146 return propagateFloatx80NaN( a, b STATUS_VAR );
4147 }
4148 return a;
4149 }
4150 zSig1 = 0;
4151 zSig0 = aSig + bSig;
4152 if ( aExp == 0 ) {
4153 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4154 goto roundAndPack;
4155 }
4156 zExp = aExp;
4157 goto shiftRight1;
4158 }
4159 zSig0 = aSig + bSig;
4160 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4161 shiftRight1:
4162 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4163 zSig0 |= LIT64( 0x8000000000000000 );
4164 ++zExp;
4165 roundAndPack:
4166 return
4167 roundAndPackFloatx80(
4168 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4169
4170 }
4171
4172 /*----------------------------------------------------------------------------
4173 | Returns the result of subtracting the absolute values of the extended
4174 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4175 | difference is negated before being returned. `zSign' is ignored if the
4176 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4177 | Standard for Binary Floating-Point Arithmetic.
4178 *----------------------------------------------------------------------------*/
4179
4180 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4181 {
4182 int32 aExp, bExp, zExp;
4183 uint64_t aSig, bSig, zSig0, zSig1;
4184 int32 expDiff;
4185 floatx80 z;
4186
4187 aSig = extractFloatx80Frac( a );
4188 aExp = extractFloatx80Exp( a );
4189 bSig = extractFloatx80Frac( b );
4190 bExp = extractFloatx80Exp( b );
4191 expDiff = aExp - bExp;
4192 if ( 0 < expDiff ) goto aExpBigger;
4193 if ( expDiff < 0 ) goto bExpBigger;
4194 if ( aExp == 0x7FFF ) {
4195 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4196 return propagateFloatx80NaN( a, b STATUS_VAR );
4197 }
4198 float_raise( float_flag_invalid STATUS_VAR);
4199 z.low = floatx80_default_nan_low;
4200 z.high = floatx80_default_nan_high;
4201 return z;
4202 }
4203 if ( aExp == 0 ) {
4204 aExp = 1;
4205 bExp = 1;
4206 }
4207 zSig1 = 0;
4208 if ( bSig < aSig ) goto aBigger;
4209 if ( aSig < bSig ) goto bBigger;
4210 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4211 bExpBigger:
4212 if ( bExp == 0x7FFF ) {
4213 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4214 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4215 }
4216 if ( aExp == 0 ) ++expDiff;
4217 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4218 bBigger:
4219 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4220 zExp = bExp;
4221 zSign ^= 1;
4222 goto normalizeRoundAndPack;
4223 aExpBigger:
4224 if ( aExp == 0x7FFF ) {
4225 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4226 return a;
4227 }
4228 if ( bExp == 0 ) --expDiff;
4229 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4230 aBigger:
4231 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4232 zExp = aExp;
4233 normalizeRoundAndPack:
4234 return
4235 normalizeRoundAndPackFloatx80(
4236 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4237
4238 }
4239
4240 /*----------------------------------------------------------------------------
4241 | Returns the result of adding the extended double-precision floating-point
4242 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4243 | Standard for Binary Floating-Point Arithmetic.
4244 *----------------------------------------------------------------------------*/
4245
4246 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4247 {
4248 flag aSign, bSign;
4249
4250 aSign = extractFloatx80Sign( a );
4251 bSign = extractFloatx80Sign( b );
4252 if ( aSign == bSign ) {
4253 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4254 }
4255 else {
4256 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4257 }
4258
4259 }
4260
4261 /*----------------------------------------------------------------------------
4262 | Returns the result of subtracting the extended double-precision floating-
4263 | point values `a' and `b'. The operation is performed according to the
4264 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4265 *----------------------------------------------------------------------------*/
4266
4267 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4268 {
4269 flag aSign, bSign;
4270
4271 aSign = extractFloatx80Sign( a );
4272 bSign = extractFloatx80Sign( b );
4273 if ( aSign == bSign ) {
4274 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4275 }
4276 else {
4277 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4278 }
4279
4280 }
4281
4282 /*----------------------------------------------------------------------------
4283 | Returns the result of multiplying the extended double-precision floating-
4284 | point values `a' and `b'. The operation is performed according to the
4285 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4286 *----------------------------------------------------------------------------*/
4287
4288 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4289 {
4290 flag aSign, bSign, zSign;
4291 int32 aExp, bExp, zExp;
4292 uint64_t aSig, bSig, zSig0, zSig1;
4293 floatx80 z;
4294
4295 aSig = extractFloatx80Frac( a );
4296 aExp = extractFloatx80Exp( a );
4297 aSign = extractFloatx80Sign( a );
4298 bSig = extractFloatx80Frac( b );
4299 bExp = extractFloatx80Exp( b );
4300 bSign = extractFloatx80Sign( b );
4301 zSign = aSign ^ bSign;
4302 if ( aExp == 0x7FFF ) {
4303 if ( (uint64_t) ( aSig<<1 )
4304 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4305 return propagateFloatx80NaN( a, b STATUS_VAR );
4306 }
4307 if ( ( bExp | bSig ) == 0 ) goto invalid;
4308 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4309 }
4310 if ( bExp == 0x7FFF ) {
4311 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4312 if ( ( aExp | aSig ) == 0 ) {
4313 invalid:
4314 float_raise( float_flag_invalid STATUS_VAR);
4315 z.low = floatx80_default_nan_low;
4316 z.high = floatx80_default_nan_high;
4317 return z;
4318 }
4319 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4320 }
4321 if ( aExp == 0 ) {
4322 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4323 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4324 }
4325 if ( bExp == 0 ) {
4326 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4327 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4328 }
4329 zExp = aExp + bExp - 0x3FFE;
4330 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4331 if ( 0 < (int64_t) zSig0 ) {
4332 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4333 --zExp;
4334 }
4335 return
4336 roundAndPackFloatx80(
4337 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4338
4339 }
4340
4341 /*----------------------------------------------------------------------------
4342 | Returns the result of dividing the extended double-precision floating-point
4343 | value `a' by the corresponding value `b'. The operation is performed
4344 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4345 *----------------------------------------------------------------------------*/
4346
4347 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4348 {
4349 flag aSign, bSign, zSign;
4350 int32 aExp, bExp, zExp;
4351 uint64_t aSig, bSig, zSig0, zSig1;
4352 uint64_t rem0, rem1, rem2, term0, term1, term2;
4353 floatx80 z;
4354
4355 aSig = extractFloatx80Frac( a );
4356 aExp = extractFloatx80Exp( a );
4357 aSign = extractFloatx80Sign( a );
4358 bSig = extractFloatx80Frac( b );
4359 bExp = extractFloatx80Exp( b );
4360 bSign = extractFloatx80Sign( b );
4361 zSign = aSign ^ bSign;
4362 if ( aExp == 0x7FFF ) {
4363 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4364 if ( bExp == 0x7FFF ) {
4365 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4366 goto invalid;
4367 }
4368 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4369 }
4370 if ( bExp == 0x7FFF ) {
4371 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4372 return packFloatx80( zSign, 0, 0 );
4373 }
4374 if ( bExp == 0 ) {
4375 if ( bSig == 0 ) {
4376 if ( ( aExp | aSig ) == 0 ) {
4377 invalid:
4378 float_raise( float_flag_invalid STATUS_VAR);
4379 z.low = floatx80_default_nan_low;
4380 z.high = floatx80_default_nan_high;
4381 return z;
4382 }
4383 float_raise( float_flag_divbyzero STATUS_VAR);
4384 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4385 }
4386 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4387 }
4388 if ( aExp == 0 ) {
4389 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4390 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4391 }
4392 zExp = aExp - bExp + 0x3FFE;
4393 rem1 = 0;
4394 if ( bSig <= aSig ) {
4395 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4396 ++zExp;
4397 }
4398 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4399 mul64To128( bSig, zSig0, &term0, &term1 );
4400 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4401 while ( (int64_t) rem0 < 0 ) {
4402 --zSig0;
4403 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4404 }
4405 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4406 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4407 mul64To128( bSig, zSig1, &term1, &term2 );
4408 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4409 while ( (int64_t) rem1 < 0 ) {
4410 --zSig1;
4411 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4412 }
4413 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4414 }
4415 return
4416 roundAndPackFloatx80(
4417 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4418
4419 }
4420
4421 /*----------------------------------------------------------------------------
4422 | Returns the remainder of the extended double-precision floating-point value
4423 | `a' with respect to the corresponding value `b'. The operation is performed
4424 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4425 *----------------------------------------------------------------------------*/
4426
4427 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4428 {
4429 flag aSign, zSign;
4430 int32 aExp, bExp, expDiff;
4431 uint64_t aSig0, aSig1, bSig;
4432 uint64_t q, term0, term1, alternateASig0, alternateASig1;
4433 floatx80 z;
4434
4435 aSig0 = extractFloatx80Frac( a );
4436 aExp = extractFloatx80Exp( a );
4437 aSign = extractFloatx80Sign( a );
4438 bSig = extractFloatx80Frac( b );
4439 bExp = extractFloatx80Exp( b );
4440 if ( aExp == 0x7FFF ) {
4441 if ( (uint64_t) ( aSig0<<1 )
4442 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4443 return propagateFloatx80NaN( a, b STATUS_VAR );
4444 }
4445 goto invalid;
4446 }
4447 if ( bExp == 0x7FFF ) {
4448 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4449 return a;
4450 }
4451 if ( bExp == 0 ) {
4452 if ( bSig == 0 ) {
4453 invalid:
4454 float_raise( float_flag_invalid STATUS_VAR);
4455 z.low = floatx80_default_nan_low;
4456 z.high = floatx80_default_nan_high;
4457 return z;
4458 }
4459 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4460 }
4461 if ( aExp == 0 ) {
4462 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4463 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4464 }
4465 bSig |= LIT64( 0x8000000000000000 );
4466 zSign = aSign;
4467 expDiff = aExp - bExp;
4468 aSig1 = 0;
4469 if ( expDiff < 0 ) {
4470 if ( expDiff < -1 ) return a;
4471 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4472 expDiff = 0;
4473 }
4474 q = ( bSig <= aSig0 );
4475 if ( q ) aSig0 -= bSig;
4476 expDiff -= 64;
4477 while ( 0 < expDiff ) {
4478 q = estimateDiv128To64( aSig0, aSig1, bSig );
4479 q = ( 2 < q ) ? q - 2 : 0;
4480 mul64To128( bSig, q, &term0, &term1 );
4481 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4482 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4483 expDiff -= 62;
4484 }
4485 expDiff += 64;
4486 if ( 0 < expDiff ) {
4487 q = estimateDiv128To64( aSig0, aSig1, bSig );
4488 q = ( 2 < q ) ? q - 2 : 0;
4489 q >>= 64 - expDiff;
4490 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4491 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4492 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4493 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4494 ++q;
4495 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4496 }
4497 }
4498 else {
4499 term1 = 0;
4500 term0 = bSig;
4501 }
4502 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4503 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4504 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4505 && ( q & 1 ) )
4506 ) {
4507 aSig0 = alternateASig0;
4508 aSig1 = alternateASig1;
4509 zSign = ! zSign;
4510 }
4511 return
4512 normalizeRoundAndPackFloatx80(
4513 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4514
4515 }
4516
4517 /*----------------------------------------------------------------------------
4518 | Returns the square root of the extended double-precision floating-point
4519 | value `a'. The operation is performed according to the IEC/IEEE Standard
4520 | for Binary Floating-Point Arithmetic.
4521 *----------------------------------------------------------------------------*/
4522
4523 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4524 {
4525 flag aSign;
4526 int32 aExp, zExp;
4527 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4528 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4529 floatx80 z;
4530
4531 aSig0 = extractFloatx80Frac( a );
4532 aExp = extractFloatx80Exp( a );
4533 aSign = extractFloatx80Sign( a );
4534 if ( aExp == 0x7FFF ) {
4535 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4536 if ( ! aSign ) return a;
4537 goto invalid;
4538 }
4539 if ( aSign ) {
4540 if ( ( aExp | aSig0 ) == 0 ) return a;
4541 invalid:
4542 float_raise( float_flag_invalid STATUS_VAR);
4543 z.low = floatx80_default_nan_low;
4544 z.high = floatx80_default_nan_high;
4545 return z;
4546 }
4547 if ( aExp == 0 ) {
4548 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4549 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4550 }
4551 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4552 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4553 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4554 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4555 doubleZSig0 = zSig0<<1;
4556 mul64To128( zSig0, zSig0, &term0, &term1 );
4557 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4558 while ( (int64_t) rem0 < 0 ) {
4559 --zSig0;
4560 doubleZSig0 -= 2;
4561 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4562 }
4563 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4564 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4565 if ( zSig1 == 0 ) zSig1 = 1;
4566 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4567 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4568 mul64To128( zSig1, zSig1, &term2, &term3 );
4569 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4570 while ( (int64_t) rem1 < 0 ) {
4571 --zSig1;
4572 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4573 term3 |= 1;
4574 term2 |= doubleZSig0;
4575 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4576 }
4577 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4578 }
4579 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4580 zSig0 |= doubleZSig0;
4581 return
4582 roundAndPackFloatx80(
4583 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4584
4585 }
4586
4587 /*----------------------------------------------------------------------------
4588 | Returns 1 if the extended double-precision floating-point value `a' is equal
4589 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4590 | raised if either operand is a NaN. Otherwise, the comparison is performed
4591 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4592 *----------------------------------------------------------------------------*/
4593
4594 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4595 {
4596
4597 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4598 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4599 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4600 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4601 ) {
4602 float_raise( float_flag_invalid STATUS_VAR);
4603 return 0;
4604 }
4605 return
4606 ( a.low == b.low )
4607 && ( ( a.high == b.high )
4608 || ( ( a.low == 0 )
4609 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4610 );
4611
4612 }
4613
4614 /*----------------------------------------------------------------------------
4615 | Returns 1 if the extended double-precision floating-point value `a' is
4616 | less than or equal to the corresponding value `b', and 0 otherwise. The
4617 | comparison is performed according to the IEC/IEEE Standard for Binary
4618 | Floating-Point Arithmetic.
4619 *----------------------------------------------------------------------------*/
4620
4621 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4622 {
4623 flag aSign, bSign;
4624
4625 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4626 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4627 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4628 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4629 ) {
4630 float_raise( float_flag_invalid STATUS_VAR);
4631 return 0;
4632 }
4633 aSign = extractFloatx80Sign( a );
4634 bSign = extractFloatx80Sign( b );
4635 if ( aSign != bSign ) {
4636 return
4637 aSign
4638 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4639 == 0 );
4640 }
4641 return
4642 aSign ? le128( b.high, b.low, a.high, a.low )
4643 : le128( a.high, a.low, b.high, b.low );
4644
4645 }
4646
4647 /*----------------------------------------------------------------------------
4648 | Returns 1 if the extended double-precision floating-point value `a' is
4649 | less than the corresponding value `b', and 0 otherwise. The comparison
4650 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4651 | Arithmetic.
4652 *----------------------------------------------------------------------------*/
4653
4654 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4655 {
4656 flag aSign, bSign;
4657
4658 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4659 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4660 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4661 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4662 ) {
4663 float_raise( float_flag_invalid STATUS_VAR);
4664 return 0;
4665 }
4666 aSign = extractFloatx80Sign( a );
4667 bSign = extractFloatx80Sign( b );
4668 if ( aSign != bSign ) {
4669 return
4670 aSign
4671 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4672 != 0 );
4673 }
4674 return
4675 aSign ? lt128( b.high, b.low, a.high, a.low )
4676 : lt128( a.high, a.low, b.high, b.low );
4677
4678 }
4679
4680 /*----------------------------------------------------------------------------
4681 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4682 | cannot be compared, and 0 otherwise. The comparison is performed according
4683 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4684 *----------------------------------------------------------------------------*/
4685 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4686 {
4687 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4688 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4689 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4690 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4691 ) {
4692 float_raise( float_flag_invalid STATUS_VAR);
4693 return 1;
4694 }
4695 return 0;
4696 }
4697
4698 /*----------------------------------------------------------------------------
4699 | Returns 1 if the extended double-precision floating-point value `a' is
4700 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4701 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4702 | Arithmetic.
4703 *----------------------------------------------------------------------------*/
4704
4705 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4706 {
4707
4708 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4709 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4710 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4711 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4712 ) {
4713 if ( floatx80_is_signaling_nan( a )
4714 || floatx80_is_signaling_nan( b ) ) {
4715 float_raise( float_flag_invalid STATUS_VAR);
4716 }
4717 return 0;
4718 }
4719 return
4720 ( a.low == b.low )
4721 && ( ( a.high == b.high )
4722 || ( ( a.low == 0 )
4723 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4724 );
4725
4726 }
4727
4728 /*----------------------------------------------------------------------------
4729 | Returns 1 if the extended double-precision floating-point value `a' is less
4730 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4731 | do not cause an exception. Otherwise, the comparison is performed according
4732 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4733 *----------------------------------------------------------------------------*/
4734
4735 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4736 {
4737 flag aSign, bSign;
4738
4739 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4740 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4741 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4742 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4743 ) {
4744 if ( floatx80_is_signaling_nan( a )
4745 || floatx80_is_signaling_nan( b ) ) {
4746 float_raise( float_flag_invalid STATUS_VAR);
4747 }
4748 return 0;
4749 }
4750 aSign = extractFloatx80Sign( a );
4751 bSign = extractFloatx80Sign( b );
4752 if ( aSign != bSign ) {
4753 return
4754 aSign
4755 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4756 == 0 );
4757 }
4758 return
4759 aSign ? le128( b.high, b.low, a.high, a.low )
4760 : le128( a.high, a.low, b.high, b.low );
4761
4762 }
4763
4764 /*----------------------------------------------------------------------------
4765 | Returns 1 if the extended double-precision floating-point value `a' is less
4766 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4767 | an exception. Otherwise, the comparison is performed according to the
4768 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4769 *----------------------------------------------------------------------------*/
4770
4771 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4772 {
4773 flag aSign, bSign;
4774
4775 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4776 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4777 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4778 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4779 ) {
4780 if ( floatx80_is_signaling_nan( a )
4781 || floatx80_is_signaling_nan( b ) ) {
4782 float_raise( float_flag_invalid STATUS_VAR);
4783 }
4784 return 0;
4785 }
4786 aSign = extractFloatx80Sign( a );
4787 bSign = extractFloatx80Sign( b );
4788 if ( aSign != bSign ) {
4789 return
4790 aSign
4791 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4792 != 0 );
4793 }
4794 return
4795 aSign ? lt128( b.high, b.low, a.high, a.low )
4796 : lt128( a.high, a.low, b.high, b.low );
4797
4798 }
4799
4800 /*----------------------------------------------------------------------------
4801 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4802 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
4803 | The comparison is performed according to the IEC/IEEE Standard for Binary
4804 | Floating-Point Arithmetic.
4805 *----------------------------------------------------------------------------*/
4806 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4807 {
4808 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4809 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4810 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4811 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4812 ) {
4813 if ( floatx80_is_signaling_nan( a )
4814 || floatx80_is_signaling_nan( b ) ) {
4815 float_raise( float_flag_invalid STATUS_VAR);
4816 }
4817 return 1;
4818 }
4819 return 0;
4820 }
4821
4822 #endif
4823
4824 #ifdef FLOAT128
4825
4826 /*----------------------------------------------------------------------------
4827 | Returns the result of converting the quadruple-precision floating-point
4828 | value `a' to the 32-bit two's complement integer format. The conversion
4829 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4830 | Arithmetic---which means in particular that the conversion is rounded
4831 | according to the current rounding mode. If `a' is a NaN, the largest
4832 | positive integer is returned. Otherwise, if the conversion overflows, the
4833 | largest integer with the same sign as `a' is returned.
4834 *----------------------------------------------------------------------------*/
4835
4836 int32 float128_to_int32( float128 a STATUS_PARAM )
4837 {
4838 flag aSign;
4839 int32 aExp, shiftCount;
4840 uint64_t aSig0, aSig1;
4841
4842 aSig1 = extractFloat128Frac1( a );
4843 aSig0 = extractFloat128Frac0( a );
4844 aExp = extractFloat128Exp( a );
4845 aSign = extractFloat128Sign( a );
4846 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4847 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4848 aSig0 |= ( aSig1 != 0 );
4849 shiftCount = 0x4028 - aExp;
4850 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4851 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4852
4853 }
4854
4855 /*----------------------------------------------------------------------------
4856 | Returns the result of converting the quadruple-precision floating-point
4857 | value `a' to the 32-bit two's complement integer format. The conversion
4858 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4859 | Arithmetic, except that the conversion is always rounded toward zero. If
4860 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4861 | conversion overflows, the largest integer with the same sign as `a' is
4862 | returned.
4863 *----------------------------------------------------------------------------*/
4864
4865 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4866 {
4867 flag aSign;
4868 int32 aExp, shiftCount;
4869 uint64_t aSig0, aSig1, savedASig;
4870 int32 z;
4871
4872 aSig1 = extractFloat128Frac1( a );
4873 aSig0 = extractFloat128Frac0( a );
4874 aExp = extractFloat128Exp( a );
4875 aSign = extractFloat128Sign( a );
4876 aSig0 |= ( aSig1 != 0 );
4877 if ( 0x401E < aExp ) {
4878 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4879 goto invalid;
4880 }
4881 else if ( aExp < 0x3FFF ) {
4882 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4883 return 0;
4884 }
4885 aSig0 |= LIT64( 0x0001000000000000 );
4886 shiftCount = 0x402F - aExp;
4887 savedASig = aSig0;
4888 aSig0 >>= shiftCount;
4889 z = aSig0;
4890 if ( aSign ) z = - z;
4891 if ( ( z < 0 ) ^ aSign ) {
4892 invalid:
4893 float_raise( float_flag_invalid STATUS_VAR);
4894 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4895 }
4896 if ( ( aSig0<<shiftCount ) != savedASig ) {
4897 STATUS(float_exception_flags) |= float_flag_inexact;
4898 }
4899 return z;
4900
4901 }
4902
4903 /*----------------------------------------------------------------------------
4904 | Returns the result of converting the quadruple-precision floating-point
4905 | value `a' to the 64-bit two's complement integer format. The conversion
4906 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4907 | Arithmetic---which means in particular that the conversion is rounded
4908 | according to the current rounding mode. If `a' is a NaN, the largest
4909 | positive integer is returned. Otherwise, if the conversion overflows, the
4910 | largest integer with the same sign as `a' is returned.
4911 *----------------------------------------------------------------------------*/
4912
4913 int64 float128_to_int64( float128 a STATUS_PARAM )
4914 {
4915 flag aSign;
4916 int32 aExp, shiftCount;
4917 uint64_t aSig0, aSig1;
4918
4919 aSig1 = extractFloat128Frac1( a );
4920 aSig0 = extractFloat128Frac0( a );
4921 aExp = extractFloat128Exp( a );
4922 aSign = extractFloat128Sign( a );
4923 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4924 shiftCount = 0x402F - aExp;
4925 if ( shiftCount <= 0 ) {
4926 if ( 0x403E < aExp ) {
4927 float_raise( float_flag_invalid STATUS_VAR);
4928 if ( ! aSign
4929 || ( ( aExp == 0x7FFF )
4930 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4931 )
4932 ) {
4933 return LIT64( 0x7FFFFFFFFFFFFFFF );
4934 }
4935 return (int64_t) LIT64( 0x8000000000000000 );
4936 }
4937 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4938 }
4939 else {
4940 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4941 }
4942 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4943
4944 }
4945
4946 /*----------------------------------------------------------------------------
4947 | Returns the result of converting the quadruple-precision floating-point
4948 | value `a' to the 64-bit two's complement integer format. The conversion
4949 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4950 | Arithmetic, except that the conversion is always rounded toward zero.
4951 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4952 | the conversion overflows, the largest integer with the same sign as `a' is
4953 | returned.
4954 *----------------------------------------------------------------------------*/
4955
4956 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4957 {
4958 flag aSign;
4959 int32 aExp, shiftCount;
4960 uint64_t aSig0, aSig1;
4961 int64 z;
4962
4963 aSig1 = extractFloat128Frac1( a );
4964 aSig0 = extractFloat128Frac0( a );
4965 aExp = extractFloat128Exp( a );
4966 aSign = extractFloat128Sign( a );
4967 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4968 shiftCount = aExp - 0x402F;
4969 if ( 0 < shiftCount ) {
4970 if ( 0x403E <= aExp ) {
4971 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4972 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4973 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4974 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4975 }
4976 else {
4977 float_raise( float_flag_invalid STATUS_VAR);
4978 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4979 return LIT64( 0x7FFFFFFFFFFFFFFF );
4980 }
4981 }
4982 return (int64_t) LIT64( 0x8000000000000000 );
4983 }
4984 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4985 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4986 STATUS(float_exception_flags) |= float_flag_inexact;
4987 }
4988 }
4989 else {
4990 if ( aExp < 0x3FFF ) {
4991 if ( aExp | aSig0 | aSig1 ) {
4992 STATUS(float_exception_flags) |= float_flag_inexact;
4993 }
4994 return 0;
4995 }
4996 z = aSig0>>( - shiftCount );
4997 if ( aSig1
4998 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4999 STATUS(float_exception_flags) |= float_flag_inexact;
5000 }
5001 }
5002 if ( aSign ) z = - z;
5003 return z;
5004
5005 }
5006
5007 /*----------------------------------------------------------------------------
5008 | Returns the result of converting the quadruple-precision floating-point
5009 | value `a' to the single-precision floating-point format. The conversion
5010 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5011 | Arithmetic.
5012 *----------------------------------------------------------------------------*/
5013
5014 float32 float128_to_float32( float128 a STATUS_PARAM )
5015 {
5016 flag aSign;
5017 int32 aExp;
5018 uint64_t aSig0, aSig1;
5019 uint32_t zSig;
5020
5021 aSig1 = extractFloat128Frac1( a );
5022 aSig0 = extractFloat128Frac0( a );
5023 aExp = extractFloat128Exp( a );
5024 aSign = extractFloat128Sign( a );
5025 if ( aExp == 0x7FFF ) {
5026 if ( aSig0 | aSig1 ) {
5027 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5028 }
5029 return packFloat32( aSign, 0xFF, 0 );
5030 }
5031 aSig0 |= ( aSig1 != 0 );
5032 shift64RightJamming( aSig0, 18, &aSig0 );
5033 zSig = aSig0;
5034 if ( aExp || zSig ) {
5035 zSig |= 0x40000000;
5036 aExp -= 0x3F81;
5037 }
5038 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5039
5040 }
5041
5042 /*----------------------------------------------------------------------------
5043 | Returns the result of converting the quadruple-precision floating-point
5044 | value `a' to the double-precision floating-point format. The conversion
5045 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5046 | Arithmetic.
5047 *----------------------------------------------------------------------------*/
5048
5049 float64 float128_to_float64( float128 a STATUS_PARAM )
5050 {
5051 flag aSign;
5052 int32 aExp;
5053 uint64_t aSig0, aSig1;
5054
5055 aSig1 = extractFloat128Frac1( a );
5056 aSig0 = extractFloat128Frac0( a );
5057 aExp = extractFloat128Exp( a );
5058 aSign = extractFloat128Sign( a );
5059 if ( aExp == 0x7FFF ) {
5060 if ( aSig0 | aSig1 ) {
5061 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5062 }
5063 return packFloat64( aSign, 0x7FF, 0 );
5064 }
5065 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5066 aSig0 |= ( aSig1 != 0 );
5067 if ( aExp || aSig0 ) {
5068 aSig0 |= LIT64( 0x4000000000000000 );
5069 aExp -= 0x3C01;
5070 }
5071 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5072
5073 }
5074
5075 #ifdef FLOATX80
5076
5077 /*----------------------------------------------------------------------------
5078 | Returns the result of converting the quadruple-precision floating-point
5079 | value `a' to the extended double-precision floating-point format. The
5080 | conversion is performed according to the IEC/IEEE Standard for Binary
5081 | Floating-Point Arithmetic.
5082 *----------------------------------------------------------------------------*/
5083
5084 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5085 {
5086 flag aSign;
5087 int32 aExp;
5088 uint64_t aSig0, aSig1;
5089
5090 aSig1 = extractFloat128Frac1( a );
5091 aSig0 = extractFloat128Frac0( a );
5092 aExp = extractFloat128Exp( a );
5093 aSign = extractFloat128Sign( a );
5094 if ( aExp == 0x7FFF ) {
5095 if ( aSig0 | aSig1 ) {
5096 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5097 }
5098 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5099 }
5100 if ( aExp == 0 ) {
5101 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5102 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5103 }
5104 else {
5105 aSig0 |= LIT64( 0x0001000000000000 );
5106 }
5107 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5108 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5109
5110 }
5111
5112 #endif
5113
5114 /*----------------------------------------------------------------------------
5115 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5116 | returns the result as a quadruple-precision floating-point value. The
5117 | operation is performed according to the IEC/IEEE Standard for Binary
5118 | Floating-Point Arithmetic.
5119 *----------------------------------------------------------------------------*/
5120
5121 float128 float128_round_to_int( float128 a STATUS_PARAM )
5122 {
5123 flag aSign;
5124 int32 aExp;
5125 uint64_t lastBitMask, roundBitsMask;
5126 int8 roundingMode;
5127 float128 z;
5128
5129 aExp = extractFloat128Exp( a );
5130 if ( 0x402F <= aExp ) {
5131 if ( 0x406F <= aExp ) {
5132 if ( ( aExp == 0x7FFF )
5133 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5134 ) {
5135 return propagateFloat128NaN( a, a STATUS_VAR );
5136 }
5137 return a;
5138 }
5139 lastBitMask = 1;
5140 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5141 roundBitsMask = lastBitMask - 1;
5142 z = a;
5143 roundingMode = STATUS(float_rounding_mode);
5144 if ( roundingMode == float_round_nearest_even ) {
5145 if ( lastBitMask ) {
5146 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5147 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5148 }
5149 else {
5150 if ( (int64_t) z.low < 0 ) {
5151 ++z.high;
5152 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5153 }
5154 }
5155 }
5156 else if ( roundingMode != float_round_to_zero ) {
5157 if ( extractFloat128Sign( z )
5158 ^ ( roundingMode == float_round_up ) ) {
5159 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5160 }
5161 }
5162 z.low &= ~ roundBitsMask;
5163 }
5164 else {
5165 if ( aExp < 0x3FFF ) {
5166 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5167 STATUS(float_exception_flags) |= float_flag_inexact;
5168 aSign = extractFloat128Sign( a );
5169 switch ( STATUS(float_rounding_mode) ) {
5170 case float_round_nearest_even:
5171 if ( ( aExp == 0x3FFE )
5172 && ( extractFloat128Frac0( a )
5173 | extractFloat128Frac1( a ) )
5174 ) {
5175 return packFloat128( aSign, 0x3FFF, 0, 0 );
5176 }
5177 break;
5178 case float_round_down:
5179 return
5180 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5181 : packFloat128( 0, 0, 0, 0 );
5182 case float_round_up:
5183 return
5184 aSign ? packFloat128( 1, 0, 0, 0 )
5185 : packFloat128( 0, 0x3FFF, 0, 0 );
5186 }
5187 return packFloat128( aSign, 0, 0, 0 );
5188 }
5189 lastBitMask = 1;
5190 lastBitMask <<= 0x402F - aExp;
5191 roundBitsMask = lastBitMask - 1;
5192 z.low = 0;
5193 z.high = a.high;
5194 roundingMode = STATUS(float_rounding_mode);
5195 if ( roundingMode == float_round_nearest_even ) {
5196 z.high += lastBitMask>>1;
5197 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5198 z.high &= ~ lastBitMask;
5199 }
5200 }
5201 else if ( roundingMode != float_round_to_zero ) {
5202 if ( extractFloat128Sign( z )
5203 ^ ( roundingMode == float_round_up ) ) {
5204 z.high |= ( a.low != 0 );
5205 z.high += roundBitsMask;
5206 }
5207 }
5208 z.high &= ~ roundBitsMask;
5209 }
5210 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5211 STATUS(float_exception_flags) |= float_flag_inexact;
5212 }
5213 return z;
5214
5215 }
5216
5217 /*----------------------------------------------------------------------------
5218 | Returns the result of adding the absolute values of the quadruple-precision
5219 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5220 | before being returned. `zSign' is ignored if the result is a NaN.
5221 | The addition is performed according to the IEC/IEEE Standard for Binary
5222 | Floating-Point Arithmetic.
5223 *----------------------------------------------------------------------------*/
5224
5225 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5226 {
5227 int32 aExp, bExp, zExp;
5228 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5229 int32 expDiff;
5230
5231 aSig1 = extractFloat128Frac1( a );
5232 aSig0 = extractFloat128Frac0( a );
5233 aExp = extractFloat128Exp( a );
5234 bSig1 = extractFloat128Frac1( b );
5235 bSig0 = extractFloat128Frac0( b );
5236 bExp = extractFloat128Exp( b );
5237 expDiff = aExp - bExp;
5238 if ( 0 < expDiff ) {
5239 if ( aExp == 0x7FFF ) {
5240 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5241 return a;
5242 }
5243 if ( bExp == 0 ) {
5244 --expDiff;
5245 }
5246 else {
5247 bSig0 |= LIT64( 0x0001000000000000 );
5248 }
5249 shift128ExtraRightJamming(
5250 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5251 zExp = aExp;
5252 }
5253 else if ( expDiff < 0 ) {
5254 if ( bExp == 0x7FFF ) {
5255 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5256 return packFloat128( zSign, 0x7FFF, 0, 0 );
5257 }
5258 if ( aExp == 0 ) {
5259 ++expDiff;
5260 }
5261 else {
5262 aSig0 |= LIT64( 0x0001000000000000 );
5263 }
5264 shift128ExtraRightJamming(
5265 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5266 zExp = bExp;
5267 }
5268 else {
5269 if ( aExp == 0x7FFF ) {
5270 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5271 return propagateFloat128NaN( a, b STATUS_VAR );
5272 }
5273 return a;
5274 }
5275 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5276 if ( aExp == 0 ) {
5277 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5278 return packFloat128( zSign, 0, zSig0, zSig1 );
5279 }
5280 zSig2 = 0;
5281 zSig0 |= LIT64( 0x0002000000000000 );
5282 zExp = aExp;
5283 goto shiftRight1;
5284 }
5285 aSig0 |= LIT64( 0x0001000000000000 );
5286 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5287 --zExp;
5288 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5289 ++zExp;
5290 shiftRight1:
5291 shift128ExtraRightJamming(
5292 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5293 roundAndPack:
5294 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5295
5296 }
5297
5298 /*----------------------------------------------------------------------------
5299 | Returns the result of subtracting the absolute values of the quadruple-
5300 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5301 | difference is negated before being returned. `zSign' is ignored if the
5302 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5303 | Standard for Binary Floating-Point Arithmetic.
5304 *----------------------------------------------------------------------------*/
5305
5306 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5307 {
5308 int32 aExp, bExp, zExp;
5309 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5310 int32 expDiff;
5311 float128 z;
5312
5313 aSig1 = extractFloat128Frac1( a );
5314 aSig0 = extractFloat128Frac0( a );
5315 aExp = extractFloat128Exp( a );
5316 bSig1 = extractFloat128Frac1( b );
5317 bSig0 = extractFloat128Frac0( b );
5318 bExp = extractFloat128Exp( b );
5319 expDiff = aExp - bExp;
5320 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5321 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5322 if ( 0 < expDiff ) goto aExpBigger;
5323 if ( expDiff < 0 ) goto bExpBigger;
5324 if ( aExp == 0x7FFF ) {
5325 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5326 return propagateFloat128NaN( a, b STATUS_VAR );
5327 }
5328 float_raise( float_flag_invalid STATUS_VAR);
5329 z.low = float128_default_nan_low;
5330 z.high = float128_default_nan_high;
5331 return z;
5332 }
5333 if ( aExp == 0 ) {
5334 aExp = 1;
5335 bExp = 1;
5336 }
5337 if ( bSig0 < aSig0 ) goto aBigger;
5338 if ( aSig0 < bSig0 ) goto bBigger;
5339 if ( bSig1 < aSig1 ) goto aBigger;
5340 if ( aSig1 < bSig1 ) goto bBigger;
5341 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5342 bExpBigger:
5343 if ( bExp == 0x7FFF ) {
5344 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5345 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5346 }
5347 if ( aExp == 0 ) {
5348 ++expDiff;
5349 }
5350 else {
5351 aSig0 |= LIT64( 0x4000000000000000 );
5352 }
5353 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5354 bSig0 |= LIT64( 0x4000000000000000 );
5355 bBigger:
5356 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5357 zExp = bExp;
5358 zSign ^= 1;
5359 goto normalizeRoundAndPack;
5360 aExpBigger:
5361 if ( aExp == 0x7FFF ) {
5362 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5363 return a;
5364 }
5365 if ( bExp == 0 ) {
5366 --expDiff;
5367 }
5368 else {
5369 bSig0 |= LIT64( 0x4000000000000000 );
5370 }
5371 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5372 aSig0 |= LIT64( 0x4000000000000000 );
5373 aBigger:
5374 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5375 zExp = aExp;
5376 normalizeRoundAndPack:
5377 --zExp;
5378 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5379
5380 }
5381
5382 /*----------------------------------------------------------------------------
5383 | Returns the result of adding the quadruple-precision floating-point values
5384 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5385 | for Binary Floating-Point Arithmetic.
5386 *----------------------------------------------------------------------------*/
5387
5388 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5389 {
5390 flag aSign, bSign;
5391
5392 aSign = extractFloat128Sign( a );
5393 bSign = extractFloat128Sign( b );
5394 if ( aSign == bSign ) {
5395 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5396 }
5397 else {
5398 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5399 }
5400
5401 }
5402
5403 /*----------------------------------------------------------------------------
5404 | Returns the result of subtracting the quadruple-precision floating-point
5405 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5406 | Standard for Binary Floating-Point Arithmetic.
5407 *----------------------------------------------------------------------------*/
5408
5409 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5410 {
5411 flag aSign, bSign;
5412
5413 aSign = extractFloat128Sign( a );
5414 bSign = extractFloat128Sign( b );
5415 if ( aSign == bSign ) {
5416 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5417 }
5418 else {
5419 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5420 }
5421
5422 }
5423
5424 /*----------------------------------------------------------------------------
5425 | Returns the result of multiplying the quadruple-precision floating-point
5426 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5427 | Standard for Binary Floating-Point Arithmetic.
5428 *----------------------------------------------------------------------------*/
5429
5430 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5431 {
5432 flag aSign, bSign, zSign;
5433 int32 aExp, bExp, zExp;
5434 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5435 float128 z;
5436
5437 aSig1 = extractFloat128Frac1( a );
5438 aSig0 = extractFloat128Frac0( a );
5439 aExp = extractFloat128Exp( a );
5440 aSign = extractFloat128Sign( a );
5441 bSig1 = extractFloat128Frac1( b );
5442 bSig0 = extractFloat128Frac0( b );
5443 bExp = extractFloat128Exp( b );
5444 bSign = extractFloat128Sign( b );
5445 zSign = aSign ^ bSign;
5446 if ( aExp == 0x7FFF ) {
5447 if ( ( aSig0 | aSig1 )
5448 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5449 return propagateFloat128NaN( a, b STATUS_VAR );
5450 }
5451 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5452 return packFloat128( zSign, 0x7FFF, 0, 0 );
5453 }
5454 if ( bExp == 0x7FFF ) {
5455 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5456 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5457 invalid:
5458 float_raise( float_flag_invalid STATUS_VAR);
5459 z.low = float128_default_nan_low;
5460 z.high = float128_default_nan_high;
5461 return z;
5462 }
5463 return packFloat128( zSign, 0x7FFF, 0, 0 );
5464 }
5465 if ( aExp == 0 ) {
5466 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5467 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5468 }
5469 if ( bExp == 0 ) {
5470 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5471 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5472 }
5473 zExp = aExp + bExp - 0x4000;
5474 aSig0 |= LIT64( 0x0001000000000000 );
5475 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5476 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5477 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5478 zSig2 |= ( zSig3 != 0 );
5479 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5480 shift128ExtraRightJamming(
5481 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5482 ++zExp;
5483 }
5484 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5485
5486 }
5487
5488 /*----------------------------------------------------------------------------
5489 | Returns the result of dividing the quadruple-precision floating-point value
5490 | `a' by the corresponding value `b'. The operation is performed according to
5491 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5492 *----------------------------------------------------------------------------*/
5493
5494 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5495 {
5496 flag aSign, bSign, zSign;
5497 int32 aExp, bExp, zExp;
5498 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5499 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5500 float128 z;
5501
5502 aSig1 = extractFloat128Frac1( a );
5503 aSig0 = extractFloat128Frac0( a );
5504 aExp = extractFloat128Exp( a );
5505 aSign = extractFloat128Sign( a );
5506 bSig1 = extractFloat128Frac1( b );
5507 bSig0 = extractFloat128Frac0( b );
5508 bExp = extractFloat128Exp( b );
5509 bSign = extractFloat128Sign( b );
5510 zSign = aSign ^ bSign;
5511 if ( aExp == 0x7FFF ) {
5512 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5513 if ( bExp == 0x7FFF ) {
5514 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5515 goto invalid;
5516 }
5517 return packFloat128( zSign, 0x7FFF, 0, 0 );
5518 }
5519 if ( bExp == 0x7FFF ) {
5520 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5521 return packFloat128( zSign, 0, 0, 0 );
5522 }
5523 if ( bExp == 0 ) {
5524 if ( ( bSig0 | bSig1 ) == 0 ) {
5525 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5526 invalid:
5527 float_raise( float_flag_invalid STATUS_VAR);
5528 z.low = float128_default_nan_low;
5529 z.high = float128_default_nan_high;
5530 return z;
5531 }
5532 float_raise( float_flag_divbyzero STATUS_VAR);
5533 return packFloat128( zSign, 0x7FFF, 0, 0 );
5534 }
5535 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5536 }
5537 if ( aExp == 0 ) {
5538 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5539 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5540 }
5541 zExp = aExp - bExp + 0x3FFD;
5542 shortShift128Left(
5543 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5544 shortShift128Left(
5545 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5546 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5547 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5548 ++zExp;
5549 }
5550 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5551 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5552 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5553 while ( (int64_t) rem0 < 0 ) {
5554 --zSig0;
5555 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5556 }
5557 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5558 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5559 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5560 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5561 while ( (int64_t) rem1 < 0 ) {
5562 --zSig1;
5563 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5564 }
5565 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5566 }
5567 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5568 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5569
5570 }
5571
5572 /*----------------------------------------------------------------------------
5573 | Returns the remainder of the quadruple-precision floating-point value `a'
5574 | with respect to the corresponding value `b'. The operation is performed
5575 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5576 *----------------------------------------------------------------------------*/
5577
5578 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5579 {
5580 flag aSign, zSign;
5581 int32 aExp, bExp, expDiff;
5582 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5583 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5584 int64_t sigMean0;
5585 float128 z;
5586
5587 aSig1 = extractFloat128Frac1( a );
5588 aSig0 = extractFloat128Frac0( a );
5589 aExp = extractFloat128Exp( a );
5590 aSign = extractFloat128Sign( a );
5591 bSig1 = extractFloat128Frac1( b );
5592 bSig0 = extractFloat128Frac0( b );
5593 bExp = extractFloat128Exp( b );
5594 if ( aExp == 0x7FFF ) {
5595 if ( ( aSig0 | aSig1 )
5596 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5597 return propagateFloat128NaN( a, b STATUS_VAR );
5598 }
5599 goto invalid;
5600 }
5601 if ( bExp == 0x7FFF ) {
5602 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5603 return a;
5604 }
5605 if ( bExp == 0 ) {
5606 if ( ( bSig0 | bSig1 ) == 0 ) {
5607 invalid:
5608 float_raise( float_flag_invalid STATUS_VAR);
5609 z.low = float128_default_nan_low;
5610 z.high = float128_default_nan_high;
5611 return z;
5612 }
5613 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5614 }
5615 if ( aExp == 0 ) {
5616 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5617 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5618 }
5619 expDiff = aExp - bExp;
5620 if ( expDiff < -1 ) return a;
5621 shortShift128Left(
5622 aSig0 | LIT64( 0x0001000000000000 ),
5623 aSig1,
5624 15 - ( expDiff < 0 ),
5625 &aSig0,
5626 &aSig1
5627 );
5628 shortShift128Left(
5629 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5630 q = le128( bSig0, bSig1, aSig0, aSig1 );
5631 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5632 expDiff -= 64;
5633 while ( 0 < expDiff ) {
5634 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5635 q = ( 4 < q ) ? q - 4 : 0;
5636 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5637 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5638 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5639 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5640 expDiff -= 61;
5641 }
5642 if ( -64 < expDiff ) {
5643 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5644 q = ( 4 < q ) ? q - 4 : 0;
5645 q >>= - expDiff;
5646 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5647 expDiff += 52;
5648 if ( expDiff < 0 ) {
5649 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5650 }
5651 else {
5652 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5653 }
5654 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5655 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5656 }
5657 else {
5658 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5659 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5660 }
5661 do {
5662 alternateASig0 = aSig0;
5663 alternateASig1 = aSig1;
5664 ++q;
5665 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5666 } while ( 0 <= (int64_t) aSig0 );
5667 add128(
5668 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5669 if ( ( sigMean0 < 0 )
5670 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5671 aSig0 = alternateASig0;
5672 aSig1 = alternateASig1;
5673 }
5674 zSign = ( (int64_t) aSig0 < 0 );
5675 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5676 return
5677 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5678
5679 }
5680
5681 /*----------------------------------------------------------------------------
5682 | Returns the square root of the quadruple-precision floating-point value `a'.
5683 | The operation is performed according to the IEC/IEEE Standard for Binary
5684 | Floating-Point Arithmetic.
5685 *----------------------------------------------------------------------------*/
5686
5687 float128 float128_sqrt( float128 a STATUS_PARAM )
5688 {
5689 flag aSign;
5690 int32 aExp, zExp;
5691 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5692 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5693 float128 z;
5694
5695 aSig1 = extractFloat128Frac1( a );
5696 aSig0 = extractFloat128Frac0( a );
5697 aExp = extractFloat128Exp( a );
5698 aSign = extractFloat128Sign( a );
5699 if ( aExp == 0x7FFF ) {
5700 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5701 if ( ! aSign ) return a;
5702 goto invalid;
5703 }
5704 if ( aSign ) {
5705 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5706 invalid:
5707 float_raise( float_flag_invalid STATUS_VAR);
5708 z.low = float128_default_nan_low;
5709 z.high = float128_default_nan_high;
5710 return z;
5711 }
5712 if ( aExp == 0 ) {
5713 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5714 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5715 }
5716 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5717 aSig0 |= LIT64( 0x0001000000000000 );
5718 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5719 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5720 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5721 doubleZSig0 = zSig0<<1;
5722 mul64To128( zSig0, zSig0, &term0, &term1 );
5723 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5724 while ( (int64_t) rem0 < 0 ) {
5725 --zSig0;
5726 doubleZSig0 -= 2;
5727 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5728 }
5729 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5730 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5731 if ( zSig1 == 0 ) zSig1 = 1;
5732 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5733 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5734 mul64To128( zSig1, zSig1, &term2, &term3 );
5735 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5736 while ( (int64_t) rem1 < 0 ) {
5737 --zSig1;
5738 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5739 term3 |= 1;
5740 term2 |= doubleZSig0;
5741 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5742 }
5743 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5744 }
5745 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5746 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5747
5748 }
5749
5750 /*----------------------------------------------------------------------------
5751 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5752 | the corresponding value `b', and 0 otherwise. The invalid exception is
5753 | raised if either operand is a NaN. Otherwise, the comparison is performed
5754 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5755 *----------------------------------------------------------------------------*/
5756
5757 int float128_eq( float128 a, float128 b STATUS_PARAM )
5758 {
5759
5760 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5761 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5762 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5763 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5764 ) {
5765 float_raise( float_flag_invalid STATUS_VAR);
5766 return 0;
5767 }
5768 return
5769 ( a.low == b.low )
5770 && ( ( a.high == b.high )
5771 || ( ( a.low == 0 )
5772 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5773 );
5774
5775 }
5776
5777 /*----------------------------------------------------------------------------
5778 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5779 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5780 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5781 | Arithmetic.
5782 *----------------------------------------------------------------------------*/
5783
5784 int float128_le( float128 a, float128 b STATUS_PARAM )
5785 {
5786 flag aSign, bSign;
5787
5788 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5789 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5790 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5791 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5792 ) {
5793 float_raise( float_flag_invalid STATUS_VAR);
5794 return 0;
5795 }
5796 aSign = extractFloat128Sign( a );
5797 bSign = extractFloat128Sign( b );
5798 if ( aSign != bSign ) {
5799 return
5800 aSign
5801 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5802 == 0 );
5803 }
5804 return
5805 aSign ? le128( b.high, b.low, a.high, a.low )
5806 : le128( a.high, a.low, b.high, b.low );
5807
5808 }
5809
5810 /*----------------------------------------------------------------------------
5811 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5812 | the corresponding value `b', and 0 otherwise. The comparison is performed
5813 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5814 *----------------------------------------------------------------------------*/
5815
5816 int float128_lt( float128 a, float128 b STATUS_PARAM )
5817 {
5818 flag aSign, bSign;
5819
5820 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5821 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5822 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5823 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5824 ) {
5825 float_raise( float_flag_invalid STATUS_VAR);
5826 return 0;
5827 }
5828 aSign = extractFloat128Sign( a );
5829 bSign = extractFloat128Sign( b );
5830 if ( aSign != bSign ) {
5831 return
5832 aSign
5833 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5834 != 0 );
5835 }
5836 return
5837 aSign ? lt128( b.high, b.low, a.high, a.low )
5838 : lt128( a.high, a.low, b.high, b.low );
5839
5840 }
5841
5842 /*----------------------------------------------------------------------------
5843 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5844 | be compared, and 0 otherwise. The comparison is performed according to the
5845 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5846 *----------------------------------------------------------------------------*/
5847
5848 int float128_unordered( float128 a, float128 b STATUS_PARAM )
5849 {
5850 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5851 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5852 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5853 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5854 ) {
5855 float_raise( float_flag_invalid STATUS_VAR);
5856 return 1;
5857 }
5858 return 0;
5859 }
5860
5861 /*----------------------------------------------------------------------------
5862 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5863 | the corresponding value `b', and 0 otherwise. The comparison is performed
5864 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5865 *----------------------------------------------------------------------------*/
5866
5867 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5868 {
5869
5870 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5871 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5872 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5873 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5874 ) {
5875 if ( float128_is_signaling_nan( a )
5876 || float128_is_signaling_nan( b ) ) {
5877 float_raise( float_flag_invalid STATUS_VAR);
5878 }
5879 return 0;
5880 }
5881 return
5882 ( a.low == b.low )
5883 && ( ( a.high == b.high )
5884 || ( ( a.low == 0 )
5885 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5886 );
5887
5888 }
5889
5890 /*----------------------------------------------------------------------------
5891 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5892 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5893 | cause an exception. Otherwise, the comparison is performed according to the
5894 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5895 *----------------------------------------------------------------------------*/
5896
5897 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5898 {
5899 flag aSign, bSign;
5900
5901 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5902 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5903 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5904 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5905 ) {
5906 if ( float128_is_signaling_nan( a )
5907 || float128_is_signaling_nan( b ) ) {
5908 float_raise( float_flag_invalid STATUS_VAR);
5909 }
5910 return 0;
5911 }
5912 aSign = extractFloat128Sign( a );
5913 bSign = extractFloat128Sign( b );
5914 if ( aSign != bSign ) {
5915 return
5916 aSign
5917 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5918 == 0 );
5919 }
5920 return
5921 aSign ? le128( b.high, b.low, a.high, a.low )
5922 : le128( a.high, a.low, b.high, b.low );
5923
5924 }
5925
5926 /*----------------------------------------------------------------------------
5927 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5928 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5929 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5930 | Standard for Binary Floating-Point Arithmetic.
5931 *----------------------------------------------------------------------------*/
5932
5933 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5934 {
5935 flag aSign, bSign;
5936
5937 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5938 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5939 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5940 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5941 ) {
5942 if ( float128_is_signaling_nan( a )
5943 || float128_is_signaling_nan( b ) ) {
5944 float_raise( float_flag_invalid STATUS_VAR);
5945 }
5946 return 0;
5947 }
5948 aSign = extractFloat128Sign( a );
5949 bSign = extractFloat128Sign( b );
5950 if ( aSign != bSign ) {
5951 return
5952 aSign
5953 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5954 != 0 );
5955 }
5956 return
5957 aSign ? lt128( b.high, b.low, a.high, a.low )
5958 : lt128( a.high, a.low, b.high, b.low );
5959
5960 }
5961
5962 /*----------------------------------------------------------------------------
5963 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5964 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5965 | comparison is performed according to the IEC/IEEE Standard for Binary
5966 | Floating-Point Arithmetic.
5967 *----------------------------------------------------------------------------*/
5968
5969 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5970 {
5971 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5972 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5973 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5974 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5975 ) {
5976 if ( float128_is_signaling_nan( a )
5977 || float128_is_signaling_nan( b ) ) {
5978 float_raise( float_flag_invalid STATUS_VAR);
5979 }
5980 return 1;
5981 }
5982 return 0;
5983 }
5984
5985 #endif
5986
5987 /* misc functions */
5988 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5989 {
5990 return int64_to_float32(a STATUS_VAR);
5991 }
5992
5993 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5994 {
5995 return int64_to_float64(a STATUS_VAR);
5996 }
5997
5998 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5999 {
6000 int64_t v;
6001 unsigned int res;
6002
6003 v = float32_to_int64(a STATUS_VAR);
6004 if (v < 0) {
6005 res = 0;
6006 float_raise( float_flag_invalid STATUS_VAR);
6007 } else if (v > 0xffffffff) {
6008 res = 0xffffffff;
6009 float_raise( float_flag_invalid STATUS_VAR);
6010 } else {
6011 res = v;
6012 }
6013 return res;
6014 }
6015
6016 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6017 {
6018 int64_t v;
6019 unsigned int res;
6020
6021 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6022 if (v < 0) {
6023 res = 0;
6024 float_raise( float_flag_invalid STATUS_VAR);
6025 } else if (v > 0xffffffff) {
6026 res = 0xffffffff;
6027 float_raise( float_flag_invalid STATUS_VAR);
6028 } else {
6029 res = v;
6030 }
6031 return res;
6032 }
6033
6034 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6035 {
6036 int64_t v;
6037 unsigned int res;
6038
6039 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6040 if (v < 0) {
6041 res = 0;
6042 float_raise( float_flag_invalid STATUS_VAR);
6043 } else if (v > 0xffff) {
6044 res = 0xffff;
6045 float_raise( float_flag_invalid STATUS_VAR);
6046 } else {
6047 res = v;
6048 }
6049 return res;
6050 }
6051
6052 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6053 {
6054 int64_t v;
6055 unsigned int res;
6056
6057 v = float64_to_int64(a STATUS_VAR);
6058 if (v < 0) {
6059 res = 0;
6060 float_raise( float_flag_invalid STATUS_VAR);
6061 } else if (v > 0xffffffff) {
6062 res = 0xffffffff;
6063 float_raise( float_flag_invalid STATUS_VAR);
6064 } else {
6065 res = v;
6066 }
6067 return res;
6068 }
6069
6070 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6071 {
6072 int64_t v;
6073 unsigned int res;
6074
6075 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6076 if (v < 0) {
6077 res = 0;
6078 float_raise( float_flag_invalid STATUS_VAR);
6079 } else if (v > 0xffffffff) {
6080 res = 0xffffffff;
6081 float_raise( float_flag_invalid STATUS_VAR);
6082 } else {
6083 res = v;
6084 }
6085 return res;
6086 }
6087
6088 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6089 {
6090 int64_t v;
6091 unsigned int res;
6092
6093 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6094 if (v < 0) {
6095 res = 0;
6096 float_raise( float_flag_invalid STATUS_VAR);
6097 } else if (v > 0xffff) {
6098 res = 0xffff;
6099 float_raise( float_flag_invalid STATUS_VAR);
6100 } else {
6101 res = v;
6102 }
6103 return res;
6104 }
6105
6106 /* FIXME: This looks broken. */
6107 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6108 {
6109 int64_t v;
6110
6111 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6112 v += float64_val(a);
6113 v = float64_to_int64(make_float64(v) STATUS_VAR);
6114
6115 return v - INT64_MIN;
6116 }
6117
6118 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6119 {
6120 int64_t v;
6121
6122 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6123 v += float64_val(a);
6124 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6125
6126 return v - INT64_MIN;
6127 }
6128
6129 #define COMPARE(s, nan_exp) \
6130 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6131 int is_quiet STATUS_PARAM ) \
6132 { \
6133 flag aSign, bSign; \
6134 uint ## s ## _t av, bv; \
6135 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6136 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6137 \
6138 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6139 extractFloat ## s ## Frac( a ) ) || \
6140 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6141 extractFloat ## s ## Frac( b ) )) { \
6142 if (!is_quiet || \
6143 float ## s ## _is_signaling_nan( a ) || \
6144 float ## s ## _is_signaling_nan( b ) ) { \
6145 float_raise( float_flag_invalid STATUS_VAR); \
6146 } \
6147 return float_relation_unordered; \
6148 } \
6149 aSign = extractFloat ## s ## Sign( a ); \
6150 bSign = extractFloat ## s ## Sign( b ); \
6151 av = float ## s ## _val(a); \
6152 bv = float ## s ## _val(b); \
6153 if ( aSign != bSign ) { \
6154 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6155 /* zero case */ \
6156 return float_relation_equal; \
6157 } else { \
6158 return 1 - (2 * aSign); \
6159 } \
6160 } else { \
6161 if (av == bv) { \
6162 return float_relation_equal; \
6163 } else { \
6164 return 1 - 2 * (aSign ^ ( av < bv )); \
6165 } \
6166 } \
6167 } \
6168 \
6169 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6170 { \
6171 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6172 } \
6173 \
6174 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6175 { \
6176 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6177 }
6178
6179 COMPARE(32, 0xff)
6180 COMPARE(64, 0x7ff)
6181
6182 INLINE int float128_compare_internal( float128 a, float128 b,
6183 int is_quiet STATUS_PARAM )
6184 {
6185 flag aSign, bSign;
6186
6187 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6188 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6189 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6190 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6191 if (!is_quiet ||
6192 float128_is_signaling_nan( a ) ||
6193 float128_is_signaling_nan( b ) ) {
6194 float_raise( float_flag_invalid STATUS_VAR);
6195 }
6196 return float_relation_unordered;
6197 }
6198 aSign = extractFloat128Sign( a );
6199 bSign = extractFloat128Sign( b );
6200 if ( aSign != bSign ) {
6201 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6202 /* zero case */
6203 return float_relation_equal;
6204 } else {
6205 return 1 - (2 * aSign);
6206 }
6207 } else {
6208 if (a.low == b.low && a.high == b.high) {
6209 return float_relation_equal;
6210 } else {
6211 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6212 }
6213 }
6214 }
6215
6216 int float128_compare( float128 a, float128 b STATUS_PARAM )
6217 {
6218 return float128_compare_internal(a, b, 0 STATUS_VAR);
6219 }
6220
6221 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6222 {
6223 return float128_compare_internal(a, b, 1 STATUS_VAR);
6224 }
6225
6226 /* min() and max() functions. These can't be implemented as
6227 * 'compare and pick one input' because that would mishandle
6228 * NaNs and +0 vs -0.
6229 */
6230 #define MINMAX(s, nan_exp) \
6231 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6232 int ismin STATUS_PARAM ) \
6233 { \
6234 flag aSign, bSign; \
6235 uint ## s ## _t av, bv; \
6236 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6237 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6238 if (float ## s ## _is_any_nan(a) || \
6239 float ## s ## _is_any_nan(b)) { \
6240 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6241 } \
6242 aSign = extractFloat ## s ## Sign(a); \
6243 bSign = extractFloat ## s ## Sign(b); \
6244 av = float ## s ## _val(a); \
6245 bv = float ## s ## _val(b); \
6246 if (aSign != bSign) { \
6247 if (ismin) { \
6248 return aSign ? a : b; \
6249 } else { \
6250 return aSign ? b : a; \
6251 } \
6252 } else { \
6253 if (ismin) { \
6254 return (aSign ^ (av < bv)) ? a : b; \
6255 } else { \
6256 return (aSign ^ (av < bv)) ? b : a; \
6257 } \
6258 } \
6259 } \
6260 \
6261 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6262 { \
6263 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6264 } \
6265 \
6266 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6267 { \
6268 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6269 }
6270
6271 MINMAX(32, 0xff)
6272 MINMAX(64, 0x7ff)
6273
6274
6275 /* Multiply A by 2 raised to the power N. */
6276 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6277 {
6278 flag aSign;
6279 int16 aExp;
6280 uint32_t aSig;
6281
6282 a = float32_squash_input_denormal(a STATUS_VAR);
6283 aSig = extractFloat32Frac( a );
6284 aExp = extractFloat32Exp( a );
6285 aSign = extractFloat32Sign( a );
6286
6287 if ( aExp == 0xFF ) {
6288 return a;
6289 }
6290 if ( aExp != 0 )
6291 aSig |= 0x00800000;
6292 else if ( aSig == 0 )
6293 return a;
6294
6295 aExp += n - 1;
6296 aSig <<= 7;
6297 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6298 }
6299
6300 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6301 {
6302 flag aSign;
6303 int16 aExp;
6304 uint64_t aSig;
6305
6306 a = float64_squash_input_denormal(a STATUS_VAR);
6307 aSig = extractFloat64Frac( a );
6308 aExp = extractFloat64Exp( a );
6309 aSign = extractFloat64Sign( a );
6310
6311 if ( aExp == 0x7FF ) {
6312 return a;
6313 }
6314 if ( aExp != 0 )
6315 aSig |= LIT64( 0x0010000000000000 );
6316 else if ( aSig == 0 )
6317 return a;
6318
6319 aExp += n - 1;
6320 aSig <<= 10;
6321 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6322 }
6323
6324 #ifdef FLOATX80
6325 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6326 {
6327 flag aSign;
6328 int16 aExp;
6329 uint64_t aSig;
6330
6331 aSig = extractFloatx80Frac( a );
6332 aExp = extractFloatx80Exp( a );
6333 aSign = extractFloatx80Sign( a );
6334
6335 if ( aExp == 0x7FF ) {
6336 return a;
6337 }
6338 if (aExp == 0 && aSig == 0)
6339 return a;
6340
6341 aExp += n;
6342 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6343 aSign, aExp, aSig, 0 STATUS_VAR );
6344 }
6345 #endif
6346
6347 #ifdef FLOAT128
6348 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6349 {
6350 flag aSign;
6351 int32 aExp;
6352 uint64_t aSig0, aSig1;
6353
6354 aSig1 = extractFloat128Frac1( a );
6355 aSig0 = extractFloat128Frac0( a );
6356 aExp = extractFloat128Exp( a );
6357 aSign = extractFloat128Sign( a );
6358 if ( aExp == 0x7FFF ) {
6359 return a;
6360 }
6361 if ( aExp != 0 )
6362 aSig0 |= LIT64( 0x0001000000000000 );
6363 else if ( aSig0 == 0 && aSig1 == 0 )
6364 return a;
6365
6366 aExp += n - 1;
6367 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6368 STATUS_VAR );
6369
6370 }
6371 #endif