]> git.proxmox.com Git - qemu.git/blob - fpu/softfloat.c
softfloat: Add float16 type and float16 NaN handling functions
[qemu.git] / fpu / softfloat.c
1
2 /*============================================================================
3
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
6
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
16
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
30
31 =============================================================================*/
32
33 #include "softfloat.h"
34
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
38 | desired.)
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
41
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
48 | specific.
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
51
52 void set_float_rounding_mode(int val STATUS_PARAM)
53 {
54 STATUS(float_rounding_mode) = val;
55 }
56
57 void set_float_exception_flags(int val STATUS_PARAM)
58 {
59 STATUS(float_exception_flags) = val;
60 }
61
62 #ifdef FLOATX80
63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
64 {
65 STATUS(floatx80_rounding_precision) = val;
66 }
67 #endif
68
69 /*----------------------------------------------------------------------------
70 | Returns the fraction bits of the half-precision floating-point value `a'.
71 *----------------------------------------------------------------------------*/
72
73 INLINE uint32_t extractFloat16Frac(float16 a)
74 {
75 return float16_val(a) & 0x3ff;
76 }
77
78 /*----------------------------------------------------------------------------
79 | Returns the exponent bits of the half-precision floating-point value `a'.
80 *----------------------------------------------------------------------------*/
81
82 INLINE int16 extractFloat16Exp(float16 a)
83 {
84 return (float16_val(a) >> 10) & 0x1f;
85 }
86
87 /*----------------------------------------------------------------------------
88 | Returns the sign bit of the single-precision floating-point value `a'.
89 *----------------------------------------------------------------------------*/
90
91 INLINE flag extractFloat16Sign(float16 a)
92 {
93 return float16_val(a)>>15;
94 }
95
96 /*----------------------------------------------------------------------------
97 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
98 | and 7, and returns the properly rounded 32-bit integer corresponding to the
99 | input. If `zSign' is 1, the input is negated before being converted to an
100 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
101 | is simply rounded to an integer, with the inexact exception raised if the
102 | input cannot be represented exactly as an integer. However, if the fixed-
103 | point input is too large, the invalid exception is raised and the largest
104 | positive or negative integer is returned.
105 *----------------------------------------------------------------------------*/
106
107 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
108 {
109 int8 roundingMode;
110 flag roundNearestEven;
111 int8 roundIncrement, roundBits;
112 int32 z;
113
114 roundingMode = STATUS(float_rounding_mode);
115 roundNearestEven = ( roundingMode == float_round_nearest_even );
116 roundIncrement = 0x40;
117 if ( ! roundNearestEven ) {
118 if ( roundingMode == float_round_to_zero ) {
119 roundIncrement = 0;
120 }
121 else {
122 roundIncrement = 0x7F;
123 if ( zSign ) {
124 if ( roundingMode == float_round_up ) roundIncrement = 0;
125 }
126 else {
127 if ( roundingMode == float_round_down ) roundIncrement = 0;
128 }
129 }
130 }
131 roundBits = absZ & 0x7F;
132 absZ = ( absZ + roundIncrement )>>7;
133 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
134 z = absZ;
135 if ( zSign ) z = - z;
136 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
137 float_raise( float_flag_invalid STATUS_VAR);
138 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
139 }
140 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
141 return z;
142
143 }
144
145 /*----------------------------------------------------------------------------
146 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
147 | `absZ1', with binary point between bits 63 and 64 (between the input words),
148 | and returns the properly rounded 64-bit integer corresponding to the input.
149 | If `zSign' is 1, the input is negated before being converted to an integer.
150 | Ordinarily, the fixed-point input is simply rounded to an integer, with
151 | the inexact exception raised if the input cannot be represented exactly as
152 | an integer. However, if the fixed-point input is too large, the invalid
153 | exception is raised and the largest positive or negative integer is
154 | returned.
155 *----------------------------------------------------------------------------*/
156
157 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
158 {
159 int8 roundingMode;
160 flag roundNearestEven, increment;
161 int64 z;
162
163 roundingMode = STATUS(float_rounding_mode);
164 roundNearestEven = ( roundingMode == float_round_nearest_even );
165 increment = ( (sbits64) absZ1 < 0 );
166 if ( ! roundNearestEven ) {
167 if ( roundingMode == float_round_to_zero ) {
168 increment = 0;
169 }
170 else {
171 if ( zSign ) {
172 increment = ( roundingMode == float_round_down ) && absZ1;
173 }
174 else {
175 increment = ( roundingMode == float_round_up ) && absZ1;
176 }
177 }
178 }
179 if ( increment ) {
180 ++absZ0;
181 if ( absZ0 == 0 ) goto overflow;
182 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
183 }
184 z = absZ0;
185 if ( zSign ) z = - z;
186 if ( z && ( ( z < 0 ) ^ zSign ) ) {
187 overflow:
188 float_raise( float_flag_invalid STATUS_VAR);
189 return
190 zSign ? (sbits64) LIT64( 0x8000000000000000 )
191 : LIT64( 0x7FFFFFFFFFFFFFFF );
192 }
193 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
194 return z;
195
196 }
197
198 /*----------------------------------------------------------------------------
199 | Returns the fraction bits of the single-precision floating-point value `a'.
200 *----------------------------------------------------------------------------*/
201
202 INLINE bits32 extractFloat32Frac( float32 a )
203 {
204
205 return float32_val(a) & 0x007FFFFF;
206
207 }
208
209 /*----------------------------------------------------------------------------
210 | Returns the exponent bits of the single-precision floating-point value `a'.
211 *----------------------------------------------------------------------------*/
212
213 INLINE int16 extractFloat32Exp( float32 a )
214 {
215
216 return ( float32_val(a)>>23 ) & 0xFF;
217
218 }
219
220 /*----------------------------------------------------------------------------
221 | Returns the sign bit of the single-precision floating-point value `a'.
222 *----------------------------------------------------------------------------*/
223
224 INLINE flag extractFloat32Sign( float32 a )
225 {
226
227 return float32_val(a)>>31;
228
229 }
230
231 /*----------------------------------------------------------------------------
232 | If `a' is denormal and we are in flush-to-zero mode then set the
233 | input-denormal exception and return zero. Otherwise just return the value.
234 *----------------------------------------------------------------------------*/
235 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
236 {
237 if (STATUS(flush_inputs_to_zero)) {
238 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
239 float_raise(float_flag_input_denormal STATUS_VAR);
240 return make_float32(float32_val(a) & 0x80000000);
241 }
242 }
243 return a;
244 }
245
246 /*----------------------------------------------------------------------------
247 | Normalizes the subnormal single-precision floating-point value represented
248 | by the denormalized significand `aSig'. The normalized exponent and
249 | significand are stored at the locations pointed to by `zExpPtr' and
250 | `zSigPtr', respectively.
251 *----------------------------------------------------------------------------*/
252
253 static void
254 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
255 {
256 int8 shiftCount;
257
258 shiftCount = countLeadingZeros32( aSig ) - 8;
259 *zSigPtr = aSig<<shiftCount;
260 *zExpPtr = 1 - shiftCount;
261
262 }
263
264 /*----------------------------------------------------------------------------
265 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
266 | single-precision floating-point value, returning the result. After being
267 | shifted into the proper positions, the three fields are simply added
268 | together to form the result. This means that any integer portion of `zSig'
269 | will be added into the exponent. Since a properly normalized significand
270 | will have an integer portion equal to 1, the `zExp' input should be 1 less
271 | than the desired result exponent whenever `zSig' is a complete, normalized
272 | significand.
273 *----------------------------------------------------------------------------*/
274
275 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
276 {
277
278 return make_float32(
279 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
280
281 }
282
283 /*----------------------------------------------------------------------------
284 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
285 | and significand `zSig', and returns the proper single-precision floating-
286 | point value corresponding to the abstract input. Ordinarily, the abstract
287 | value is simply rounded and packed into the single-precision format, with
288 | the inexact exception raised if the abstract input cannot be represented
289 | exactly. However, if the abstract value is too large, the overflow and
290 | inexact exceptions are raised and an infinity or maximal finite value is
291 | returned. If the abstract value is too small, the input value is rounded to
292 | a subnormal number, and the underflow and inexact exceptions are raised if
293 | the abstract input cannot be represented exactly as a subnormal single-
294 | precision floating-point number.
295 | The input significand `zSig' has its binary point between bits 30
296 | and 29, which is 7 bits to the left of the usual location. This shifted
297 | significand must be normalized or smaller. If `zSig' is not normalized,
298 | `zExp' must be 0; in that case, the result returned is a subnormal number,
299 | and it must not require rounding. In the usual case that `zSig' is
300 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
301 | The handling of underflow and overflow follows the IEC/IEEE Standard for
302 | Binary Floating-Point Arithmetic.
303 *----------------------------------------------------------------------------*/
304
305 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
306 {
307 int8 roundingMode;
308 flag roundNearestEven;
309 int8 roundIncrement, roundBits;
310 flag isTiny;
311
312 roundingMode = STATUS(float_rounding_mode);
313 roundNearestEven = ( roundingMode == float_round_nearest_even );
314 roundIncrement = 0x40;
315 if ( ! roundNearestEven ) {
316 if ( roundingMode == float_round_to_zero ) {
317 roundIncrement = 0;
318 }
319 else {
320 roundIncrement = 0x7F;
321 if ( zSign ) {
322 if ( roundingMode == float_round_up ) roundIncrement = 0;
323 }
324 else {
325 if ( roundingMode == float_round_down ) roundIncrement = 0;
326 }
327 }
328 }
329 roundBits = zSig & 0x7F;
330 if ( 0xFD <= (bits16) zExp ) {
331 if ( ( 0xFD < zExp )
332 || ( ( zExp == 0xFD )
333 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
334 ) {
335 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
336 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
337 }
338 if ( zExp < 0 ) {
339 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
340 isTiny =
341 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
342 || ( zExp < -1 )
343 || ( zSig + roundIncrement < 0x80000000 );
344 shift32RightJamming( zSig, - zExp, &zSig );
345 zExp = 0;
346 roundBits = zSig & 0x7F;
347 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
348 }
349 }
350 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
351 zSig = ( zSig + roundIncrement )>>7;
352 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
353 if ( zSig == 0 ) zExp = 0;
354 return packFloat32( zSign, zExp, zSig );
355
356 }
357
358 /*----------------------------------------------------------------------------
359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
360 | and significand `zSig', and returns the proper single-precision floating-
361 | point value corresponding to the abstract input. This routine is just like
362 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
363 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
364 | floating-point exponent.
365 *----------------------------------------------------------------------------*/
366
367 static float32
368 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
369 {
370 int8 shiftCount;
371
372 shiftCount = countLeadingZeros32( zSig ) - 1;
373 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
374
375 }
376
377 /*----------------------------------------------------------------------------
378 | Returns the fraction bits of the double-precision floating-point value `a'.
379 *----------------------------------------------------------------------------*/
380
381 INLINE bits64 extractFloat64Frac( float64 a )
382 {
383
384 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
385
386 }
387
388 /*----------------------------------------------------------------------------
389 | Returns the exponent bits of the double-precision floating-point value `a'.
390 *----------------------------------------------------------------------------*/
391
392 INLINE int16 extractFloat64Exp( float64 a )
393 {
394
395 return ( float64_val(a)>>52 ) & 0x7FF;
396
397 }
398
399 /*----------------------------------------------------------------------------
400 | Returns the sign bit of the double-precision floating-point value `a'.
401 *----------------------------------------------------------------------------*/
402
403 INLINE flag extractFloat64Sign( float64 a )
404 {
405
406 return float64_val(a)>>63;
407
408 }
409
410 /*----------------------------------------------------------------------------
411 | If `a' is denormal and we are in flush-to-zero mode then set the
412 | input-denormal exception and return zero. Otherwise just return the value.
413 *----------------------------------------------------------------------------*/
414 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
415 {
416 if (STATUS(flush_inputs_to_zero)) {
417 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
418 float_raise(float_flag_input_denormal STATUS_VAR);
419 return make_float64(float64_val(a) & (1ULL << 63));
420 }
421 }
422 return a;
423 }
424
425 /*----------------------------------------------------------------------------
426 | Normalizes the subnormal double-precision floating-point value represented
427 | by the denormalized significand `aSig'. The normalized exponent and
428 | significand are stored at the locations pointed to by `zExpPtr' and
429 | `zSigPtr', respectively.
430 *----------------------------------------------------------------------------*/
431
432 static void
433 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
434 {
435 int8 shiftCount;
436
437 shiftCount = countLeadingZeros64( aSig ) - 11;
438 *zSigPtr = aSig<<shiftCount;
439 *zExpPtr = 1 - shiftCount;
440
441 }
442
443 /*----------------------------------------------------------------------------
444 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
445 | double-precision floating-point value, returning the result. After being
446 | shifted into the proper positions, the three fields are simply added
447 | together to form the result. This means that any integer portion of `zSig'
448 | will be added into the exponent. Since a properly normalized significand
449 | will have an integer portion equal to 1, the `zExp' input should be 1 less
450 | than the desired result exponent whenever `zSig' is a complete, normalized
451 | significand.
452 *----------------------------------------------------------------------------*/
453
454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
455 {
456
457 return make_float64(
458 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
459
460 }
461
462 /*----------------------------------------------------------------------------
463 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
464 | and significand `zSig', and returns the proper double-precision floating-
465 | point value corresponding to the abstract input. Ordinarily, the abstract
466 | value is simply rounded and packed into the double-precision format, with
467 | the inexact exception raised if the abstract input cannot be represented
468 | exactly. However, if the abstract value is too large, the overflow and
469 | inexact exceptions are raised and an infinity or maximal finite value is
470 | returned. If the abstract value is too small, the input value is rounded
471 | to a subnormal number, and the underflow and inexact exceptions are raised
472 | if the abstract input cannot be represented exactly as a subnormal double-
473 | precision floating-point number.
474 | The input significand `zSig' has its binary point between bits 62
475 | and 61, which is 10 bits to the left of the usual location. This shifted
476 | significand must be normalized or smaller. If `zSig' is not normalized,
477 | `zExp' must be 0; in that case, the result returned is a subnormal number,
478 | and it must not require rounding. In the usual case that `zSig' is
479 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
480 | The handling of underflow and overflow follows the IEC/IEEE Standard for
481 | Binary Floating-Point Arithmetic.
482 *----------------------------------------------------------------------------*/
483
484 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
485 {
486 int8 roundingMode;
487 flag roundNearestEven;
488 int16 roundIncrement, roundBits;
489 flag isTiny;
490
491 roundingMode = STATUS(float_rounding_mode);
492 roundNearestEven = ( roundingMode == float_round_nearest_even );
493 roundIncrement = 0x200;
494 if ( ! roundNearestEven ) {
495 if ( roundingMode == float_round_to_zero ) {
496 roundIncrement = 0;
497 }
498 else {
499 roundIncrement = 0x3FF;
500 if ( zSign ) {
501 if ( roundingMode == float_round_up ) roundIncrement = 0;
502 }
503 else {
504 if ( roundingMode == float_round_down ) roundIncrement = 0;
505 }
506 }
507 }
508 roundBits = zSig & 0x3FF;
509 if ( 0x7FD <= (bits16) zExp ) {
510 if ( ( 0x7FD < zExp )
511 || ( ( zExp == 0x7FD )
512 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
513 ) {
514 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
515 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
516 }
517 if ( zExp < 0 ) {
518 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
519 isTiny =
520 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
521 || ( zExp < -1 )
522 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
523 shift64RightJamming( zSig, - zExp, &zSig );
524 zExp = 0;
525 roundBits = zSig & 0x3FF;
526 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
527 }
528 }
529 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
530 zSig = ( zSig + roundIncrement )>>10;
531 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
532 if ( zSig == 0 ) zExp = 0;
533 return packFloat64( zSign, zExp, zSig );
534
535 }
536
537 /*----------------------------------------------------------------------------
538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
539 | and significand `zSig', and returns the proper double-precision floating-
540 | point value corresponding to the abstract input. This routine is just like
541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
543 | floating-point exponent.
544 *----------------------------------------------------------------------------*/
545
546 static float64
547 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
548 {
549 int8 shiftCount;
550
551 shiftCount = countLeadingZeros64( zSig ) - 1;
552 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
553
554 }
555
556 #ifdef FLOATX80
557
558 /*----------------------------------------------------------------------------
559 | Returns the fraction bits of the extended double-precision floating-point
560 | value `a'.
561 *----------------------------------------------------------------------------*/
562
563 INLINE bits64 extractFloatx80Frac( floatx80 a )
564 {
565
566 return a.low;
567
568 }
569
570 /*----------------------------------------------------------------------------
571 | Returns the exponent bits of the extended double-precision floating-point
572 | value `a'.
573 *----------------------------------------------------------------------------*/
574
575 INLINE int32 extractFloatx80Exp( floatx80 a )
576 {
577
578 return a.high & 0x7FFF;
579
580 }
581
582 /*----------------------------------------------------------------------------
583 | Returns the sign bit of the extended double-precision floating-point value
584 | `a'.
585 *----------------------------------------------------------------------------*/
586
587 INLINE flag extractFloatx80Sign( floatx80 a )
588 {
589
590 return a.high>>15;
591
592 }
593
594 /*----------------------------------------------------------------------------
595 | Normalizes the subnormal extended double-precision floating-point value
596 | represented by the denormalized significand `aSig'. The normalized exponent
597 | and significand are stored at the locations pointed to by `zExpPtr' and
598 | `zSigPtr', respectively.
599 *----------------------------------------------------------------------------*/
600
601 static void
602 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
603 {
604 int8 shiftCount;
605
606 shiftCount = countLeadingZeros64( aSig );
607 *zSigPtr = aSig<<shiftCount;
608 *zExpPtr = 1 - shiftCount;
609
610 }
611
612 /*----------------------------------------------------------------------------
613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
614 | extended double-precision floating-point value, returning the result.
615 *----------------------------------------------------------------------------*/
616
617 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
618 {
619 floatx80 z;
620
621 z.low = zSig;
622 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
623 return z;
624
625 }
626
627 /*----------------------------------------------------------------------------
628 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 | and returns the proper extended double-precision floating-point value
631 | corresponding to the abstract input. Ordinarily, the abstract value is
632 | rounded and packed into the extended double-precision format, with the
633 | inexact exception raised if the abstract input cannot be represented
634 | exactly. However, if the abstract value is too large, the overflow and
635 | inexact exceptions are raised and an infinity or maximal finite value is
636 | returned. If the abstract value is too small, the input value is rounded to
637 | a subnormal number, and the underflow and inexact exceptions are raised if
638 | the abstract input cannot be represented exactly as a subnormal extended
639 | double-precision floating-point number.
640 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 | number of bits as single or double precision, respectively. Otherwise, the
642 | result is rounded to the full precision of the extended double-precision
643 | format.
644 | The input significand must be normalized or smaller. If the input
645 | significand is not normalized, `zExp' must be 0; in that case, the result
646 | returned is a subnormal number, and it must not require rounding. The
647 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 | Floating-Point Arithmetic.
649 *----------------------------------------------------------------------------*/
650
651 static floatx80
652 roundAndPackFloatx80(
653 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
654 STATUS_PARAM)
655 {
656 int8 roundingMode;
657 flag roundNearestEven, increment, isTiny;
658 int64 roundIncrement, roundMask, roundBits;
659
660 roundingMode = STATUS(float_rounding_mode);
661 roundNearestEven = ( roundingMode == float_round_nearest_even );
662 if ( roundingPrecision == 80 ) goto precision80;
663 if ( roundingPrecision == 64 ) {
664 roundIncrement = LIT64( 0x0000000000000400 );
665 roundMask = LIT64( 0x00000000000007FF );
666 }
667 else if ( roundingPrecision == 32 ) {
668 roundIncrement = LIT64( 0x0000008000000000 );
669 roundMask = LIT64( 0x000000FFFFFFFFFF );
670 }
671 else {
672 goto precision80;
673 }
674 zSig0 |= ( zSig1 != 0 );
675 if ( ! roundNearestEven ) {
676 if ( roundingMode == float_round_to_zero ) {
677 roundIncrement = 0;
678 }
679 else {
680 roundIncrement = roundMask;
681 if ( zSign ) {
682 if ( roundingMode == float_round_up ) roundIncrement = 0;
683 }
684 else {
685 if ( roundingMode == float_round_down ) roundIncrement = 0;
686 }
687 }
688 }
689 roundBits = zSig0 & roundMask;
690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691 if ( ( 0x7FFE < zExp )
692 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
693 ) {
694 goto overflow;
695 }
696 if ( zExp <= 0 ) {
697 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
698 isTiny =
699 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
700 || ( zExp < 0 )
701 || ( zSig0 <= zSig0 + roundIncrement );
702 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
703 zExp = 0;
704 roundBits = zSig0 & roundMask;
705 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
706 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
707 zSig0 += roundIncrement;
708 if ( (sbits64) zSig0 < 0 ) zExp = 1;
709 roundIncrement = roundMask + 1;
710 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
711 roundMask |= roundIncrement;
712 }
713 zSig0 &= ~ roundMask;
714 return packFloatx80( zSign, zExp, zSig0 );
715 }
716 }
717 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
718 zSig0 += roundIncrement;
719 if ( zSig0 < roundIncrement ) {
720 ++zExp;
721 zSig0 = LIT64( 0x8000000000000000 );
722 }
723 roundIncrement = roundMask + 1;
724 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
725 roundMask |= roundIncrement;
726 }
727 zSig0 &= ~ roundMask;
728 if ( zSig0 == 0 ) zExp = 0;
729 return packFloatx80( zSign, zExp, zSig0 );
730 precision80:
731 increment = ( (sbits64) zSig1 < 0 );
732 if ( ! roundNearestEven ) {
733 if ( roundingMode == float_round_to_zero ) {
734 increment = 0;
735 }
736 else {
737 if ( zSign ) {
738 increment = ( roundingMode == float_round_down ) && zSig1;
739 }
740 else {
741 increment = ( roundingMode == float_round_up ) && zSig1;
742 }
743 }
744 }
745 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
746 if ( ( 0x7FFE < zExp )
747 || ( ( zExp == 0x7FFE )
748 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
749 && increment
750 )
751 ) {
752 roundMask = 0;
753 overflow:
754 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
755 if ( ( roundingMode == float_round_to_zero )
756 || ( zSign && ( roundingMode == float_round_up ) )
757 || ( ! zSign && ( roundingMode == float_round_down ) )
758 ) {
759 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
760 }
761 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
762 }
763 if ( zExp <= 0 ) {
764 isTiny =
765 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
766 || ( zExp < 0 )
767 || ! increment
768 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
769 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
770 zExp = 0;
771 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
772 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
773 if ( roundNearestEven ) {
774 increment = ( (sbits64) zSig1 < 0 );
775 }
776 else {
777 if ( zSign ) {
778 increment = ( roundingMode == float_round_down ) && zSig1;
779 }
780 else {
781 increment = ( roundingMode == float_round_up ) && zSig1;
782 }
783 }
784 if ( increment ) {
785 ++zSig0;
786 zSig0 &=
787 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
788 if ( (sbits64) zSig0 < 0 ) zExp = 1;
789 }
790 return packFloatx80( zSign, zExp, zSig0 );
791 }
792 }
793 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
794 if ( increment ) {
795 ++zSig0;
796 if ( zSig0 == 0 ) {
797 ++zExp;
798 zSig0 = LIT64( 0x8000000000000000 );
799 }
800 else {
801 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
802 }
803 }
804 else {
805 if ( zSig0 == 0 ) zExp = 0;
806 }
807 return packFloatx80( zSign, zExp, zSig0 );
808
809 }
810
811 /*----------------------------------------------------------------------------
812 | Takes an abstract floating-point value having sign `zSign', exponent
813 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 | and returns the proper extended double-precision floating-point value
815 | corresponding to the abstract input. This routine is just like
816 | `roundAndPackFloatx80' except that the input significand does not have to be
817 | normalized.
818 *----------------------------------------------------------------------------*/
819
820 static floatx80
821 normalizeRoundAndPackFloatx80(
822 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
823 STATUS_PARAM)
824 {
825 int8 shiftCount;
826
827 if ( zSig0 == 0 ) {
828 zSig0 = zSig1;
829 zSig1 = 0;
830 zExp -= 64;
831 }
832 shiftCount = countLeadingZeros64( zSig0 );
833 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
834 zExp -= shiftCount;
835 return
836 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
837
838 }
839
840 #endif
841
842 #ifdef FLOAT128
843
844 /*----------------------------------------------------------------------------
845 | Returns the least-significant 64 fraction bits of the quadruple-precision
846 | floating-point value `a'.
847 *----------------------------------------------------------------------------*/
848
849 INLINE bits64 extractFloat128Frac1( float128 a )
850 {
851
852 return a.low;
853
854 }
855
856 /*----------------------------------------------------------------------------
857 | Returns the most-significant 48 fraction bits of the quadruple-precision
858 | floating-point value `a'.
859 *----------------------------------------------------------------------------*/
860
861 INLINE bits64 extractFloat128Frac0( float128 a )
862 {
863
864 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
865
866 }
867
868 /*----------------------------------------------------------------------------
869 | Returns the exponent bits of the quadruple-precision floating-point value
870 | `a'.
871 *----------------------------------------------------------------------------*/
872
873 INLINE int32 extractFloat128Exp( float128 a )
874 {
875
876 return ( a.high>>48 ) & 0x7FFF;
877
878 }
879
880 /*----------------------------------------------------------------------------
881 | Returns the sign bit of the quadruple-precision floating-point value `a'.
882 *----------------------------------------------------------------------------*/
883
884 INLINE flag extractFloat128Sign( float128 a )
885 {
886
887 return a.high>>63;
888
889 }
890
891 /*----------------------------------------------------------------------------
892 | Normalizes the subnormal quadruple-precision floating-point value
893 | represented by the denormalized significand formed by the concatenation of
894 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
895 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
896 | significand are stored at the location pointed to by `zSig0Ptr', and the
897 | least significant 64 bits of the normalized significand are stored at the
898 | location pointed to by `zSig1Ptr'.
899 *----------------------------------------------------------------------------*/
900
901 static void
902 normalizeFloat128Subnormal(
903 bits64 aSig0,
904 bits64 aSig1,
905 int32 *zExpPtr,
906 bits64 *zSig0Ptr,
907 bits64 *zSig1Ptr
908 )
909 {
910 int8 shiftCount;
911
912 if ( aSig0 == 0 ) {
913 shiftCount = countLeadingZeros64( aSig1 ) - 15;
914 if ( shiftCount < 0 ) {
915 *zSig0Ptr = aSig1>>( - shiftCount );
916 *zSig1Ptr = aSig1<<( shiftCount & 63 );
917 }
918 else {
919 *zSig0Ptr = aSig1<<shiftCount;
920 *zSig1Ptr = 0;
921 }
922 *zExpPtr = - shiftCount - 63;
923 }
924 else {
925 shiftCount = countLeadingZeros64( aSig0 ) - 15;
926 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
927 *zExpPtr = 1 - shiftCount;
928 }
929
930 }
931
932 /*----------------------------------------------------------------------------
933 | Packs the sign `zSign', the exponent `zExp', and the significand formed
934 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
935 | floating-point value, returning the result. After being shifted into the
936 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
937 | added together to form the most significant 32 bits of the result. This
938 | means that any integer portion of `zSig0' will be added into the exponent.
939 | Since a properly normalized significand will have an integer portion equal
940 | to 1, the `zExp' input should be 1 less than the desired result exponent
941 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
942 | significand.
943 *----------------------------------------------------------------------------*/
944
945 INLINE float128
946 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
947 {
948 float128 z;
949
950 z.low = zSig1;
951 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
952 return z;
953
954 }
955
956 /*----------------------------------------------------------------------------
957 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
958 | and extended significand formed by the concatenation of `zSig0', `zSig1',
959 | and `zSig2', and returns the proper quadruple-precision floating-point value
960 | corresponding to the abstract input. Ordinarily, the abstract value is
961 | simply rounded and packed into the quadruple-precision format, with the
962 | inexact exception raised if the abstract input cannot be represented
963 | exactly. However, if the abstract value is too large, the overflow and
964 | inexact exceptions are raised and an infinity or maximal finite value is
965 | returned. If the abstract value is too small, the input value is rounded to
966 | a subnormal number, and the underflow and inexact exceptions are raised if
967 | the abstract input cannot be represented exactly as a subnormal quadruple-
968 | precision floating-point number.
969 | The input significand must be normalized or smaller. If the input
970 | significand is not normalized, `zExp' must be 0; in that case, the result
971 | returned is a subnormal number, and it must not require rounding. In the
972 | usual case that the input significand is normalized, `zExp' must be 1 less
973 | than the ``true'' floating-point exponent. The handling of underflow and
974 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
975 *----------------------------------------------------------------------------*/
976
977 static float128
978 roundAndPackFloat128(
979 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
980 {
981 int8 roundingMode;
982 flag roundNearestEven, increment, isTiny;
983
984 roundingMode = STATUS(float_rounding_mode);
985 roundNearestEven = ( roundingMode == float_round_nearest_even );
986 increment = ( (sbits64) zSig2 < 0 );
987 if ( ! roundNearestEven ) {
988 if ( roundingMode == float_round_to_zero ) {
989 increment = 0;
990 }
991 else {
992 if ( zSign ) {
993 increment = ( roundingMode == float_round_down ) && zSig2;
994 }
995 else {
996 increment = ( roundingMode == float_round_up ) && zSig2;
997 }
998 }
999 }
1000 if ( 0x7FFD <= (bits32) zExp ) {
1001 if ( ( 0x7FFD < zExp )
1002 || ( ( zExp == 0x7FFD )
1003 && eq128(
1004 LIT64( 0x0001FFFFFFFFFFFF ),
1005 LIT64( 0xFFFFFFFFFFFFFFFF ),
1006 zSig0,
1007 zSig1
1008 )
1009 && increment
1010 )
1011 ) {
1012 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1013 if ( ( roundingMode == float_round_to_zero )
1014 || ( zSign && ( roundingMode == float_round_up ) )
1015 || ( ! zSign && ( roundingMode == float_round_down ) )
1016 ) {
1017 return
1018 packFloat128(
1019 zSign,
1020 0x7FFE,
1021 LIT64( 0x0000FFFFFFFFFFFF ),
1022 LIT64( 0xFFFFFFFFFFFFFFFF )
1023 );
1024 }
1025 return packFloat128( zSign, 0x7FFF, 0, 0 );
1026 }
1027 if ( zExp < 0 ) {
1028 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1029 isTiny =
1030 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1031 || ( zExp < -1 )
1032 || ! increment
1033 || lt128(
1034 zSig0,
1035 zSig1,
1036 LIT64( 0x0001FFFFFFFFFFFF ),
1037 LIT64( 0xFFFFFFFFFFFFFFFF )
1038 );
1039 shift128ExtraRightJamming(
1040 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1041 zExp = 0;
1042 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1043 if ( roundNearestEven ) {
1044 increment = ( (sbits64) zSig2 < 0 );
1045 }
1046 else {
1047 if ( zSign ) {
1048 increment = ( roundingMode == float_round_down ) && zSig2;
1049 }
1050 else {
1051 increment = ( roundingMode == float_round_up ) && zSig2;
1052 }
1053 }
1054 }
1055 }
1056 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1057 if ( increment ) {
1058 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1059 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1060 }
1061 else {
1062 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1063 }
1064 return packFloat128( zSign, zExp, zSig0, zSig1 );
1065
1066 }
1067
1068 /*----------------------------------------------------------------------------
1069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1070 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1071 | returns the proper quadruple-precision floating-point value corresponding
1072 | to the abstract input. This routine is just like `roundAndPackFloat128'
1073 | except that the input significand has fewer bits and does not have to be
1074 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1075 | point exponent.
1076 *----------------------------------------------------------------------------*/
1077
1078 static float128
1079 normalizeRoundAndPackFloat128(
1080 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1081 {
1082 int8 shiftCount;
1083 bits64 zSig2;
1084
1085 if ( zSig0 == 0 ) {
1086 zSig0 = zSig1;
1087 zSig1 = 0;
1088 zExp -= 64;
1089 }
1090 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1091 if ( 0 <= shiftCount ) {
1092 zSig2 = 0;
1093 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1094 }
1095 else {
1096 shift128ExtraRightJamming(
1097 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1098 }
1099 zExp -= shiftCount;
1100 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1101
1102 }
1103
1104 #endif
1105
1106 /*----------------------------------------------------------------------------
1107 | Returns the result of converting the 32-bit two's complement integer `a'
1108 | to the single-precision floating-point format. The conversion is performed
1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 *----------------------------------------------------------------------------*/
1111
1112 float32 int32_to_float32( int32 a STATUS_PARAM )
1113 {
1114 flag zSign;
1115
1116 if ( a == 0 ) return float32_zero;
1117 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1118 zSign = ( a < 0 );
1119 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1120
1121 }
1122
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the double-precision floating-point format. The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1128
1129 float64 int32_to_float64( int32 a STATUS_PARAM )
1130 {
1131 flag zSign;
1132 uint32 absA;
1133 int8 shiftCount;
1134 bits64 zSig;
1135
1136 if ( a == 0 ) return float64_zero;
1137 zSign = ( a < 0 );
1138 absA = zSign ? - a : a;
1139 shiftCount = countLeadingZeros32( absA ) + 21;
1140 zSig = absA;
1141 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1142
1143 }
1144
1145 #ifdef FLOATX80
1146
1147 /*----------------------------------------------------------------------------
1148 | Returns the result of converting the 32-bit two's complement integer `a'
1149 | to the extended double-precision floating-point format. The conversion
1150 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1151 | Arithmetic.
1152 *----------------------------------------------------------------------------*/
1153
1154 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1155 {
1156 flag zSign;
1157 uint32 absA;
1158 int8 shiftCount;
1159 bits64 zSig;
1160
1161 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1162 zSign = ( a < 0 );
1163 absA = zSign ? - a : a;
1164 shiftCount = countLeadingZeros32( absA ) + 32;
1165 zSig = absA;
1166 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1167
1168 }
1169
1170 #endif
1171
1172 #ifdef FLOAT128
1173
1174 /*----------------------------------------------------------------------------
1175 | Returns the result of converting the 32-bit two's complement integer `a' to
1176 | the quadruple-precision floating-point format. The conversion is performed
1177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1178 *----------------------------------------------------------------------------*/
1179
1180 float128 int32_to_float128( int32 a STATUS_PARAM )
1181 {
1182 flag zSign;
1183 uint32 absA;
1184 int8 shiftCount;
1185 bits64 zSig0;
1186
1187 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1188 zSign = ( a < 0 );
1189 absA = zSign ? - a : a;
1190 shiftCount = countLeadingZeros32( absA ) + 17;
1191 zSig0 = absA;
1192 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1193
1194 }
1195
1196 #endif
1197
1198 /*----------------------------------------------------------------------------
1199 | Returns the result of converting the 64-bit two's complement integer `a'
1200 | to the single-precision floating-point format. The conversion is performed
1201 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1202 *----------------------------------------------------------------------------*/
1203
1204 float32 int64_to_float32( int64 a STATUS_PARAM )
1205 {
1206 flag zSign;
1207 uint64 absA;
1208 int8 shiftCount;
1209
1210 if ( a == 0 ) return float32_zero;
1211 zSign = ( a < 0 );
1212 absA = zSign ? - a : a;
1213 shiftCount = countLeadingZeros64( absA ) - 40;
1214 if ( 0 <= shiftCount ) {
1215 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1216 }
1217 else {
1218 shiftCount += 7;
1219 if ( shiftCount < 0 ) {
1220 shift64RightJamming( absA, - shiftCount, &absA );
1221 }
1222 else {
1223 absA <<= shiftCount;
1224 }
1225 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1226 }
1227
1228 }
1229
1230 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1231 {
1232 int8 shiftCount;
1233
1234 if ( a == 0 ) return float32_zero;
1235 shiftCount = countLeadingZeros64( a ) - 40;
1236 if ( 0 <= shiftCount ) {
1237 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1238 }
1239 else {
1240 shiftCount += 7;
1241 if ( shiftCount < 0 ) {
1242 shift64RightJamming( a, - shiftCount, &a );
1243 }
1244 else {
1245 a <<= shiftCount;
1246 }
1247 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1248 }
1249 }
1250
1251 /*----------------------------------------------------------------------------
1252 | Returns the result of converting the 64-bit two's complement integer `a'
1253 | to the double-precision floating-point format. The conversion is performed
1254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 *----------------------------------------------------------------------------*/
1256
1257 float64 int64_to_float64( int64 a STATUS_PARAM )
1258 {
1259 flag zSign;
1260
1261 if ( a == 0 ) return float64_zero;
1262 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1263 return packFloat64( 1, 0x43E, 0 );
1264 }
1265 zSign = ( a < 0 );
1266 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1267
1268 }
1269
1270 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1271 {
1272 if ( a == 0 ) return float64_zero;
1273 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1274
1275 }
1276
1277 #ifdef FLOATX80
1278
1279 /*----------------------------------------------------------------------------
1280 | Returns the result of converting the 64-bit two's complement integer `a'
1281 | to the extended double-precision floating-point format. The conversion
1282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1283 | Arithmetic.
1284 *----------------------------------------------------------------------------*/
1285
1286 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1287 {
1288 flag zSign;
1289 uint64 absA;
1290 int8 shiftCount;
1291
1292 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1293 zSign = ( a < 0 );
1294 absA = zSign ? - a : a;
1295 shiftCount = countLeadingZeros64( absA );
1296 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1297
1298 }
1299
1300 #endif
1301
1302 #ifdef FLOAT128
1303
1304 /*----------------------------------------------------------------------------
1305 | Returns the result of converting the 64-bit two's complement integer `a' to
1306 | the quadruple-precision floating-point format. The conversion is performed
1307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1308 *----------------------------------------------------------------------------*/
1309
1310 float128 int64_to_float128( int64 a STATUS_PARAM )
1311 {
1312 flag zSign;
1313 uint64 absA;
1314 int8 shiftCount;
1315 int32 zExp;
1316 bits64 zSig0, zSig1;
1317
1318 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1319 zSign = ( a < 0 );
1320 absA = zSign ? - a : a;
1321 shiftCount = countLeadingZeros64( absA ) + 49;
1322 zExp = 0x406E - shiftCount;
1323 if ( 64 <= shiftCount ) {
1324 zSig1 = 0;
1325 zSig0 = absA;
1326 shiftCount -= 64;
1327 }
1328 else {
1329 zSig1 = absA;
1330 zSig0 = 0;
1331 }
1332 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1333 return packFloat128( zSign, zExp, zSig0, zSig1 );
1334
1335 }
1336
1337 #endif
1338
1339 /*----------------------------------------------------------------------------
1340 | Returns the result of converting the single-precision floating-point value
1341 | `a' to the 32-bit two's complement integer format. The conversion is
1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 | Arithmetic---which means in particular that the conversion is rounded
1344 | according to the current rounding mode. If `a' is a NaN, the largest
1345 | positive integer is returned. Otherwise, if the conversion overflows, the
1346 | largest integer with the same sign as `a' is returned.
1347 *----------------------------------------------------------------------------*/
1348
1349 int32 float32_to_int32( float32 a STATUS_PARAM )
1350 {
1351 flag aSign;
1352 int16 aExp, shiftCount;
1353 bits32 aSig;
1354 bits64 aSig64;
1355
1356 a = float32_squash_input_denormal(a STATUS_VAR);
1357 aSig = extractFloat32Frac( a );
1358 aExp = extractFloat32Exp( a );
1359 aSign = extractFloat32Sign( a );
1360 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1361 if ( aExp ) aSig |= 0x00800000;
1362 shiftCount = 0xAF - aExp;
1363 aSig64 = aSig;
1364 aSig64 <<= 32;
1365 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1366 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1367
1368 }
1369
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the single-precision floating-point value
1372 | `a' to the 32-bit two's complement integer format. The conversion is
1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic, except that the conversion is always rounded toward zero.
1375 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1376 | the conversion overflows, the largest integer with the same sign as `a' is
1377 | returned.
1378 *----------------------------------------------------------------------------*/
1379
1380 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1381 {
1382 flag aSign;
1383 int16 aExp, shiftCount;
1384 bits32 aSig;
1385 int32 z;
1386 a = float32_squash_input_denormal(a STATUS_VAR);
1387
1388 aSig = extractFloat32Frac( a );
1389 aExp = extractFloat32Exp( a );
1390 aSign = extractFloat32Sign( a );
1391 shiftCount = aExp - 0x9E;
1392 if ( 0 <= shiftCount ) {
1393 if ( float32_val(a) != 0xCF000000 ) {
1394 float_raise( float_flag_invalid STATUS_VAR);
1395 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1396 }
1397 return (sbits32) 0x80000000;
1398 }
1399 else if ( aExp <= 0x7E ) {
1400 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1401 return 0;
1402 }
1403 aSig = ( aSig | 0x00800000 )<<8;
1404 z = aSig>>( - shiftCount );
1405 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1406 STATUS(float_exception_flags) |= float_flag_inexact;
1407 }
1408 if ( aSign ) z = - z;
1409 return z;
1410
1411 }
1412
1413 /*----------------------------------------------------------------------------
1414 | Returns the result of converting the single-precision floating-point value
1415 | `a' to the 16-bit two's complement integer format. The conversion is
1416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 | Arithmetic, except that the conversion is always rounded toward zero.
1418 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1419 | the conversion overflows, the largest integer with the same sign as `a' is
1420 | returned.
1421 *----------------------------------------------------------------------------*/
1422
1423 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1424 {
1425 flag aSign;
1426 int16 aExp, shiftCount;
1427 bits32 aSig;
1428 int32 z;
1429
1430 aSig = extractFloat32Frac( a );
1431 aExp = extractFloat32Exp( a );
1432 aSign = extractFloat32Sign( a );
1433 shiftCount = aExp - 0x8E;
1434 if ( 0 <= shiftCount ) {
1435 if ( float32_val(a) != 0xC7000000 ) {
1436 float_raise( float_flag_invalid STATUS_VAR);
1437 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1438 return 0x7FFF;
1439 }
1440 }
1441 return (sbits32) 0xffff8000;
1442 }
1443 else if ( aExp <= 0x7E ) {
1444 if ( aExp | aSig ) {
1445 STATUS(float_exception_flags) |= float_flag_inexact;
1446 }
1447 return 0;
1448 }
1449 shiftCount -= 0x10;
1450 aSig = ( aSig | 0x00800000 )<<8;
1451 z = aSig>>( - shiftCount );
1452 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1453 STATUS(float_exception_flags) |= float_flag_inexact;
1454 }
1455 if ( aSign ) {
1456 z = - z;
1457 }
1458 return z;
1459
1460 }
1461
1462 /*----------------------------------------------------------------------------
1463 | Returns the result of converting the single-precision floating-point value
1464 | `a' to the 64-bit two's complement integer format. The conversion is
1465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 | Arithmetic---which means in particular that the conversion is rounded
1467 | according to the current rounding mode. If `a' is a NaN, the largest
1468 | positive integer is returned. Otherwise, if the conversion overflows, the
1469 | largest integer with the same sign as `a' is returned.
1470 *----------------------------------------------------------------------------*/
1471
1472 int64 float32_to_int64( float32 a STATUS_PARAM )
1473 {
1474 flag aSign;
1475 int16 aExp, shiftCount;
1476 bits32 aSig;
1477 bits64 aSig64, aSigExtra;
1478 a = float32_squash_input_denormal(a STATUS_VAR);
1479
1480 aSig = extractFloat32Frac( a );
1481 aExp = extractFloat32Exp( a );
1482 aSign = extractFloat32Sign( a );
1483 shiftCount = 0xBE - aExp;
1484 if ( shiftCount < 0 ) {
1485 float_raise( float_flag_invalid STATUS_VAR);
1486 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1487 return LIT64( 0x7FFFFFFFFFFFFFFF );
1488 }
1489 return (sbits64) LIT64( 0x8000000000000000 );
1490 }
1491 if ( aExp ) aSig |= 0x00800000;
1492 aSig64 = aSig;
1493 aSig64 <<= 40;
1494 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1495 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1496
1497 }
1498
1499 /*----------------------------------------------------------------------------
1500 | Returns the result of converting the single-precision floating-point value
1501 | `a' to the 64-bit two's complement integer format. The conversion is
1502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1503 | Arithmetic, except that the conversion is always rounded toward zero. If
1504 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1505 | conversion overflows, the largest integer with the same sign as `a' is
1506 | returned.
1507 *----------------------------------------------------------------------------*/
1508
1509 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1510 {
1511 flag aSign;
1512 int16 aExp, shiftCount;
1513 bits32 aSig;
1514 bits64 aSig64;
1515 int64 z;
1516 a = float32_squash_input_denormal(a STATUS_VAR);
1517
1518 aSig = extractFloat32Frac( a );
1519 aExp = extractFloat32Exp( a );
1520 aSign = extractFloat32Sign( a );
1521 shiftCount = aExp - 0xBE;
1522 if ( 0 <= shiftCount ) {
1523 if ( float32_val(a) != 0xDF000000 ) {
1524 float_raise( float_flag_invalid STATUS_VAR);
1525 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1526 return LIT64( 0x7FFFFFFFFFFFFFFF );
1527 }
1528 }
1529 return (sbits64) LIT64( 0x8000000000000000 );
1530 }
1531 else if ( aExp <= 0x7E ) {
1532 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1533 return 0;
1534 }
1535 aSig64 = aSig | 0x00800000;
1536 aSig64 <<= 40;
1537 z = aSig64>>( - shiftCount );
1538 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1539 STATUS(float_exception_flags) |= float_flag_inexact;
1540 }
1541 if ( aSign ) z = - z;
1542 return z;
1543
1544 }
1545
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the double-precision floating-point format. The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic.
1551 *----------------------------------------------------------------------------*/
1552
1553 float64 float32_to_float64( float32 a STATUS_PARAM )
1554 {
1555 flag aSign;
1556 int16 aExp;
1557 bits32 aSig;
1558 a = float32_squash_input_denormal(a STATUS_VAR);
1559
1560 aSig = extractFloat32Frac( a );
1561 aExp = extractFloat32Exp( a );
1562 aSign = extractFloat32Sign( a );
1563 if ( aExp == 0xFF ) {
1564 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1565 return packFloat64( aSign, 0x7FF, 0 );
1566 }
1567 if ( aExp == 0 ) {
1568 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1569 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1570 --aExp;
1571 }
1572 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1573
1574 }
1575
1576 #ifdef FLOATX80
1577
1578 /*----------------------------------------------------------------------------
1579 | Returns the result of converting the single-precision floating-point value
1580 | `a' to the extended double-precision floating-point format. The conversion
1581 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1582 | Arithmetic.
1583 *----------------------------------------------------------------------------*/
1584
1585 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1586 {
1587 flag aSign;
1588 int16 aExp;
1589 bits32 aSig;
1590
1591 a = float32_squash_input_denormal(a STATUS_VAR);
1592 aSig = extractFloat32Frac( a );
1593 aExp = extractFloat32Exp( a );
1594 aSign = extractFloat32Sign( a );
1595 if ( aExp == 0xFF ) {
1596 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1597 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1598 }
1599 if ( aExp == 0 ) {
1600 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1601 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1602 }
1603 aSig |= 0x00800000;
1604 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1605
1606 }
1607
1608 #endif
1609
1610 #ifdef FLOAT128
1611
1612 /*----------------------------------------------------------------------------
1613 | Returns the result of converting the single-precision floating-point value
1614 | `a' to the double-precision floating-point format. The conversion is
1615 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1616 | Arithmetic.
1617 *----------------------------------------------------------------------------*/
1618
1619 float128 float32_to_float128( float32 a STATUS_PARAM )
1620 {
1621 flag aSign;
1622 int16 aExp;
1623 bits32 aSig;
1624
1625 a = float32_squash_input_denormal(a STATUS_VAR);
1626 aSig = extractFloat32Frac( a );
1627 aExp = extractFloat32Exp( a );
1628 aSign = extractFloat32Sign( a );
1629 if ( aExp == 0xFF ) {
1630 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1631 return packFloat128( aSign, 0x7FFF, 0, 0 );
1632 }
1633 if ( aExp == 0 ) {
1634 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1635 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1636 --aExp;
1637 }
1638 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1639
1640 }
1641
1642 #endif
1643
1644 /*----------------------------------------------------------------------------
1645 | Rounds the single-precision floating-point value `a' to an integer, and
1646 | returns the result as a single-precision floating-point value. The
1647 | operation is performed according to the IEC/IEEE Standard for Binary
1648 | Floating-Point Arithmetic.
1649 *----------------------------------------------------------------------------*/
1650
1651 float32 float32_round_to_int( float32 a STATUS_PARAM)
1652 {
1653 flag aSign;
1654 int16 aExp;
1655 bits32 lastBitMask, roundBitsMask;
1656 int8 roundingMode;
1657 bits32 z;
1658 a = float32_squash_input_denormal(a STATUS_VAR);
1659
1660 aExp = extractFloat32Exp( a );
1661 if ( 0x96 <= aExp ) {
1662 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1663 return propagateFloat32NaN( a, a STATUS_VAR );
1664 }
1665 return a;
1666 }
1667 if ( aExp <= 0x7E ) {
1668 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1669 STATUS(float_exception_flags) |= float_flag_inexact;
1670 aSign = extractFloat32Sign( a );
1671 switch ( STATUS(float_rounding_mode) ) {
1672 case float_round_nearest_even:
1673 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1674 return packFloat32( aSign, 0x7F, 0 );
1675 }
1676 break;
1677 case float_round_down:
1678 return make_float32(aSign ? 0xBF800000 : 0);
1679 case float_round_up:
1680 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1681 }
1682 return packFloat32( aSign, 0, 0 );
1683 }
1684 lastBitMask = 1;
1685 lastBitMask <<= 0x96 - aExp;
1686 roundBitsMask = lastBitMask - 1;
1687 z = float32_val(a);
1688 roundingMode = STATUS(float_rounding_mode);
1689 if ( roundingMode == float_round_nearest_even ) {
1690 z += lastBitMask>>1;
1691 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1692 }
1693 else if ( roundingMode != float_round_to_zero ) {
1694 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1695 z += roundBitsMask;
1696 }
1697 }
1698 z &= ~ roundBitsMask;
1699 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1700 return make_float32(z);
1701
1702 }
1703
1704 /*----------------------------------------------------------------------------
1705 | Returns the result of adding the absolute values of the single-precision
1706 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1707 | before being returned. `zSign' is ignored if the result is a NaN.
1708 | The addition is performed according to the IEC/IEEE Standard for Binary
1709 | Floating-Point Arithmetic.
1710 *----------------------------------------------------------------------------*/
1711
1712 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1713 {
1714 int16 aExp, bExp, zExp;
1715 bits32 aSig, bSig, zSig;
1716 int16 expDiff;
1717
1718 aSig = extractFloat32Frac( a );
1719 aExp = extractFloat32Exp( a );
1720 bSig = extractFloat32Frac( b );
1721 bExp = extractFloat32Exp( b );
1722 expDiff = aExp - bExp;
1723 aSig <<= 6;
1724 bSig <<= 6;
1725 if ( 0 < expDiff ) {
1726 if ( aExp == 0xFF ) {
1727 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1728 return a;
1729 }
1730 if ( bExp == 0 ) {
1731 --expDiff;
1732 }
1733 else {
1734 bSig |= 0x20000000;
1735 }
1736 shift32RightJamming( bSig, expDiff, &bSig );
1737 zExp = aExp;
1738 }
1739 else if ( expDiff < 0 ) {
1740 if ( bExp == 0xFF ) {
1741 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1742 return packFloat32( zSign, 0xFF, 0 );
1743 }
1744 if ( aExp == 0 ) {
1745 ++expDiff;
1746 }
1747 else {
1748 aSig |= 0x20000000;
1749 }
1750 shift32RightJamming( aSig, - expDiff, &aSig );
1751 zExp = bExp;
1752 }
1753 else {
1754 if ( aExp == 0xFF ) {
1755 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1756 return a;
1757 }
1758 if ( aExp == 0 ) {
1759 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1760 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1761 }
1762 zSig = 0x40000000 + aSig + bSig;
1763 zExp = aExp;
1764 goto roundAndPack;
1765 }
1766 aSig |= 0x20000000;
1767 zSig = ( aSig + bSig )<<1;
1768 --zExp;
1769 if ( (sbits32) zSig < 0 ) {
1770 zSig = aSig + bSig;
1771 ++zExp;
1772 }
1773 roundAndPack:
1774 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1775
1776 }
1777
1778 /*----------------------------------------------------------------------------
1779 | Returns the result of subtracting the absolute values of the single-
1780 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1781 | difference is negated before being returned. `zSign' is ignored if the
1782 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1783 | Standard for Binary Floating-Point Arithmetic.
1784 *----------------------------------------------------------------------------*/
1785
1786 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1787 {
1788 int16 aExp, bExp, zExp;
1789 bits32 aSig, bSig, zSig;
1790 int16 expDiff;
1791
1792 aSig = extractFloat32Frac( a );
1793 aExp = extractFloat32Exp( a );
1794 bSig = extractFloat32Frac( b );
1795 bExp = extractFloat32Exp( b );
1796 expDiff = aExp - bExp;
1797 aSig <<= 7;
1798 bSig <<= 7;
1799 if ( 0 < expDiff ) goto aExpBigger;
1800 if ( expDiff < 0 ) goto bExpBigger;
1801 if ( aExp == 0xFF ) {
1802 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1803 float_raise( float_flag_invalid STATUS_VAR);
1804 return float32_default_nan;
1805 }
1806 if ( aExp == 0 ) {
1807 aExp = 1;
1808 bExp = 1;
1809 }
1810 if ( bSig < aSig ) goto aBigger;
1811 if ( aSig < bSig ) goto bBigger;
1812 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1813 bExpBigger:
1814 if ( bExp == 0xFF ) {
1815 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816 return packFloat32( zSign ^ 1, 0xFF, 0 );
1817 }
1818 if ( aExp == 0 ) {
1819 ++expDiff;
1820 }
1821 else {
1822 aSig |= 0x40000000;
1823 }
1824 shift32RightJamming( aSig, - expDiff, &aSig );
1825 bSig |= 0x40000000;
1826 bBigger:
1827 zSig = bSig - aSig;
1828 zExp = bExp;
1829 zSign ^= 1;
1830 goto normalizeRoundAndPack;
1831 aExpBigger:
1832 if ( aExp == 0xFF ) {
1833 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834 return a;
1835 }
1836 if ( bExp == 0 ) {
1837 --expDiff;
1838 }
1839 else {
1840 bSig |= 0x40000000;
1841 }
1842 shift32RightJamming( bSig, expDiff, &bSig );
1843 aSig |= 0x40000000;
1844 aBigger:
1845 zSig = aSig - bSig;
1846 zExp = aExp;
1847 normalizeRoundAndPack:
1848 --zExp;
1849 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1850
1851 }
1852
1853 /*----------------------------------------------------------------------------
1854 | Returns the result of adding the single-precision floating-point values `a'
1855 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1856 | Binary Floating-Point Arithmetic.
1857 *----------------------------------------------------------------------------*/
1858
1859 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1860 {
1861 flag aSign, bSign;
1862 a = float32_squash_input_denormal(a STATUS_VAR);
1863 b = float32_squash_input_denormal(b STATUS_VAR);
1864
1865 aSign = extractFloat32Sign( a );
1866 bSign = extractFloat32Sign( b );
1867 if ( aSign == bSign ) {
1868 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1869 }
1870 else {
1871 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1872 }
1873
1874 }
1875
1876 /*----------------------------------------------------------------------------
1877 | Returns the result of subtracting the single-precision floating-point values
1878 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1879 | for Binary Floating-Point Arithmetic.
1880 *----------------------------------------------------------------------------*/
1881
1882 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1883 {
1884 flag aSign, bSign;
1885 a = float32_squash_input_denormal(a STATUS_VAR);
1886 b = float32_squash_input_denormal(b STATUS_VAR);
1887
1888 aSign = extractFloat32Sign( a );
1889 bSign = extractFloat32Sign( b );
1890 if ( aSign == bSign ) {
1891 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1892 }
1893 else {
1894 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1895 }
1896
1897 }
1898
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of multiplying the single-precision floating-point values
1901 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1902 | for Binary Floating-Point Arithmetic.
1903 *----------------------------------------------------------------------------*/
1904
1905 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1906 {
1907 flag aSign, bSign, zSign;
1908 int16 aExp, bExp, zExp;
1909 bits32 aSig, bSig;
1910 bits64 zSig64;
1911 bits32 zSig;
1912
1913 a = float32_squash_input_denormal(a STATUS_VAR);
1914 b = float32_squash_input_denormal(b STATUS_VAR);
1915
1916 aSig = extractFloat32Frac( a );
1917 aExp = extractFloat32Exp( a );
1918 aSign = extractFloat32Sign( a );
1919 bSig = extractFloat32Frac( b );
1920 bExp = extractFloat32Exp( b );
1921 bSign = extractFloat32Sign( b );
1922 zSign = aSign ^ bSign;
1923 if ( aExp == 0xFF ) {
1924 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1925 return propagateFloat32NaN( a, b STATUS_VAR );
1926 }
1927 if ( ( bExp | bSig ) == 0 ) {
1928 float_raise( float_flag_invalid STATUS_VAR);
1929 return float32_default_nan;
1930 }
1931 return packFloat32( zSign, 0xFF, 0 );
1932 }
1933 if ( bExp == 0xFF ) {
1934 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935 if ( ( aExp | aSig ) == 0 ) {
1936 float_raise( float_flag_invalid STATUS_VAR);
1937 return float32_default_nan;
1938 }
1939 return packFloat32( zSign, 0xFF, 0 );
1940 }
1941 if ( aExp == 0 ) {
1942 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1943 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1944 }
1945 if ( bExp == 0 ) {
1946 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1947 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1948 }
1949 zExp = aExp + bExp - 0x7F;
1950 aSig = ( aSig | 0x00800000 )<<7;
1951 bSig = ( bSig | 0x00800000 )<<8;
1952 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1953 zSig = zSig64;
1954 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1955 zSig <<= 1;
1956 --zExp;
1957 }
1958 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1959
1960 }
1961
1962 /*----------------------------------------------------------------------------
1963 | Returns the result of dividing the single-precision floating-point value `a'
1964 | by the corresponding value `b'. The operation is performed according to the
1965 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1966 *----------------------------------------------------------------------------*/
1967
1968 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1969 {
1970 flag aSign, bSign, zSign;
1971 int16 aExp, bExp, zExp;
1972 bits32 aSig, bSig, zSig;
1973 a = float32_squash_input_denormal(a STATUS_VAR);
1974 b = float32_squash_input_denormal(b STATUS_VAR);
1975
1976 aSig = extractFloat32Frac( a );
1977 aExp = extractFloat32Exp( a );
1978 aSign = extractFloat32Sign( a );
1979 bSig = extractFloat32Frac( b );
1980 bExp = extractFloat32Exp( b );
1981 bSign = extractFloat32Sign( b );
1982 zSign = aSign ^ bSign;
1983 if ( aExp == 0xFF ) {
1984 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1985 if ( bExp == 0xFF ) {
1986 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987 float_raise( float_flag_invalid STATUS_VAR);
1988 return float32_default_nan;
1989 }
1990 return packFloat32( zSign, 0xFF, 0 );
1991 }
1992 if ( bExp == 0xFF ) {
1993 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1994 return packFloat32( zSign, 0, 0 );
1995 }
1996 if ( bExp == 0 ) {
1997 if ( bSig == 0 ) {
1998 if ( ( aExp | aSig ) == 0 ) {
1999 float_raise( float_flag_invalid STATUS_VAR);
2000 return float32_default_nan;
2001 }
2002 float_raise( float_flag_divbyzero STATUS_VAR);
2003 return packFloat32( zSign, 0xFF, 0 );
2004 }
2005 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2006 }
2007 if ( aExp == 0 ) {
2008 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2009 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2010 }
2011 zExp = aExp - bExp + 0x7D;
2012 aSig = ( aSig | 0x00800000 )<<7;
2013 bSig = ( bSig | 0x00800000 )<<8;
2014 if ( bSig <= ( aSig + aSig ) ) {
2015 aSig >>= 1;
2016 ++zExp;
2017 }
2018 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2019 if ( ( zSig & 0x3F ) == 0 ) {
2020 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2021 }
2022 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2023
2024 }
2025
2026 /*----------------------------------------------------------------------------
2027 | Returns the remainder of the single-precision floating-point value `a'
2028 | with respect to the corresponding value `b'. The operation is performed
2029 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2030 *----------------------------------------------------------------------------*/
2031
2032 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2033 {
2034 flag aSign, zSign;
2035 int16 aExp, bExp, expDiff;
2036 bits32 aSig, bSig;
2037 bits32 q;
2038 bits64 aSig64, bSig64, q64;
2039 bits32 alternateASig;
2040 sbits32 sigMean;
2041 a = float32_squash_input_denormal(a STATUS_VAR);
2042 b = float32_squash_input_denormal(b STATUS_VAR);
2043
2044 aSig = extractFloat32Frac( a );
2045 aExp = extractFloat32Exp( a );
2046 aSign = extractFloat32Sign( a );
2047 bSig = extractFloat32Frac( b );
2048 bExp = extractFloat32Exp( b );
2049 if ( aExp == 0xFF ) {
2050 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2051 return propagateFloat32NaN( a, b STATUS_VAR );
2052 }
2053 float_raise( float_flag_invalid STATUS_VAR);
2054 return float32_default_nan;
2055 }
2056 if ( bExp == 0xFF ) {
2057 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2058 return a;
2059 }
2060 if ( bExp == 0 ) {
2061 if ( bSig == 0 ) {
2062 float_raise( float_flag_invalid STATUS_VAR);
2063 return float32_default_nan;
2064 }
2065 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2066 }
2067 if ( aExp == 0 ) {
2068 if ( aSig == 0 ) return a;
2069 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2070 }
2071 expDiff = aExp - bExp;
2072 aSig |= 0x00800000;
2073 bSig |= 0x00800000;
2074 if ( expDiff < 32 ) {
2075 aSig <<= 8;
2076 bSig <<= 8;
2077 if ( expDiff < 0 ) {
2078 if ( expDiff < -1 ) return a;
2079 aSig >>= 1;
2080 }
2081 q = ( bSig <= aSig );
2082 if ( q ) aSig -= bSig;
2083 if ( 0 < expDiff ) {
2084 q = ( ( (bits64) aSig )<<32 ) / bSig;
2085 q >>= 32 - expDiff;
2086 bSig >>= 2;
2087 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2088 }
2089 else {
2090 aSig >>= 2;
2091 bSig >>= 2;
2092 }
2093 }
2094 else {
2095 if ( bSig <= aSig ) aSig -= bSig;
2096 aSig64 = ( (bits64) aSig )<<40;
2097 bSig64 = ( (bits64) bSig )<<40;
2098 expDiff -= 64;
2099 while ( 0 < expDiff ) {
2100 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2101 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2102 aSig64 = - ( ( bSig * q64 )<<38 );
2103 expDiff -= 62;
2104 }
2105 expDiff += 64;
2106 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2107 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2108 q = q64>>( 64 - expDiff );
2109 bSig <<= 6;
2110 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2111 }
2112 do {
2113 alternateASig = aSig;
2114 ++q;
2115 aSig -= bSig;
2116 } while ( 0 <= (sbits32) aSig );
2117 sigMean = aSig + alternateASig;
2118 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2119 aSig = alternateASig;
2120 }
2121 zSign = ( (sbits32) aSig < 0 );
2122 if ( zSign ) aSig = - aSig;
2123 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2124
2125 }
2126
2127 /*----------------------------------------------------------------------------
2128 | Returns the square root of the single-precision floating-point value `a'.
2129 | The operation is performed according to the IEC/IEEE Standard for Binary
2130 | Floating-Point Arithmetic.
2131 *----------------------------------------------------------------------------*/
2132
2133 float32 float32_sqrt( float32 a STATUS_PARAM )
2134 {
2135 flag aSign;
2136 int16 aExp, zExp;
2137 bits32 aSig, zSig;
2138 bits64 rem, term;
2139 a = float32_squash_input_denormal(a STATUS_VAR);
2140
2141 aSig = extractFloat32Frac( a );
2142 aExp = extractFloat32Exp( a );
2143 aSign = extractFloat32Sign( a );
2144 if ( aExp == 0xFF ) {
2145 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2146 if ( ! aSign ) return a;
2147 float_raise( float_flag_invalid STATUS_VAR);
2148 return float32_default_nan;
2149 }
2150 if ( aSign ) {
2151 if ( ( aExp | aSig ) == 0 ) return a;
2152 float_raise( float_flag_invalid STATUS_VAR);
2153 return float32_default_nan;
2154 }
2155 if ( aExp == 0 ) {
2156 if ( aSig == 0 ) return float32_zero;
2157 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2158 }
2159 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2160 aSig = ( aSig | 0x00800000 )<<8;
2161 zSig = estimateSqrt32( aExp, aSig ) + 2;
2162 if ( ( zSig & 0x7F ) <= 5 ) {
2163 if ( zSig < 2 ) {
2164 zSig = 0x7FFFFFFF;
2165 goto roundAndPack;
2166 }
2167 aSig >>= aExp & 1;
2168 term = ( (bits64) zSig ) * zSig;
2169 rem = ( ( (bits64) aSig )<<32 ) - term;
2170 while ( (sbits64) rem < 0 ) {
2171 --zSig;
2172 rem += ( ( (bits64) zSig )<<1 ) | 1;
2173 }
2174 zSig |= ( rem != 0 );
2175 }
2176 shift32RightJamming( zSig, 1, &zSig );
2177 roundAndPack:
2178 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2179
2180 }
2181
2182 /*----------------------------------------------------------------------------
2183 | Returns the binary exponential of the single-precision floating-point value
2184 | `a'. The operation is performed according to the IEC/IEEE Standard for
2185 | Binary Floating-Point Arithmetic.
2186 |
2187 | Uses the following identities:
2188 |
2189 | 1. -------------------------------------------------------------------------
2190 | x x*ln(2)
2191 | 2 = e
2192 |
2193 | 2. -------------------------------------------------------------------------
2194 | 2 3 4 5 n
2195 | x x x x x x x
2196 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2197 | 1! 2! 3! 4! 5! n!
2198 *----------------------------------------------------------------------------*/
2199
2200 static const float64 float32_exp2_coefficients[15] =
2201 {
2202 make_float64( 0x3ff0000000000000ll ), /* 1 */
2203 make_float64( 0x3fe0000000000000ll ), /* 2 */
2204 make_float64( 0x3fc5555555555555ll ), /* 3 */
2205 make_float64( 0x3fa5555555555555ll ), /* 4 */
2206 make_float64( 0x3f81111111111111ll ), /* 5 */
2207 make_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2208 make_float64( 0x3f2a01a01a01a01all ), /* 7 */
2209 make_float64( 0x3efa01a01a01a01all ), /* 8 */
2210 make_float64( 0x3ec71de3a556c734ll ), /* 9 */
2211 make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2212 make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2213 make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2214 make_float64( 0x3de6124613a86d09ll ), /* 13 */
2215 make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2216 make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2217 };
2218
2219 float32 float32_exp2( float32 a STATUS_PARAM )
2220 {
2221 flag aSign;
2222 int16 aExp;
2223 bits32 aSig;
2224 float64 r, x, xn;
2225 int i;
2226 a = float32_squash_input_denormal(a STATUS_VAR);
2227
2228 aSig = extractFloat32Frac( a );
2229 aExp = extractFloat32Exp( a );
2230 aSign = extractFloat32Sign( a );
2231
2232 if ( aExp == 0xFF) {
2233 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2234 return (aSign) ? float32_zero : a;
2235 }
2236 if (aExp == 0) {
2237 if (aSig == 0) return float32_one;
2238 }
2239
2240 float_raise( float_flag_inexact STATUS_VAR);
2241
2242 /* ******************************* */
2243 /* using float64 for approximation */
2244 /* ******************************* */
2245 x = float32_to_float64(a STATUS_VAR);
2246 x = float64_mul(x, float64_ln2 STATUS_VAR);
2247
2248 xn = x;
2249 r = float64_one;
2250 for (i = 0 ; i < 15 ; i++) {
2251 float64 f;
2252
2253 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2254 r = float64_add(r, f STATUS_VAR);
2255
2256 xn = float64_mul(xn, x STATUS_VAR);
2257 }
2258
2259 return float64_to_float32(r, status);
2260 }
2261
2262 /*----------------------------------------------------------------------------
2263 | Returns the binary log of the single-precision floating-point value `a'.
2264 | The operation is performed according to the IEC/IEEE Standard for Binary
2265 | Floating-Point Arithmetic.
2266 *----------------------------------------------------------------------------*/
2267 float32 float32_log2( float32 a STATUS_PARAM )
2268 {
2269 flag aSign, zSign;
2270 int16 aExp;
2271 bits32 aSig, zSig, i;
2272
2273 a = float32_squash_input_denormal(a STATUS_VAR);
2274 aSig = extractFloat32Frac( a );
2275 aExp = extractFloat32Exp( a );
2276 aSign = extractFloat32Sign( a );
2277
2278 if ( aExp == 0 ) {
2279 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2280 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2281 }
2282 if ( aSign ) {
2283 float_raise( float_flag_invalid STATUS_VAR);
2284 return float32_default_nan;
2285 }
2286 if ( aExp == 0xFF ) {
2287 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2288 return a;
2289 }
2290
2291 aExp -= 0x7F;
2292 aSig |= 0x00800000;
2293 zSign = aExp < 0;
2294 zSig = aExp << 23;
2295
2296 for (i = 1 << 22; i > 0; i >>= 1) {
2297 aSig = ( (bits64)aSig * aSig ) >> 23;
2298 if ( aSig & 0x01000000 ) {
2299 aSig >>= 1;
2300 zSig |= i;
2301 }
2302 }
2303
2304 if ( zSign )
2305 zSig = -zSig;
2306
2307 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2308 }
2309
2310 /*----------------------------------------------------------------------------
2311 | Returns 1 if the single-precision floating-point value `a' is equal to
2312 | the corresponding value `b', and 0 otherwise. The comparison is performed
2313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2314 *----------------------------------------------------------------------------*/
2315
2316 int float32_eq( float32 a, float32 b STATUS_PARAM )
2317 {
2318 a = float32_squash_input_denormal(a STATUS_VAR);
2319 b = float32_squash_input_denormal(b STATUS_VAR);
2320
2321 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2322 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2323 ) {
2324 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2325 float_raise( float_flag_invalid STATUS_VAR);
2326 }
2327 return 0;
2328 }
2329 return ( float32_val(a) == float32_val(b) ) ||
2330 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2331
2332 }
2333
2334 /*----------------------------------------------------------------------------
2335 | Returns 1 if the single-precision floating-point value `a' is less than
2336 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2338 | Arithmetic.
2339 *----------------------------------------------------------------------------*/
2340
2341 int float32_le( float32 a, float32 b STATUS_PARAM )
2342 {
2343 flag aSign, bSign;
2344 bits32 av, bv;
2345 a = float32_squash_input_denormal(a STATUS_VAR);
2346 b = float32_squash_input_denormal(b STATUS_VAR);
2347
2348 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2349 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2350 ) {
2351 float_raise( float_flag_invalid STATUS_VAR);
2352 return 0;
2353 }
2354 aSign = extractFloat32Sign( a );
2355 bSign = extractFloat32Sign( b );
2356 av = float32_val(a);
2357 bv = float32_val(b);
2358 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2359 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2360
2361 }
2362
2363 /*----------------------------------------------------------------------------
2364 | Returns 1 if the single-precision floating-point value `a' is less than
2365 | the corresponding value `b', and 0 otherwise. The comparison is performed
2366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2367 *----------------------------------------------------------------------------*/
2368
2369 int float32_lt( float32 a, float32 b STATUS_PARAM )
2370 {
2371 flag aSign, bSign;
2372 bits32 av, bv;
2373 a = float32_squash_input_denormal(a STATUS_VAR);
2374 b = float32_squash_input_denormal(b STATUS_VAR);
2375
2376 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2377 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2378 ) {
2379 float_raise( float_flag_invalid STATUS_VAR);
2380 return 0;
2381 }
2382 aSign = extractFloat32Sign( a );
2383 bSign = extractFloat32Sign( b );
2384 av = float32_val(a);
2385 bv = float32_val(b);
2386 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2387 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2388
2389 }
2390
2391 /*----------------------------------------------------------------------------
2392 | Returns 1 if the single-precision floating-point value `a' is equal to
2393 | the corresponding value `b', and 0 otherwise. The invalid exception is
2394 | raised if either operand is a NaN. Otherwise, the comparison is performed
2395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2396 *----------------------------------------------------------------------------*/
2397
2398 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2399 {
2400 bits32 av, bv;
2401 a = float32_squash_input_denormal(a STATUS_VAR);
2402 b = float32_squash_input_denormal(b STATUS_VAR);
2403
2404 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2405 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2406 ) {
2407 float_raise( float_flag_invalid STATUS_VAR);
2408 return 0;
2409 }
2410 av = float32_val(a);
2411 bv = float32_val(b);
2412 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2413
2414 }
2415
2416 /*----------------------------------------------------------------------------
2417 | Returns 1 if the single-precision floating-point value `a' is less than or
2418 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2419 | cause an exception. Otherwise, the comparison is performed according to the
2420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2421 *----------------------------------------------------------------------------*/
2422
2423 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2424 {
2425 flag aSign, bSign;
2426 bits32 av, bv;
2427 a = float32_squash_input_denormal(a STATUS_VAR);
2428 b = float32_squash_input_denormal(b STATUS_VAR);
2429
2430 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2431 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2432 ) {
2433 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2434 float_raise( float_flag_invalid STATUS_VAR);
2435 }
2436 return 0;
2437 }
2438 aSign = extractFloat32Sign( a );
2439 bSign = extractFloat32Sign( b );
2440 av = float32_val(a);
2441 bv = float32_val(b);
2442 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2443 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2444
2445 }
2446
2447 /*----------------------------------------------------------------------------
2448 | Returns 1 if the single-precision floating-point value `a' is less than
2449 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2450 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2451 | Standard for Binary Floating-Point Arithmetic.
2452 *----------------------------------------------------------------------------*/
2453
2454 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2455 {
2456 flag aSign, bSign;
2457 bits32 av, bv;
2458 a = float32_squash_input_denormal(a STATUS_VAR);
2459 b = float32_squash_input_denormal(b STATUS_VAR);
2460
2461 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2462 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2463 ) {
2464 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2465 float_raise( float_flag_invalid STATUS_VAR);
2466 }
2467 return 0;
2468 }
2469 aSign = extractFloat32Sign( a );
2470 bSign = extractFloat32Sign( b );
2471 av = float32_val(a);
2472 bv = float32_val(b);
2473 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2474 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2475
2476 }
2477
2478 /*----------------------------------------------------------------------------
2479 | Returns the result of converting the double-precision floating-point value
2480 | `a' to the 32-bit two's complement integer format. The conversion is
2481 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2482 | Arithmetic---which means in particular that the conversion is rounded
2483 | according to the current rounding mode. If `a' is a NaN, the largest
2484 | positive integer is returned. Otherwise, if the conversion overflows, the
2485 | largest integer with the same sign as `a' is returned.
2486 *----------------------------------------------------------------------------*/
2487
2488 int32 float64_to_int32( float64 a STATUS_PARAM )
2489 {
2490 flag aSign;
2491 int16 aExp, shiftCount;
2492 bits64 aSig;
2493 a = float64_squash_input_denormal(a STATUS_VAR);
2494
2495 aSig = extractFloat64Frac( a );
2496 aExp = extractFloat64Exp( a );
2497 aSign = extractFloat64Sign( a );
2498 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2499 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2500 shiftCount = 0x42C - aExp;
2501 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2502 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2503
2504 }
2505
2506 /*----------------------------------------------------------------------------
2507 | Returns the result of converting the double-precision floating-point value
2508 | `a' to the 32-bit two's complement integer format. The conversion is
2509 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2510 | Arithmetic, except that the conversion is always rounded toward zero.
2511 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2512 | the conversion overflows, the largest integer with the same sign as `a' is
2513 | returned.
2514 *----------------------------------------------------------------------------*/
2515
2516 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2517 {
2518 flag aSign;
2519 int16 aExp, shiftCount;
2520 bits64 aSig, savedASig;
2521 int32 z;
2522 a = float64_squash_input_denormal(a STATUS_VAR);
2523
2524 aSig = extractFloat64Frac( a );
2525 aExp = extractFloat64Exp( a );
2526 aSign = extractFloat64Sign( a );
2527 if ( 0x41E < aExp ) {
2528 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2529 goto invalid;
2530 }
2531 else if ( aExp < 0x3FF ) {
2532 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2533 return 0;
2534 }
2535 aSig |= LIT64( 0x0010000000000000 );
2536 shiftCount = 0x433 - aExp;
2537 savedASig = aSig;
2538 aSig >>= shiftCount;
2539 z = aSig;
2540 if ( aSign ) z = - z;
2541 if ( ( z < 0 ) ^ aSign ) {
2542 invalid:
2543 float_raise( float_flag_invalid STATUS_VAR);
2544 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2545 }
2546 if ( ( aSig<<shiftCount ) != savedASig ) {
2547 STATUS(float_exception_flags) |= float_flag_inexact;
2548 }
2549 return z;
2550
2551 }
2552
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of converting the double-precision floating-point value
2555 | `a' to the 16-bit two's complement integer format. The conversion is
2556 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2557 | Arithmetic, except that the conversion is always rounded toward zero.
2558 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2559 | the conversion overflows, the largest integer with the same sign as `a' is
2560 | returned.
2561 *----------------------------------------------------------------------------*/
2562
2563 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2564 {
2565 flag aSign;
2566 int16 aExp, shiftCount;
2567 bits64 aSig, savedASig;
2568 int32 z;
2569
2570 aSig = extractFloat64Frac( a );
2571 aExp = extractFloat64Exp( a );
2572 aSign = extractFloat64Sign( a );
2573 if ( 0x40E < aExp ) {
2574 if ( ( aExp == 0x7FF ) && aSig ) {
2575 aSign = 0;
2576 }
2577 goto invalid;
2578 }
2579 else if ( aExp < 0x3FF ) {
2580 if ( aExp || aSig ) {
2581 STATUS(float_exception_flags) |= float_flag_inexact;
2582 }
2583 return 0;
2584 }
2585 aSig |= LIT64( 0x0010000000000000 );
2586 shiftCount = 0x433 - aExp;
2587 savedASig = aSig;
2588 aSig >>= shiftCount;
2589 z = aSig;
2590 if ( aSign ) {
2591 z = - z;
2592 }
2593 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2594 invalid:
2595 float_raise( float_flag_invalid STATUS_VAR);
2596 return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2597 }
2598 if ( ( aSig<<shiftCount ) != savedASig ) {
2599 STATUS(float_exception_flags) |= float_flag_inexact;
2600 }
2601 return z;
2602 }
2603
2604 /*----------------------------------------------------------------------------
2605 | Returns the result of converting the double-precision floating-point value
2606 | `a' to the 64-bit two's complement integer format. The conversion is
2607 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2608 | Arithmetic---which means in particular that the conversion is rounded
2609 | according to the current rounding mode. If `a' is a NaN, the largest
2610 | positive integer is returned. Otherwise, if the conversion overflows, the
2611 | largest integer with the same sign as `a' is returned.
2612 *----------------------------------------------------------------------------*/
2613
2614 int64 float64_to_int64( float64 a STATUS_PARAM )
2615 {
2616 flag aSign;
2617 int16 aExp, shiftCount;
2618 bits64 aSig, aSigExtra;
2619 a = float64_squash_input_denormal(a STATUS_VAR);
2620
2621 aSig = extractFloat64Frac( a );
2622 aExp = extractFloat64Exp( a );
2623 aSign = extractFloat64Sign( a );
2624 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2625 shiftCount = 0x433 - aExp;
2626 if ( shiftCount <= 0 ) {
2627 if ( 0x43E < aExp ) {
2628 float_raise( float_flag_invalid STATUS_VAR);
2629 if ( ! aSign
2630 || ( ( aExp == 0x7FF )
2631 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2632 ) {
2633 return LIT64( 0x7FFFFFFFFFFFFFFF );
2634 }
2635 return (sbits64) LIT64( 0x8000000000000000 );
2636 }
2637 aSigExtra = 0;
2638 aSig <<= - shiftCount;
2639 }
2640 else {
2641 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2642 }
2643 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2644
2645 }
2646
2647 /*----------------------------------------------------------------------------
2648 | Returns the result of converting the double-precision floating-point value
2649 | `a' to the 64-bit two's complement integer format. The conversion is
2650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2651 | Arithmetic, except that the conversion is always rounded toward zero.
2652 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2653 | the conversion overflows, the largest integer with the same sign as `a' is
2654 | returned.
2655 *----------------------------------------------------------------------------*/
2656
2657 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2658 {
2659 flag aSign;
2660 int16 aExp, shiftCount;
2661 bits64 aSig;
2662 int64 z;
2663 a = float64_squash_input_denormal(a STATUS_VAR);
2664
2665 aSig = extractFloat64Frac( a );
2666 aExp = extractFloat64Exp( a );
2667 aSign = extractFloat64Sign( a );
2668 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2669 shiftCount = aExp - 0x433;
2670 if ( 0 <= shiftCount ) {
2671 if ( 0x43E <= aExp ) {
2672 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2673 float_raise( float_flag_invalid STATUS_VAR);
2674 if ( ! aSign
2675 || ( ( aExp == 0x7FF )
2676 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2677 ) {
2678 return LIT64( 0x7FFFFFFFFFFFFFFF );
2679 }
2680 }
2681 return (sbits64) LIT64( 0x8000000000000000 );
2682 }
2683 z = aSig<<shiftCount;
2684 }
2685 else {
2686 if ( aExp < 0x3FE ) {
2687 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2688 return 0;
2689 }
2690 z = aSig>>( - shiftCount );
2691 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2692 STATUS(float_exception_flags) |= float_flag_inexact;
2693 }
2694 }
2695 if ( aSign ) z = - z;
2696 return z;
2697
2698 }
2699
2700 /*----------------------------------------------------------------------------
2701 | Returns the result of converting the double-precision floating-point value
2702 | `a' to the single-precision floating-point format. The conversion is
2703 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2704 | Arithmetic.
2705 *----------------------------------------------------------------------------*/
2706
2707 float32 float64_to_float32( float64 a STATUS_PARAM )
2708 {
2709 flag aSign;
2710 int16 aExp;
2711 bits64 aSig;
2712 bits32 zSig;
2713 a = float64_squash_input_denormal(a STATUS_VAR);
2714
2715 aSig = extractFloat64Frac( a );
2716 aExp = extractFloat64Exp( a );
2717 aSign = extractFloat64Sign( a );
2718 if ( aExp == 0x7FF ) {
2719 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2720 return packFloat32( aSign, 0xFF, 0 );
2721 }
2722 shift64RightJamming( aSig, 22, &aSig );
2723 zSig = aSig;
2724 if ( aExp || zSig ) {
2725 zSig |= 0x40000000;
2726 aExp -= 0x381;
2727 }
2728 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2729
2730 }
2731
2732
2733 /*----------------------------------------------------------------------------
2734 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2735 | half-precision floating-point value, returning the result. After being
2736 | shifted into the proper positions, the three fields are simply added
2737 | together to form the result. This means that any integer portion of `zSig'
2738 | will be added into the exponent. Since a properly normalized significand
2739 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2740 | than the desired result exponent whenever `zSig' is a complete, normalized
2741 | significand.
2742 *----------------------------------------------------------------------------*/
2743 static float16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2744 {
2745 return make_float16(
2746 (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig);
2747 }
2748
2749 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2750 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2751
2752 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2753 {
2754 flag aSign;
2755 int16 aExp;
2756 bits32 aSig;
2757
2758 aSign = extractFloat16Sign(a);
2759 aExp = extractFloat16Exp(a);
2760 aSig = extractFloat16Frac(a);
2761
2762 if (aExp == 0x1f && ieee) {
2763 if (aSig) {
2764 /* Make sure correct exceptions are raised. */
2765 float32ToCommonNaN(a STATUS_VAR);
2766 aSig |= 0x200;
2767 }
2768 return packFloat32(aSign, 0xff, aSig << 13);
2769 }
2770 if (aExp == 0) {
2771 int8 shiftCount;
2772
2773 if (aSig == 0) {
2774 return packFloat32(aSign, 0, 0);
2775 }
2776
2777 shiftCount = countLeadingZeros32( aSig ) - 21;
2778 aSig = aSig << shiftCount;
2779 aExp = -shiftCount;
2780 }
2781 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2782 }
2783
2784 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2785 {
2786 flag aSign;
2787 int16 aExp;
2788 bits32 aSig;
2789 bits32 mask;
2790 bits32 increment;
2791 int8 roundingMode;
2792 a = float32_squash_input_denormal(a STATUS_VAR);
2793
2794 aSig = extractFloat32Frac( a );
2795 aExp = extractFloat32Exp( a );
2796 aSign = extractFloat32Sign( a );
2797 if ( aExp == 0xFF ) {
2798 if (aSig) {
2799 /* Make sure correct exceptions are raised. */
2800 float32ToCommonNaN(a STATUS_VAR);
2801 aSig |= 0x00400000;
2802 }
2803 return packFloat16(aSign, 0x1f, aSig >> 13);
2804 }
2805 if (aExp == 0 && aSign == 0) {
2806 return packFloat16(aSign, 0, 0);
2807 }
2808 /* Decimal point between bits 22 and 23. */
2809 aSig |= 0x00800000;
2810 aExp -= 0x7f;
2811 if (aExp < -14) {
2812 mask = 0x007fffff;
2813 if (aExp < -24) {
2814 aExp = -25;
2815 } else {
2816 mask >>= 24 + aExp;
2817 }
2818 } else {
2819 mask = 0x00001fff;
2820 }
2821 if (aSig & mask) {
2822 float_raise( float_flag_underflow STATUS_VAR );
2823 roundingMode = STATUS(float_rounding_mode);
2824 switch (roundingMode) {
2825 case float_round_nearest_even:
2826 increment = (mask + 1) >> 1;
2827 if ((aSig & mask) == increment) {
2828 increment = aSig & (increment << 1);
2829 }
2830 break;
2831 case float_round_up:
2832 increment = aSign ? 0 : mask;
2833 break;
2834 case float_round_down:
2835 increment = aSign ? mask : 0;
2836 break;
2837 default: /* round_to_zero */
2838 increment = 0;
2839 break;
2840 }
2841 aSig += increment;
2842 if (aSig >= 0x01000000) {
2843 aSig >>= 1;
2844 aExp++;
2845 }
2846 } else if (aExp < -14
2847 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2848 float_raise( float_flag_underflow STATUS_VAR);
2849 }
2850
2851 if (ieee) {
2852 if (aExp > 15) {
2853 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2854 return packFloat16(aSign, 0x1f, 0);
2855 }
2856 } else {
2857 if (aExp > 16) {
2858 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2859 return packFloat16(aSign, 0x1f, 0x3ff);
2860 }
2861 }
2862 if (aExp < -24) {
2863 return packFloat16(aSign, 0, 0);
2864 }
2865 if (aExp < -14) {
2866 aSig >>= -14 - aExp;
2867 aExp = -14;
2868 }
2869 return packFloat16(aSign, aExp + 14, aSig >> 13);
2870 }
2871
2872 #ifdef FLOATX80
2873
2874 /*----------------------------------------------------------------------------
2875 | Returns the result of converting the double-precision floating-point value
2876 | `a' to the extended double-precision floating-point format. The conversion
2877 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2878 | Arithmetic.
2879 *----------------------------------------------------------------------------*/
2880
2881 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2882 {
2883 flag aSign;
2884 int16 aExp;
2885 bits64 aSig;
2886
2887 a = float64_squash_input_denormal(a STATUS_VAR);
2888 aSig = extractFloat64Frac( a );
2889 aExp = extractFloat64Exp( a );
2890 aSign = extractFloat64Sign( a );
2891 if ( aExp == 0x7FF ) {
2892 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2893 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2894 }
2895 if ( aExp == 0 ) {
2896 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2897 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2898 }
2899 return
2900 packFloatx80(
2901 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2902
2903 }
2904
2905 #endif
2906
2907 #ifdef FLOAT128
2908
2909 /*----------------------------------------------------------------------------
2910 | Returns the result of converting the double-precision floating-point value
2911 | `a' to the quadruple-precision floating-point format. The conversion is
2912 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2913 | Arithmetic.
2914 *----------------------------------------------------------------------------*/
2915
2916 float128 float64_to_float128( float64 a STATUS_PARAM )
2917 {
2918 flag aSign;
2919 int16 aExp;
2920 bits64 aSig, zSig0, zSig1;
2921
2922 a = float64_squash_input_denormal(a STATUS_VAR);
2923 aSig = extractFloat64Frac( a );
2924 aExp = extractFloat64Exp( a );
2925 aSign = extractFloat64Sign( a );
2926 if ( aExp == 0x7FF ) {
2927 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2928 return packFloat128( aSign, 0x7FFF, 0, 0 );
2929 }
2930 if ( aExp == 0 ) {
2931 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2932 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2933 --aExp;
2934 }
2935 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2936 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2937
2938 }
2939
2940 #endif
2941
2942 /*----------------------------------------------------------------------------
2943 | Rounds the double-precision floating-point value `a' to an integer, and
2944 | returns the result as a double-precision floating-point value. The
2945 | operation is performed according to the IEC/IEEE Standard for Binary
2946 | Floating-Point Arithmetic.
2947 *----------------------------------------------------------------------------*/
2948
2949 float64 float64_round_to_int( float64 a STATUS_PARAM )
2950 {
2951 flag aSign;
2952 int16 aExp;
2953 bits64 lastBitMask, roundBitsMask;
2954 int8 roundingMode;
2955 bits64 z;
2956 a = float64_squash_input_denormal(a STATUS_VAR);
2957
2958 aExp = extractFloat64Exp( a );
2959 if ( 0x433 <= aExp ) {
2960 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2961 return propagateFloat64NaN( a, a STATUS_VAR );
2962 }
2963 return a;
2964 }
2965 if ( aExp < 0x3FF ) {
2966 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2967 STATUS(float_exception_flags) |= float_flag_inexact;
2968 aSign = extractFloat64Sign( a );
2969 switch ( STATUS(float_rounding_mode) ) {
2970 case float_round_nearest_even:
2971 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2972 return packFloat64( aSign, 0x3FF, 0 );
2973 }
2974 break;
2975 case float_round_down:
2976 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2977 case float_round_up:
2978 return make_float64(
2979 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2980 }
2981 return packFloat64( aSign, 0, 0 );
2982 }
2983 lastBitMask = 1;
2984 lastBitMask <<= 0x433 - aExp;
2985 roundBitsMask = lastBitMask - 1;
2986 z = float64_val(a);
2987 roundingMode = STATUS(float_rounding_mode);
2988 if ( roundingMode == float_round_nearest_even ) {
2989 z += lastBitMask>>1;
2990 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2991 }
2992 else if ( roundingMode != float_round_to_zero ) {
2993 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2994 z += roundBitsMask;
2995 }
2996 }
2997 z &= ~ roundBitsMask;
2998 if ( z != float64_val(a) )
2999 STATUS(float_exception_flags) |= float_flag_inexact;
3000 return make_float64(z);
3001
3002 }
3003
3004 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3005 {
3006 int oldmode;
3007 float64 res;
3008 oldmode = STATUS(float_rounding_mode);
3009 STATUS(float_rounding_mode) = float_round_to_zero;
3010 res = float64_round_to_int(a STATUS_VAR);
3011 STATUS(float_rounding_mode) = oldmode;
3012 return res;
3013 }
3014
3015 /*----------------------------------------------------------------------------
3016 | Returns the result of adding the absolute values of the double-precision
3017 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3018 | before being returned. `zSign' is ignored if the result is a NaN.
3019 | The addition is performed according to the IEC/IEEE Standard for Binary
3020 | Floating-Point Arithmetic.
3021 *----------------------------------------------------------------------------*/
3022
3023 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3024 {
3025 int16 aExp, bExp, zExp;
3026 bits64 aSig, bSig, zSig;
3027 int16 expDiff;
3028
3029 aSig = extractFloat64Frac( a );
3030 aExp = extractFloat64Exp( a );
3031 bSig = extractFloat64Frac( b );
3032 bExp = extractFloat64Exp( b );
3033 expDiff = aExp - bExp;
3034 aSig <<= 9;
3035 bSig <<= 9;
3036 if ( 0 < expDiff ) {
3037 if ( aExp == 0x7FF ) {
3038 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3039 return a;
3040 }
3041 if ( bExp == 0 ) {
3042 --expDiff;
3043 }
3044 else {
3045 bSig |= LIT64( 0x2000000000000000 );
3046 }
3047 shift64RightJamming( bSig, expDiff, &bSig );
3048 zExp = aExp;
3049 }
3050 else if ( expDiff < 0 ) {
3051 if ( bExp == 0x7FF ) {
3052 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3053 return packFloat64( zSign, 0x7FF, 0 );
3054 }
3055 if ( aExp == 0 ) {
3056 ++expDiff;
3057 }
3058 else {
3059 aSig |= LIT64( 0x2000000000000000 );
3060 }
3061 shift64RightJamming( aSig, - expDiff, &aSig );
3062 zExp = bExp;
3063 }
3064 else {
3065 if ( aExp == 0x7FF ) {
3066 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3067 return a;
3068 }
3069 if ( aExp == 0 ) {
3070 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3071 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3072 }
3073 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3074 zExp = aExp;
3075 goto roundAndPack;
3076 }
3077 aSig |= LIT64( 0x2000000000000000 );
3078 zSig = ( aSig + bSig )<<1;
3079 --zExp;
3080 if ( (sbits64) zSig < 0 ) {
3081 zSig = aSig + bSig;
3082 ++zExp;
3083 }
3084 roundAndPack:
3085 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3086
3087 }
3088
3089 /*----------------------------------------------------------------------------
3090 | Returns the result of subtracting the absolute values of the double-
3091 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3092 | difference is negated before being returned. `zSign' is ignored if the
3093 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3094 | Standard for Binary Floating-Point Arithmetic.
3095 *----------------------------------------------------------------------------*/
3096
3097 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3098 {
3099 int16 aExp, bExp, zExp;
3100 bits64 aSig, bSig, zSig;
3101 int16 expDiff;
3102
3103 aSig = extractFloat64Frac( a );
3104 aExp = extractFloat64Exp( a );
3105 bSig = extractFloat64Frac( b );
3106 bExp = extractFloat64Exp( b );
3107 expDiff = aExp - bExp;
3108 aSig <<= 10;
3109 bSig <<= 10;
3110 if ( 0 < expDiff ) goto aExpBigger;
3111 if ( expDiff < 0 ) goto bExpBigger;
3112 if ( aExp == 0x7FF ) {
3113 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3114 float_raise( float_flag_invalid STATUS_VAR);
3115 return float64_default_nan;
3116 }
3117 if ( aExp == 0 ) {
3118 aExp = 1;
3119 bExp = 1;
3120 }
3121 if ( bSig < aSig ) goto aBigger;
3122 if ( aSig < bSig ) goto bBigger;
3123 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3124 bExpBigger:
3125 if ( bExp == 0x7FF ) {
3126 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3127 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3128 }
3129 if ( aExp == 0 ) {
3130 ++expDiff;
3131 }
3132 else {
3133 aSig |= LIT64( 0x4000000000000000 );
3134 }
3135 shift64RightJamming( aSig, - expDiff, &aSig );
3136 bSig |= LIT64( 0x4000000000000000 );
3137 bBigger:
3138 zSig = bSig - aSig;
3139 zExp = bExp;
3140 zSign ^= 1;
3141 goto normalizeRoundAndPack;
3142 aExpBigger:
3143 if ( aExp == 0x7FF ) {
3144 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3145 return a;
3146 }
3147 if ( bExp == 0 ) {
3148 --expDiff;
3149 }
3150 else {
3151 bSig |= LIT64( 0x4000000000000000 );
3152 }
3153 shift64RightJamming( bSig, expDiff, &bSig );
3154 aSig |= LIT64( 0x4000000000000000 );
3155 aBigger:
3156 zSig = aSig - bSig;
3157 zExp = aExp;
3158 normalizeRoundAndPack:
3159 --zExp;
3160 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3161
3162 }
3163
3164 /*----------------------------------------------------------------------------
3165 | Returns the result of adding the double-precision floating-point values `a'
3166 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3167 | Binary Floating-Point Arithmetic.
3168 *----------------------------------------------------------------------------*/
3169
3170 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3171 {
3172 flag aSign, bSign;
3173 a = float64_squash_input_denormal(a STATUS_VAR);
3174 b = float64_squash_input_denormal(b STATUS_VAR);
3175
3176 aSign = extractFloat64Sign( a );
3177 bSign = extractFloat64Sign( b );
3178 if ( aSign == bSign ) {
3179 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3180 }
3181 else {
3182 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3183 }
3184
3185 }
3186
3187 /*----------------------------------------------------------------------------
3188 | Returns the result of subtracting the double-precision floating-point values
3189 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3190 | for Binary Floating-Point Arithmetic.
3191 *----------------------------------------------------------------------------*/
3192
3193 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3194 {
3195 flag aSign, bSign;
3196 a = float64_squash_input_denormal(a STATUS_VAR);
3197 b = float64_squash_input_denormal(b STATUS_VAR);
3198
3199 aSign = extractFloat64Sign( a );
3200 bSign = extractFloat64Sign( b );
3201 if ( aSign == bSign ) {
3202 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3203 }
3204 else {
3205 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3206 }
3207
3208 }
3209
3210 /*----------------------------------------------------------------------------
3211 | Returns the result of multiplying the double-precision floating-point values
3212 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3213 | for Binary Floating-Point Arithmetic.
3214 *----------------------------------------------------------------------------*/
3215
3216 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3217 {
3218 flag aSign, bSign, zSign;
3219 int16 aExp, bExp, zExp;
3220 bits64 aSig, bSig, zSig0, zSig1;
3221
3222 a = float64_squash_input_denormal(a STATUS_VAR);
3223 b = float64_squash_input_denormal(b STATUS_VAR);
3224
3225 aSig = extractFloat64Frac( a );
3226 aExp = extractFloat64Exp( a );
3227 aSign = extractFloat64Sign( a );
3228 bSig = extractFloat64Frac( b );
3229 bExp = extractFloat64Exp( b );
3230 bSign = extractFloat64Sign( b );
3231 zSign = aSign ^ bSign;
3232 if ( aExp == 0x7FF ) {
3233 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3234 return propagateFloat64NaN( a, b STATUS_VAR );
3235 }
3236 if ( ( bExp | bSig ) == 0 ) {
3237 float_raise( float_flag_invalid STATUS_VAR);
3238 return float64_default_nan;
3239 }
3240 return packFloat64( zSign, 0x7FF, 0 );
3241 }
3242 if ( bExp == 0x7FF ) {
3243 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3244 if ( ( aExp | aSig ) == 0 ) {
3245 float_raise( float_flag_invalid STATUS_VAR);
3246 return float64_default_nan;
3247 }
3248 return packFloat64( zSign, 0x7FF, 0 );
3249 }
3250 if ( aExp == 0 ) {
3251 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3252 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3253 }
3254 if ( bExp == 0 ) {
3255 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3256 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3257 }
3258 zExp = aExp + bExp - 0x3FF;
3259 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3260 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3261 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3262 zSig0 |= ( zSig1 != 0 );
3263 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3264 zSig0 <<= 1;
3265 --zExp;
3266 }
3267 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3268
3269 }
3270
3271 /*----------------------------------------------------------------------------
3272 | Returns the result of dividing the double-precision floating-point value `a'
3273 | by the corresponding value `b'. The operation is performed according to
3274 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3275 *----------------------------------------------------------------------------*/
3276
3277 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3278 {
3279 flag aSign, bSign, zSign;
3280 int16 aExp, bExp, zExp;
3281 bits64 aSig, bSig, zSig;
3282 bits64 rem0, rem1;
3283 bits64 term0, term1;
3284 a = float64_squash_input_denormal(a STATUS_VAR);
3285 b = float64_squash_input_denormal(b STATUS_VAR);
3286
3287 aSig = extractFloat64Frac( a );
3288 aExp = extractFloat64Exp( a );
3289 aSign = extractFloat64Sign( a );
3290 bSig = extractFloat64Frac( b );
3291 bExp = extractFloat64Exp( b );
3292 bSign = extractFloat64Sign( b );
3293 zSign = aSign ^ bSign;
3294 if ( aExp == 0x7FF ) {
3295 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3296 if ( bExp == 0x7FF ) {
3297 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3298 float_raise( float_flag_invalid STATUS_VAR);
3299 return float64_default_nan;
3300 }
3301 return packFloat64( zSign, 0x7FF, 0 );
3302 }
3303 if ( bExp == 0x7FF ) {
3304 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3305 return packFloat64( zSign, 0, 0 );
3306 }
3307 if ( bExp == 0 ) {
3308 if ( bSig == 0 ) {
3309 if ( ( aExp | aSig ) == 0 ) {
3310 float_raise( float_flag_invalid STATUS_VAR);
3311 return float64_default_nan;
3312 }
3313 float_raise( float_flag_divbyzero STATUS_VAR);
3314 return packFloat64( zSign, 0x7FF, 0 );
3315 }
3316 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3317 }
3318 if ( aExp == 0 ) {
3319 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3320 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3321 }
3322 zExp = aExp - bExp + 0x3FD;
3323 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3324 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3325 if ( bSig <= ( aSig + aSig ) ) {
3326 aSig >>= 1;
3327 ++zExp;
3328 }
3329 zSig = estimateDiv128To64( aSig, 0, bSig );
3330 if ( ( zSig & 0x1FF ) <= 2 ) {
3331 mul64To128( bSig, zSig, &term0, &term1 );
3332 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3333 while ( (sbits64) rem0 < 0 ) {
3334 --zSig;
3335 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3336 }
3337 zSig |= ( rem1 != 0 );
3338 }
3339 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3340
3341 }
3342
3343 /*----------------------------------------------------------------------------
3344 | Returns the remainder of the double-precision floating-point value `a'
3345 | with respect to the corresponding value `b'. The operation is performed
3346 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3347 *----------------------------------------------------------------------------*/
3348
3349 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3350 {
3351 flag aSign, zSign;
3352 int16 aExp, bExp, expDiff;
3353 bits64 aSig, bSig;
3354 bits64 q, alternateASig;
3355 sbits64 sigMean;
3356
3357 a = float64_squash_input_denormal(a STATUS_VAR);
3358 b = float64_squash_input_denormal(b STATUS_VAR);
3359 aSig = extractFloat64Frac( a );
3360 aExp = extractFloat64Exp( a );
3361 aSign = extractFloat64Sign( a );
3362 bSig = extractFloat64Frac( b );
3363 bExp = extractFloat64Exp( b );
3364 if ( aExp == 0x7FF ) {
3365 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3366 return propagateFloat64NaN( a, b STATUS_VAR );
3367 }
3368 float_raise( float_flag_invalid STATUS_VAR);
3369 return float64_default_nan;
3370 }
3371 if ( bExp == 0x7FF ) {
3372 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3373 return a;
3374 }
3375 if ( bExp == 0 ) {
3376 if ( bSig == 0 ) {
3377 float_raise( float_flag_invalid STATUS_VAR);
3378 return float64_default_nan;
3379 }
3380 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3381 }
3382 if ( aExp == 0 ) {
3383 if ( aSig == 0 ) return a;
3384 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3385 }
3386 expDiff = aExp - bExp;
3387 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3388 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3389 if ( expDiff < 0 ) {
3390 if ( expDiff < -1 ) return a;
3391 aSig >>= 1;
3392 }
3393 q = ( bSig <= aSig );
3394 if ( q ) aSig -= bSig;
3395 expDiff -= 64;
3396 while ( 0 < expDiff ) {
3397 q = estimateDiv128To64( aSig, 0, bSig );
3398 q = ( 2 < q ) ? q - 2 : 0;
3399 aSig = - ( ( bSig>>2 ) * q );
3400 expDiff -= 62;
3401 }
3402 expDiff += 64;
3403 if ( 0 < expDiff ) {
3404 q = estimateDiv128To64( aSig, 0, bSig );
3405 q = ( 2 < q ) ? q - 2 : 0;
3406 q >>= 64 - expDiff;
3407 bSig >>= 2;
3408 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3409 }
3410 else {
3411 aSig >>= 2;
3412 bSig >>= 2;
3413 }
3414 do {
3415 alternateASig = aSig;
3416 ++q;
3417 aSig -= bSig;
3418 } while ( 0 <= (sbits64) aSig );
3419 sigMean = aSig + alternateASig;
3420 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3421 aSig = alternateASig;
3422 }
3423 zSign = ( (sbits64) aSig < 0 );
3424 if ( zSign ) aSig = - aSig;
3425 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3426
3427 }
3428
3429 /*----------------------------------------------------------------------------
3430 | Returns the square root of the double-precision floating-point value `a'.
3431 | The operation is performed according to the IEC/IEEE Standard for Binary
3432 | Floating-Point Arithmetic.
3433 *----------------------------------------------------------------------------*/
3434
3435 float64 float64_sqrt( float64 a STATUS_PARAM )
3436 {
3437 flag aSign;
3438 int16 aExp, zExp;
3439 bits64 aSig, zSig, doubleZSig;
3440 bits64 rem0, rem1, term0, term1;
3441 a = float64_squash_input_denormal(a STATUS_VAR);
3442
3443 aSig = extractFloat64Frac( a );
3444 aExp = extractFloat64Exp( a );
3445 aSign = extractFloat64Sign( a );
3446 if ( aExp == 0x7FF ) {
3447 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3448 if ( ! aSign ) return a;
3449 float_raise( float_flag_invalid STATUS_VAR);
3450 return float64_default_nan;
3451 }
3452 if ( aSign ) {
3453 if ( ( aExp | aSig ) == 0 ) return a;
3454 float_raise( float_flag_invalid STATUS_VAR);
3455 return float64_default_nan;
3456 }
3457 if ( aExp == 0 ) {
3458 if ( aSig == 0 ) return float64_zero;
3459 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3460 }
3461 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3462 aSig |= LIT64( 0x0010000000000000 );
3463 zSig = estimateSqrt32( aExp, aSig>>21 );
3464 aSig <<= 9 - ( aExp & 1 );
3465 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3466 if ( ( zSig & 0x1FF ) <= 5 ) {
3467 doubleZSig = zSig<<1;
3468 mul64To128( zSig, zSig, &term0, &term1 );
3469 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3470 while ( (sbits64) rem0 < 0 ) {
3471 --zSig;
3472 doubleZSig -= 2;
3473 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3474 }
3475 zSig |= ( ( rem0 | rem1 ) != 0 );
3476 }
3477 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3478
3479 }
3480
3481 /*----------------------------------------------------------------------------
3482 | Returns the binary log of the double-precision floating-point value `a'.
3483 | The operation is performed according to the IEC/IEEE Standard for Binary
3484 | Floating-Point Arithmetic.
3485 *----------------------------------------------------------------------------*/
3486 float64 float64_log2( float64 a STATUS_PARAM )
3487 {
3488 flag aSign, zSign;
3489 int16 aExp;
3490 bits64 aSig, aSig0, aSig1, zSig, i;
3491 a = float64_squash_input_denormal(a STATUS_VAR);
3492
3493 aSig = extractFloat64Frac( a );
3494 aExp = extractFloat64Exp( a );
3495 aSign = extractFloat64Sign( a );
3496
3497 if ( aExp == 0 ) {
3498 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3499 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3500 }
3501 if ( aSign ) {
3502 float_raise( float_flag_invalid STATUS_VAR);
3503 return float64_default_nan;
3504 }
3505 if ( aExp == 0x7FF ) {
3506 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3507 return a;
3508 }
3509
3510 aExp -= 0x3FF;
3511 aSig |= LIT64( 0x0010000000000000 );
3512 zSign = aExp < 0;
3513 zSig = (bits64)aExp << 52;
3514 for (i = 1LL << 51; i > 0; i >>= 1) {
3515 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3516 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3517 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3518 aSig >>= 1;
3519 zSig |= i;
3520 }
3521 }
3522
3523 if ( zSign )
3524 zSig = -zSig;
3525 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3526 }
3527
3528 /*----------------------------------------------------------------------------
3529 | Returns 1 if the double-precision floating-point value `a' is equal to the
3530 | corresponding value `b', and 0 otherwise. The comparison is performed
3531 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3532 *----------------------------------------------------------------------------*/
3533
3534 int float64_eq( float64 a, float64 b STATUS_PARAM )
3535 {
3536 bits64 av, bv;
3537 a = float64_squash_input_denormal(a STATUS_VAR);
3538 b = float64_squash_input_denormal(b STATUS_VAR);
3539
3540 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3541 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3542 ) {
3543 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3544 float_raise( float_flag_invalid STATUS_VAR);
3545 }
3546 return 0;
3547 }
3548 av = float64_val(a);
3549 bv = float64_val(b);
3550 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3551
3552 }
3553
3554 /*----------------------------------------------------------------------------
3555 | Returns 1 if the double-precision floating-point value `a' is less than or
3556 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3557 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3558 | Arithmetic.
3559 *----------------------------------------------------------------------------*/
3560
3561 int float64_le( float64 a, float64 b STATUS_PARAM )
3562 {
3563 flag aSign, bSign;
3564 bits64 av, bv;
3565 a = float64_squash_input_denormal(a STATUS_VAR);
3566 b = float64_squash_input_denormal(b STATUS_VAR);
3567
3568 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3569 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3570 ) {
3571 float_raise( float_flag_invalid STATUS_VAR);
3572 return 0;
3573 }
3574 aSign = extractFloat64Sign( a );
3575 bSign = extractFloat64Sign( b );
3576 av = float64_val(a);
3577 bv = float64_val(b);
3578 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3579 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3580
3581 }
3582
3583 /*----------------------------------------------------------------------------
3584 | Returns 1 if the double-precision floating-point value `a' is less than
3585 | the corresponding value `b', and 0 otherwise. The comparison is performed
3586 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3587 *----------------------------------------------------------------------------*/
3588
3589 int float64_lt( float64 a, float64 b STATUS_PARAM )
3590 {
3591 flag aSign, bSign;
3592 bits64 av, bv;
3593
3594 a = float64_squash_input_denormal(a STATUS_VAR);
3595 b = float64_squash_input_denormal(b STATUS_VAR);
3596 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3597 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3598 ) {
3599 float_raise( float_flag_invalid STATUS_VAR);
3600 return 0;
3601 }
3602 aSign = extractFloat64Sign( a );
3603 bSign = extractFloat64Sign( b );
3604 av = float64_val(a);
3605 bv = float64_val(b);
3606 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3607 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3608
3609 }
3610
3611 /*----------------------------------------------------------------------------
3612 | Returns 1 if the double-precision floating-point value `a' is equal to the
3613 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3614 | if either operand is a NaN. Otherwise, the comparison is performed
3615 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3616 *----------------------------------------------------------------------------*/
3617
3618 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3619 {
3620 bits64 av, bv;
3621 a = float64_squash_input_denormal(a STATUS_VAR);
3622 b = float64_squash_input_denormal(b STATUS_VAR);
3623
3624 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3625 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3626 ) {
3627 float_raise( float_flag_invalid STATUS_VAR);
3628 return 0;
3629 }
3630 av = float64_val(a);
3631 bv = float64_val(b);
3632 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3633
3634 }
3635
3636 /*----------------------------------------------------------------------------
3637 | Returns 1 if the double-precision floating-point value `a' is less than or
3638 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3639 | cause an exception. Otherwise, the comparison is performed according to the
3640 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3641 *----------------------------------------------------------------------------*/
3642
3643 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3644 {
3645 flag aSign, bSign;
3646 bits64 av, bv;
3647 a = float64_squash_input_denormal(a STATUS_VAR);
3648 b = float64_squash_input_denormal(b STATUS_VAR);
3649
3650 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3651 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3652 ) {
3653 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3654 float_raise( float_flag_invalid STATUS_VAR);
3655 }
3656 return 0;
3657 }
3658 aSign = extractFloat64Sign( a );
3659 bSign = extractFloat64Sign( b );
3660 av = float64_val(a);
3661 bv = float64_val(b);
3662 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3663 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3664
3665 }
3666
3667 /*----------------------------------------------------------------------------
3668 | Returns 1 if the double-precision floating-point value `a' is less than
3669 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3670 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3671 | Standard for Binary Floating-Point Arithmetic.
3672 *----------------------------------------------------------------------------*/
3673
3674 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3675 {
3676 flag aSign, bSign;
3677 bits64 av, bv;
3678 a = float64_squash_input_denormal(a STATUS_VAR);
3679 b = float64_squash_input_denormal(b STATUS_VAR);
3680
3681 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3682 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3683 ) {
3684 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3685 float_raise( float_flag_invalid STATUS_VAR);
3686 }
3687 return 0;
3688 }
3689 aSign = extractFloat64Sign( a );
3690 bSign = extractFloat64Sign( b );
3691 av = float64_val(a);
3692 bv = float64_val(b);
3693 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3694 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3695
3696 }
3697
3698 #ifdef FLOATX80
3699
3700 /*----------------------------------------------------------------------------
3701 | Returns the result of converting the extended double-precision floating-
3702 | point value `a' to the 32-bit two's complement integer format. The
3703 | conversion is performed according to the IEC/IEEE Standard for Binary
3704 | Floating-Point Arithmetic---which means in particular that the conversion
3705 | is rounded according to the current rounding mode. If `a' is a NaN, the
3706 | largest positive integer is returned. Otherwise, if the conversion
3707 | overflows, the largest integer with the same sign as `a' is returned.
3708 *----------------------------------------------------------------------------*/
3709
3710 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3711 {
3712 flag aSign;
3713 int32 aExp, shiftCount;
3714 bits64 aSig;
3715
3716 aSig = extractFloatx80Frac( a );
3717 aExp = extractFloatx80Exp( a );
3718 aSign = extractFloatx80Sign( a );
3719 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3720 shiftCount = 0x4037 - aExp;
3721 if ( shiftCount <= 0 ) shiftCount = 1;
3722 shift64RightJamming( aSig, shiftCount, &aSig );
3723 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3724
3725 }
3726
3727 /*----------------------------------------------------------------------------
3728 | Returns the result of converting the extended double-precision floating-
3729 | point value `a' to the 32-bit two's complement integer format. The
3730 | conversion is performed according to the IEC/IEEE Standard for Binary
3731 | Floating-Point Arithmetic, except that the conversion is always rounded
3732 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3733 | Otherwise, if the conversion overflows, the largest integer with the same
3734 | sign as `a' is returned.
3735 *----------------------------------------------------------------------------*/
3736
3737 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3738 {
3739 flag aSign;
3740 int32 aExp, shiftCount;
3741 bits64 aSig, savedASig;
3742 int32 z;
3743
3744 aSig = extractFloatx80Frac( a );
3745 aExp = extractFloatx80Exp( a );
3746 aSign = extractFloatx80Sign( a );
3747 if ( 0x401E < aExp ) {
3748 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3749 goto invalid;
3750 }
3751 else if ( aExp < 0x3FFF ) {
3752 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3753 return 0;
3754 }
3755 shiftCount = 0x403E - aExp;
3756 savedASig = aSig;
3757 aSig >>= shiftCount;
3758 z = aSig;
3759 if ( aSign ) z = - z;
3760 if ( ( z < 0 ) ^ aSign ) {
3761 invalid:
3762 float_raise( float_flag_invalid STATUS_VAR);
3763 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3764 }
3765 if ( ( aSig<<shiftCount ) != savedASig ) {
3766 STATUS(float_exception_flags) |= float_flag_inexact;
3767 }
3768 return z;
3769
3770 }
3771
3772 /*----------------------------------------------------------------------------
3773 | Returns the result of converting the extended double-precision floating-
3774 | point value `a' to the 64-bit two's complement integer format. The
3775 | conversion is performed according to the IEC/IEEE Standard for Binary
3776 | Floating-Point Arithmetic---which means in particular that the conversion
3777 | is rounded according to the current rounding mode. If `a' is a NaN,
3778 | the largest positive integer is returned. Otherwise, if the conversion
3779 | overflows, the largest integer with the same sign as `a' is returned.
3780 *----------------------------------------------------------------------------*/
3781
3782 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3783 {
3784 flag aSign;
3785 int32 aExp, shiftCount;
3786 bits64 aSig, aSigExtra;
3787
3788 aSig = extractFloatx80Frac( a );
3789 aExp = extractFloatx80Exp( a );
3790 aSign = extractFloatx80Sign( a );
3791 shiftCount = 0x403E - aExp;
3792 if ( shiftCount <= 0 ) {
3793 if ( shiftCount ) {
3794 float_raise( float_flag_invalid STATUS_VAR);
3795 if ( ! aSign
3796 || ( ( aExp == 0x7FFF )
3797 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3798 ) {
3799 return LIT64( 0x7FFFFFFFFFFFFFFF );
3800 }
3801 return (sbits64) LIT64( 0x8000000000000000 );
3802 }
3803 aSigExtra = 0;
3804 }
3805 else {
3806 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3807 }
3808 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3809
3810 }
3811
3812 /*----------------------------------------------------------------------------
3813 | Returns the result of converting the extended double-precision floating-
3814 | point value `a' to the 64-bit two's complement integer format. The
3815 | conversion is performed according to the IEC/IEEE Standard for Binary
3816 | Floating-Point Arithmetic, except that the conversion is always rounded
3817 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3818 | Otherwise, if the conversion overflows, the largest integer with the same
3819 | sign as `a' is returned.
3820 *----------------------------------------------------------------------------*/
3821
3822 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3823 {
3824 flag aSign;
3825 int32 aExp, shiftCount;
3826 bits64 aSig;
3827 int64 z;
3828
3829 aSig = extractFloatx80Frac( a );
3830 aExp = extractFloatx80Exp( a );
3831 aSign = extractFloatx80Sign( a );
3832 shiftCount = aExp - 0x403E;
3833 if ( 0 <= shiftCount ) {
3834 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3835 if ( ( a.high != 0xC03E ) || aSig ) {
3836 float_raise( float_flag_invalid STATUS_VAR);
3837 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3838 return LIT64( 0x7FFFFFFFFFFFFFFF );
3839 }
3840 }
3841 return (sbits64) LIT64( 0x8000000000000000 );
3842 }
3843 else if ( aExp < 0x3FFF ) {
3844 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3845 return 0;
3846 }
3847 z = aSig>>( - shiftCount );
3848 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3849 STATUS(float_exception_flags) |= float_flag_inexact;
3850 }
3851 if ( aSign ) z = - z;
3852 return z;
3853
3854 }
3855
3856 /*----------------------------------------------------------------------------
3857 | Returns the result of converting the extended double-precision floating-
3858 | point value `a' to the single-precision floating-point format. The
3859 | conversion is performed according to the IEC/IEEE Standard for Binary
3860 | Floating-Point Arithmetic.
3861 *----------------------------------------------------------------------------*/
3862
3863 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3864 {
3865 flag aSign;
3866 int32 aExp;
3867 bits64 aSig;
3868
3869 aSig = extractFloatx80Frac( a );
3870 aExp = extractFloatx80Exp( a );
3871 aSign = extractFloatx80Sign( a );
3872 if ( aExp == 0x7FFF ) {
3873 if ( (bits64) ( aSig<<1 ) ) {
3874 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3875 }
3876 return packFloat32( aSign, 0xFF, 0 );
3877 }
3878 shift64RightJamming( aSig, 33, &aSig );
3879 if ( aExp || aSig ) aExp -= 0x3F81;
3880 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3881
3882 }
3883
3884 /*----------------------------------------------------------------------------
3885 | Returns the result of converting the extended double-precision floating-
3886 | point value `a' to the double-precision floating-point format. The
3887 | conversion is performed according to the IEC/IEEE Standard for Binary
3888 | Floating-Point Arithmetic.
3889 *----------------------------------------------------------------------------*/
3890
3891 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3892 {
3893 flag aSign;
3894 int32 aExp;
3895 bits64 aSig, zSig;
3896
3897 aSig = extractFloatx80Frac( a );
3898 aExp = extractFloatx80Exp( a );
3899 aSign = extractFloatx80Sign( a );
3900 if ( aExp == 0x7FFF ) {
3901 if ( (bits64) ( aSig<<1 ) ) {
3902 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3903 }
3904 return packFloat64( aSign, 0x7FF, 0 );
3905 }
3906 shift64RightJamming( aSig, 1, &zSig );
3907 if ( aExp || aSig ) aExp -= 0x3C01;
3908 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3909
3910 }
3911
3912 #ifdef FLOAT128
3913
3914 /*----------------------------------------------------------------------------
3915 | Returns the result of converting the extended double-precision floating-
3916 | point value `a' to the quadruple-precision floating-point format. The
3917 | conversion is performed according to the IEC/IEEE Standard for Binary
3918 | Floating-Point Arithmetic.
3919 *----------------------------------------------------------------------------*/
3920
3921 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3922 {
3923 flag aSign;
3924 int16 aExp;
3925 bits64 aSig, zSig0, zSig1;
3926
3927 aSig = extractFloatx80Frac( a );
3928 aExp = extractFloatx80Exp( a );
3929 aSign = extractFloatx80Sign( a );
3930 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3931 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3932 }
3933 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3934 return packFloat128( aSign, aExp, zSig0, zSig1 );
3935
3936 }
3937
3938 #endif
3939
3940 /*----------------------------------------------------------------------------
3941 | Rounds the extended double-precision floating-point value `a' to an integer,
3942 | and returns the result as an extended quadruple-precision floating-point
3943 | value. The operation is performed according to the IEC/IEEE Standard for
3944 | Binary Floating-Point Arithmetic.
3945 *----------------------------------------------------------------------------*/
3946
3947 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3948 {
3949 flag aSign;
3950 int32 aExp;
3951 bits64 lastBitMask, roundBitsMask;
3952 int8 roundingMode;
3953 floatx80 z;
3954
3955 aExp = extractFloatx80Exp( a );
3956 if ( 0x403E <= aExp ) {
3957 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3958 return propagateFloatx80NaN( a, a STATUS_VAR );
3959 }
3960 return a;
3961 }
3962 if ( aExp < 0x3FFF ) {
3963 if ( ( aExp == 0 )
3964 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3965 return a;
3966 }
3967 STATUS(float_exception_flags) |= float_flag_inexact;
3968 aSign = extractFloatx80Sign( a );
3969 switch ( STATUS(float_rounding_mode) ) {
3970 case float_round_nearest_even:
3971 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3972 ) {
3973 return
3974 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3975 }
3976 break;
3977 case float_round_down:
3978 return
3979 aSign ?
3980 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3981 : packFloatx80( 0, 0, 0 );
3982 case float_round_up:
3983 return
3984 aSign ? packFloatx80( 1, 0, 0 )
3985 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3986 }
3987 return packFloatx80( aSign, 0, 0 );
3988 }
3989 lastBitMask = 1;
3990 lastBitMask <<= 0x403E - aExp;
3991 roundBitsMask = lastBitMask - 1;
3992 z = a;
3993 roundingMode = STATUS(float_rounding_mode);
3994 if ( roundingMode == float_round_nearest_even ) {
3995 z.low += lastBitMask>>1;
3996 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3997 }
3998 else if ( roundingMode != float_round_to_zero ) {
3999 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4000 z.low += roundBitsMask;
4001 }
4002 }
4003 z.low &= ~ roundBitsMask;
4004 if ( z.low == 0 ) {
4005 ++z.high;
4006 z.low = LIT64( 0x8000000000000000 );
4007 }
4008 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4009 return z;
4010
4011 }
4012
4013 /*----------------------------------------------------------------------------
4014 | Returns the result of adding the absolute values of the extended double-
4015 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4016 | negated before being returned. `zSign' is ignored if the result is a NaN.
4017 | The addition is performed according to the IEC/IEEE Standard for Binary
4018 | Floating-Point Arithmetic.
4019 *----------------------------------------------------------------------------*/
4020
4021 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4022 {
4023 int32 aExp, bExp, zExp;
4024 bits64 aSig, bSig, zSig0, zSig1;
4025 int32 expDiff;
4026
4027 aSig = extractFloatx80Frac( a );
4028 aExp = extractFloatx80Exp( a );
4029 bSig = extractFloatx80Frac( b );
4030 bExp = extractFloatx80Exp( b );
4031 expDiff = aExp - bExp;
4032 if ( 0 < expDiff ) {
4033 if ( aExp == 0x7FFF ) {
4034 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4035 return a;
4036 }
4037 if ( bExp == 0 ) --expDiff;
4038 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4039 zExp = aExp;
4040 }
4041 else if ( expDiff < 0 ) {
4042 if ( bExp == 0x7FFF ) {
4043 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4044 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4045 }
4046 if ( aExp == 0 ) ++expDiff;
4047 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4048 zExp = bExp;
4049 }
4050 else {
4051 if ( aExp == 0x7FFF ) {
4052 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4053 return propagateFloatx80NaN( a, b STATUS_VAR );
4054 }
4055 return a;
4056 }
4057 zSig1 = 0;
4058 zSig0 = aSig + bSig;
4059 if ( aExp == 0 ) {
4060 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4061 goto roundAndPack;
4062 }
4063 zExp = aExp;
4064 goto shiftRight1;
4065 }
4066 zSig0 = aSig + bSig;
4067 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4068 shiftRight1:
4069 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4070 zSig0 |= LIT64( 0x8000000000000000 );
4071 ++zExp;
4072 roundAndPack:
4073 return
4074 roundAndPackFloatx80(
4075 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4076
4077 }
4078
4079 /*----------------------------------------------------------------------------
4080 | Returns the result of subtracting the absolute values of the extended
4081 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4082 | difference is negated before being returned. `zSign' is ignored if the
4083 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4084 | Standard for Binary Floating-Point Arithmetic.
4085 *----------------------------------------------------------------------------*/
4086
4087 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4088 {
4089 int32 aExp, bExp, zExp;
4090 bits64 aSig, bSig, zSig0, zSig1;
4091 int32 expDiff;
4092 floatx80 z;
4093
4094 aSig = extractFloatx80Frac( a );
4095 aExp = extractFloatx80Exp( a );
4096 bSig = extractFloatx80Frac( b );
4097 bExp = extractFloatx80Exp( b );
4098 expDiff = aExp - bExp;
4099 if ( 0 < expDiff ) goto aExpBigger;
4100 if ( expDiff < 0 ) goto bExpBigger;
4101 if ( aExp == 0x7FFF ) {
4102 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4103 return propagateFloatx80NaN( a, b STATUS_VAR );
4104 }
4105 float_raise( float_flag_invalid STATUS_VAR);
4106 z.low = floatx80_default_nan_low;
4107 z.high = floatx80_default_nan_high;
4108 return z;
4109 }
4110 if ( aExp == 0 ) {
4111 aExp = 1;
4112 bExp = 1;
4113 }
4114 zSig1 = 0;
4115 if ( bSig < aSig ) goto aBigger;
4116 if ( aSig < bSig ) goto bBigger;
4117 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4118 bExpBigger:
4119 if ( bExp == 0x7FFF ) {
4120 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4121 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4122 }
4123 if ( aExp == 0 ) ++expDiff;
4124 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4125 bBigger:
4126 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4127 zExp = bExp;
4128 zSign ^= 1;
4129 goto normalizeRoundAndPack;
4130 aExpBigger:
4131 if ( aExp == 0x7FFF ) {
4132 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4133 return a;
4134 }
4135 if ( bExp == 0 ) --expDiff;
4136 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4137 aBigger:
4138 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4139 zExp = aExp;
4140 normalizeRoundAndPack:
4141 return
4142 normalizeRoundAndPackFloatx80(
4143 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4144
4145 }
4146
4147 /*----------------------------------------------------------------------------
4148 | Returns the result of adding the extended double-precision floating-point
4149 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4150 | Standard for Binary Floating-Point Arithmetic.
4151 *----------------------------------------------------------------------------*/
4152
4153 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4154 {
4155 flag aSign, bSign;
4156
4157 aSign = extractFloatx80Sign( a );
4158 bSign = extractFloatx80Sign( b );
4159 if ( aSign == bSign ) {
4160 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4161 }
4162 else {
4163 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4164 }
4165
4166 }
4167
4168 /*----------------------------------------------------------------------------
4169 | Returns the result of subtracting the extended double-precision floating-
4170 | point values `a' and `b'. The operation is performed according to the
4171 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4172 *----------------------------------------------------------------------------*/
4173
4174 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4175 {
4176 flag aSign, bSign;
4177
4178 aSign = extractFloatx80Sign( a );
4179 bSign = extractFloatx80Sign( b );
4180 if ( aSign == bSign ) {
4181 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4182 }
4183 else {
4184 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4185 }
4186
4187 }
4188
4189 /*----------------------------------------------------------------------------
4190 | Returns the result of multiplying the extended double-precision floating-
4191 | point values `a' and `b'. The operation is performed according to the
4192 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4193 *----------------------------------------------------------------------------*/
4194
4195 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4196 {
4197 flag aSign, bSign, zSign;
4198 int32 aExp, bExp, zExp;
4199 bits64 aSig, bSig, zSig0, zSig1;
4200 floatx80 z;
4201
4202 aSig = extractFloatx80Frac( a );
4203 aExp = extractFloatx80Exp( a );
4204 aSign = extractFloatx80Sign( a );
4205 bSig = extractFloatx80Frac( b );
4206 bExp = extractFloatx80Exp( b );
4207 bSign = extractFloatx80Sign( b );
4208 zSign = aSign ^ bSign;
4209 if ( aExp == 0x7FFF ) {
4210 if ( (bits64) ( aSig<<1 )
4211 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4212 return propagateFloatx80NaN( a, b STATUS_VAR );
4213 }
4214 if ( ( bExp | bSig ) == 0 ) goto invalid;
4215 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4216 }
4217 if ( bExp == 0x7FFF ) {
4218 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4219 if ( ( aExp | aSig ) == 0 ) {
4220 invalid:
4221 float_raise( float_flag_invalid STATUS_VAR);
4222 z.low = floatx80_default_nan_low;
4223 z.high = floatx80_default_nan_high;
4224 return z;
4225 }
4226 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4227 }
4228 if ( aExp == 0 ) {
4229 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4230 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4231 }
4232 if ( bExp == 0 ) {
4233 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4234 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4235 }
4236 zExp = aExp + bExp - 0x3FFE;
4237 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4238 if ( 0 < (sbits64) zSig0 ) {
4239 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4240 --zExp;
4241 }
4242 return
4243 roundAndPackFloatx80(
4244 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4245
4246 }
4247
4248 /*----------------------------------------------------------------------------
4249 | Returns the result of dividing the extended double-precision floating-point
4250 | value `a' by the corresponding value `b'. The operation is performed
4251 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4252 *----------------------------------------------------------------------------*/
4253
4254 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4255 {
4256 flag aSign, bSign, zSign;
4257 int32 aExp, bExp, zExp;
4258 bits64 aSig, bSig, zSig0, zSig1;
4259 bits64 rem0, rem1, rem2, term0, term1, term2;
4260 floatx80 z;
4261
4262 aSig = extractFloatx80Frac( a );
4263 aExp = extractFloatx80Exp( a );
4264 aSign = extractFloatx80Sign( a );
4265 bSig = extractFloatx80Frac( b );
4266 bExp = extractFloatx80Exp( b );
4267 bSign = extractFloatx80Sign( b );
4268 zSign = aSign ^ bSign;
4269 if ( aExp == 0x7FFF ) {
4270 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4271 if ( bExp == 0x7FFF ) {
4272 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4273 goto invalid;
4274 }
4275 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4276 }
4277 if ( bExp == 0x7FFF ) {
4278 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4279 return packFloatx80( zSign, 0, 0 );
4280 }
4281 if ( bExp == 0 ) {
4282 if ( bSig == 0 ) {
4283 if ( ( aExp | aSig ) == 0 ) {
4284 invalid:
4285 float_raise( float_flag_invalid STATUS_VAR);
4286 z.low = floatx80_default_nan_low;
4287 z.high = floatx80_default_nan_high;
4288 return z;
4289 }
4290 float_raise( float_flag_divbyzero STATUS_VAR);
4291 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4292 }
4293 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4294 }
4295 if ( aExp == 0 ) {
4296 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4297 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4298 }
4299 zExp = aExp - bExp + 0x3FFE;
4300 rem1 = 0;
4301 if ( bSig <= aSig ) {
4302 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4303 ++zExp;
4304 }
4305 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4306 mul64To128( bSig, zSig0, &term0, &term1 );
4307 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4308 while ( (sbits64) rem0 < 0 ) {
4309 --zSig0;
4310 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4311 }
4312 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4313 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4314 mul64To128( bSig, zSig1, &term1, &term2 );
4315 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4316 while ( (sbits64) rem1 < 0 ) {
4317 --zSig1;
4318 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4319 }
4320 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4321 }
4322 return
4323 roundAndPackFloatx80(
4324 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4325
4326 }
4327
4328 /*----------------------------------------------------------------------------
4329 | Returns the remainder of the extended double-precision floating-point value
4330 | `a' with respect to the corresponding value `b'. The operation is performed
4331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4332 *----------------------------------------------------------------------------*/
4333
4334 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4335 {
4336 flag aSign, zSign;
4337 int32 aExp, bExp, expDiff;
4338 bits64 aSig0, aSig1, bSig;
4339 bits64 q, term0, term1, alternateASig0, alternateASig1;
4340 floatx80 z;
4341
4342 aSig0 = extractFloatx80Frac( a );
4343 aExp = extractFloatx80Exp( a );
4344 aSign = extractFloatx80Sign( a );
4345 bSig = extractFloatx80Frac( b );
4346 bExp = extractFloatx80Exp( b );
4347 if ( aExp == 0x7FFF ) {
4348 if ( (bits64) ( aSig0<<1 )
4349 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4350 return propagateFloatx80NaN( a, b STATUS_VAR );
4351 }
4352 goto invalid;
4353 }
4354 if ( bExp == 0x7FFF ) {
4355 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4356 return a;
4357 }
4358 if ( bExp == 0 ) {
4359 if ( bSig == 0 ) {
4360 invalid:
4361 float_raise( float_flag_invalid STATUS_VAR);
4362 z.low = floatx80_default_nan_low;
4363 z.high = floatx80_default_nan_high;
4364 return z;
4365 }
4366 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4367 }
4368 if ( aExp == 0 ) {
4369 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4370 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4371 }
4372 bSig |= LIT64( 0x8000000000000000 );
4373 zSign = aSign;
4374 expDiff = aExp - bExp;
4375 aSig1 = 0;
4376 if ( expDiff < 0 ) {
4377 if ( expDiff < -1 ) return a;
4378 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4379 expDiff = 0;
4380 }
4381 q = ( bSig <= aSig0 );
4382 if ( q ) aSig0 -= bSig;
4383 expDiff -= 64;
4384 while ( 0 < expDiff ) {
4385 q = estimateDiv128To64( aSig0, aSig1, bSig );
4386 q = ( 2 < q ) ? q - 2 : 0;
4387 mul64To128( bSig, q, &term0, &term1 );
4388 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4389 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4390 expDiff -= 62;
4391 }
4392 expDiff += 64;
4393 if ( 0 < expDiff ) {
4394 q = estimateDiv128To64( aSig0, aSig1, bSig );
4395 q = ( 2 < q ) ? q - 2 : 0;
4396 q >>= 64 - expDiff;
4397 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4398 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4399 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4400 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4401 ++q;
4402 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4403 }
4404 }
4405 else {
4406 term1 = 0;
4407 term0 = bSig;
4408 }
4409 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4410 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4411 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4412 && ( q & 1 ) )
4413 ) {
4414 aSig0 = alternateASig0;
4415 aSig1 = alternateASig1;
4416 zSign = ! zSign;
4417 }
4418 return
4419 normalizeRoundAndPackFloatx80(
4420 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4421
4422 }
4423
4424 /*----------------------------------------------------------------------------
4425 | Returns the square root of the extended double-precision floating-point
4426 | value `a'. The operation is performed according to the IEC/IEEE Standard
4427 | for Binary Floating-Point Arithmetic.
4428 *----------------------------------------------------------------------------*/
4429
4430 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4431 {
4432 flag aSign;
4433 int32 aExp, zExp;
4434 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4435 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4436 floatx80 z;
4437
4438 aSig0 = extractFloatx80Frac( a );
4439 aExp = extractFloatx80Exp( a );
4440 aSign = extractFloatx80Sign( a );
4441 if ( aExp == 0x7FFF ) {
4442 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4443 if ( ! aSign ) return a;
4444 goto invalid;
4445 }
4446 if ( aSign ) {
4447 if ( ( aExp | aSig0 ) == 0 ) return a;
4448 invalid:
4449 float_raise( float_flag_invalid STATUS_VAR);
4450 z.low = floatx80_default_nan_low;
4451 z.high = floatx80_default_nan_high;
4452 return z;
4453 }
4454 if ( aExp == 0 ) {
4455 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4456 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4457 }
4458 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4459 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4460 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4461 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4462 doubleZSig0 = zSig0<<1;
4463 mul64To128( zSig0, zSig0, &term0, &term1 );
4464 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4465 while ( (sbits64) rem0 < 0 ) {
4466 --zSig0;
4467 doubleZSig0 -= 2;
4468 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4469 }
4470 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4471 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4472 if ( zSig1 == 0 ) zSig1 = 1;
4473 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4474 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4475 mul64To128( zSig1, zSig1, &term2, &term3 );
4476 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4477 while ( (sbits64) rem1 < 0 ) {
4478 --zSig1;
4479 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4480 term3 |= 1;
4481 term2 |= doubleZSig0;
4482 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4483 }
4484 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4485 }
4486 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4487 zSig0 |= doubleZSig0;
4488 return
4489 roundAndPackFloatx80(
4490 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4491
4492 }
4493
4494 /*----------------------------------------------------------------------------
4495 | Returns 1 if the extended double-precision floating-point value `a' is
4496 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4497 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4498 | Arithmetic.
4499 *----------------------------------------------------------------------------*/
4500
4501 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4502 {
4503
4504 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4505 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4506 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4507 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4508 ) {
4509 if ( floatx80_is_signaling_nan( a )
4510 || floatx80_is_signaling_nan( b ) ) {
4511 float_raise( float_flag_invalid STATUS_VAR);
4512 }
4513 return 0;
4514 }
4515 return
4516 ( a.low == b.low )
4517 && ( ( a.high == b.high )
4518 || ( ( a.low == 0 )
4519 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4520 );
4521
4522 }
4523
4524 /*----------------------------------------------------------------------------
4525 | Returns 1 if the extended double-precision floating-point value `a' is
4526 | less than or equal to the corresponding value `b', and 0 otherwise. The
4527 | comparison is performed according to the IEC/IEEE Standard for Binary
4528 | Floating-Point Arithmetic.
4529 *----------------------------------------------------------------------------*/
4530
4531 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4532 {
4533 flag aSign, bSign;
4534
4535 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4536 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4537 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4538 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4539 ) {
4540 float_raise( float_flag_invalid STATUS_VAR);
4541 return 0;
4542 }
4543 aSign = extractFloatx80Sign( a );
4544 bSign = extractFloatx80Sign( b );
4545 if ( aSign != bSign ) {
4546 return
4547 aSign
4548 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4549 == 0 );
4550 }
4551 return
4552 aSign ? le128( b.high, b.low, a.high, a.low )
4553 : le128( a.high, a.low, b.high, b.low );
4554
4555 }
4556
4557 /*----------------------------------------------------------------------------
4558 | Returns 1 if the extended double-precision floating-point value `a' is
4559 | less than the corresponding value `b', and 0 otherwise. The comparison
4560 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4561 | Arithmetic.
4562 *----------------------------------------------------------------------------*/
4563
4564 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4565 {
4566 flag aSign, bSign;
4567
4568 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4569 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4570 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4571 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4572 ) {
4573 float_raise( float_flag_invalid STATUS_VAR);
4574 return 0;
4575 }
4576 aSign = extractFloatx80Sign( a );
4577 bSign = extractFloatx80Sign( b );
4578 if ( aSign != bSign ) {
4579 return
4580 aSign
4581 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4582 != 0 );
4583 }
4584 return
4585 aSign ? lt128( b.high, b.low, a.high, a.low )
4586 : lt128( a.high, a.low, b.high, b.low );
4587
4588 }
4589
4590 /*----------------------------------------------------------------------------
4591 | Returns 1 if the extended double-precision floating-point value `a' is equal
4592 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4593 | raised if either operand is a NaN. Otherwise, the comparison is performed
4594 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4595 *----------------------------------------------------------------------------*/
4596
4597 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4598 {
4599
4600 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4601 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4602 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4603 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4604 ) {
4605 float_raise( float_flag_invalid STATUS_VAR);
4606 return 0;
4607 }
4608 return
4609 ( a.low == b.low )
4610 && ( ( a.high == b.high )
4611 || ( ( a.low == 0 )
4612 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4613 );
4614
4615 }
4616
4617 /*----------------------------------------------------------------------------
4618 | Returns 1 if the extended double-precision floating-point value `a' is less
4619 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4620 | do not cause an exception. Otherwise, the comparison is performed according
4621 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4622 *----------------------------------------------------------------------------*/
4623
4624 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4625 {
4626 flag aSign, bSign;
4627
4628 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4629 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4630 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4631 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4632 ) {
4633 if ( floatx80_is_signaling_nan( a )
4634 || floatx80_is_signaling_nan( b ) ) {
4635 float_raise( float_flag_invalid STATUS_VAR);
4636 }
4637 return 0;
4638 }
4639 aSign = extractFloatx80Sign( a );
4640 bSign = extractFloatx80Sign( b );
4641 if ( aSign != bSign ) {
4642 return
4643 aSign
4644 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4645 == 0 );
4646 }
4647 return
4648 aSign ? le128( b.high, b.low, a.high, a.low )
4649 : le128( a.high, a.low, b.high, b.low );
4650
4651 }
4652
4653 /*----------------------------------------------------------------------------
4654 | Returns 1 if the extended double-precision floating-point value `a' is less
4655 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4656 | an exception. Otherwise, the comparison is performed according to the
4657 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4658 *----------------------------------------------------------------------------*/
4659
4660 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4661 {
4662 flag aSign, bSign;
4663
4664 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4665 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4666 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4667 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4668 ) {
4669 if ( floatx80_is_signaling_nan( a )
4670 || floatx80_is_signaling_nan( b ) ) {
4671 float_raise( float_flag_invalid STATUS_VAR);
4672 }
4673 return 0;
4674 }
4675 aSign = extractFloatx80Sign( a );
4676 bSign = extractFloatx80Sign( b );
4677 if ( aSign != bSign ) {
4678 return
4679 aSign
4680 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4681 != 0 );
4682 }
4683 return
4684 aSign ? lt128( b.high, b.low, a.high, a.low )
4685 : lt128( a.high, a.low, b.high, b.low );
4686
4687 }
4688
4689 #endif
4690
4691 #ifdef FLOAT128
4692
4693 /*----------------------------------------------------------------------------
4694 | Returns the result of converting the quadruple-precision floating-point
4695 | value `a' to the 32-bit two's complement integer format. The conversion
4696 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4697 | Arithmetic---which means in particular that the conversion is rounded
4698 | according to the current rounding mode. If `a' is a NaN, the largest
4699 | positive integer is returned. Otherwise, if the conversion overflows, the
4700 | largest integer with the same sign as `a' is returned.
4701 *----------------------------------------------------------------------------*/
4702
4703 int32 float128_to_int32( float128 a STATUS_PARAM )
4704 {
4705 flag aSign;
4706 int32 aExp, shiftCount;
4707 bits64 aSig0, aSig1;
4708
4709 aSig1 = extractFloat128Frac1( a );
4710 aSig0 = extractFloat128Frac0( a );
4711 aExp = extractFloat128Exp( a );
4712 aSign = extractFloat128Sign( a );
4713 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4714 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4715 aSig0 |= ( aSig1 != 0 );
4716 shiftCount = 0x4028 - aExp;
4717 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4718 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4719
4720 }
4721
4722 /*----------------------------------------------------------------------------
4723 | Returns the result of converting the quadruple-precision floating-point
4724 | value `a' to the 32-bit two's complement integer format. The conversion
4725 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4726 | Arithmetic, except that the conversion is always rounded toward zero. If
4727 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4728 | conversion overflows, the largest integer with the same sign as `a' is
4729 | returned.
4730 *----------------------------------------------------------------------------*/
4731
4732 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4733 {
4734 flag aSign;
4735 int32 aExp, shiftCount;
4736 bits64 aSig0, aSig1, savedASig;
4737 int32 z;
4738
4739 aSig1 = extractFloat128Frac1( a );
4740 aSig0 = extractFloat128Frac0( a );
4741 aExp = extractFloat128Exp( a );
4742 aSign = extractFloat128Sign( a );
4743 aSig0 |= ( aSig1 != 0 );
4744 if ( 0x401E < aExp ) {
4745 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4746 goto invalid;
4747 }
4748 else if ( aExp < 0x3FFF ) {
4749 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4750 return 0;
4751 }
4752 aSig0 |= LIT64( 0x0001000000000000 );
4753 shiftCount = 0x402F - aExp;
4754 savedASig = aSig0;
4755 aSig0 >>= shiftCount;
4756 z = aSig0;
4757 if ( aSign ) z = - z;
4758 if ( ( z < 0 ) ^ aSign ) {
4759 invalid:
4760 float_raise( float_flag_invalid STATUS_VAR);
4761 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4762 }
4763 if ( ( aSig0<<shiftCount ) != savedASig ) {
4764 STATUS(float_exception_flags) |= float_flag_inexact;
4765 }
4766 return z;
4767
4768 }
4769
4770 /*----------------------------------------------------------------------------
4771 | Returns the result of converting the quadruple-precision floating-point
4772 | value `a' to the 64-bit two's complement integer format. The conversion
4773 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4774 | Arithmetic---which means in particular that the conversion is rounded
4775 | according to the current rounding mode. If `a' is a NaN, the largest
4776 | positive integer is returned. Otherwise, if the conversion overflows, the
4777 | largest integer with the same sign as `a' is returned.
4778 *----------------------------------------------------------------------------*/
4779
4780 int64 float128_to_int64( float128 a STATUS_PARAM )
4781 {
4782 flag aSign;
4783 int32 aExp, shiftCount;
4784 bits64 aSig0, aSig1;
4785
4786 aSig1 = extractFloat128Frac1( a );
4787 aSig0 = extractFloat128Frac0( a );
4788 aExp = extractFloat128Exp( a );
4789 aSign = extractFloat128Sign( a );
4790 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4791 shiftCount = 0x402F - aExp;
4792 if ( shiftCount <= 0 ) {
4793 if ( 0x403E < aExp ) {
4794 float_raise( float_flag_invalid STATUS_VAR);
4795 if ( ! aSign
4796 || ( ( aExp == 0x7FFF )
4797 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4798 )
4799 ) {
4800 return LIT64( 0x7FFFFFFFFFFFFFFF );
4801 }
4802 return (sbits64) LIT64( 0x8000000000000000 );
4803 }
4804 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4805 }
4806 else {
4807 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4808 }
4809 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4810
4811 }
4812
4813 /*----------------------------------------------------------------------------
4814 | Returns the result of converting the quadruple-precision floating-point
4815 | value `a' to the 64-bit two's complement integer format. The conversion
4816 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4817 | Arithmetic, except that the conversion is always rounded toward zero.
4818 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4819 | the conversion overflows, the largest integer with the same sign as `a' is
4820 | returned.
4821 *----------------------------------------------------------------------------*/
4822
4823 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4824 {
4825 flag aSign;
4826 int32 aExp, shiftCount;
4827 bits64 aSig0, aSig1;
4828 int64 z;
4829
4830 aSig1 = extractFloat128Frac1( a );
4831 aSig0 = extractFloat128Frac0( a );
4832 aExp = extractFloat128Exp( a );
4833 aSign = extractFloat128Sign( a );
4834 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4835 shiftCount = aExp - 0x402F;
4836 if ( 0 < shiftCount ) {
4837 if ( 0x403E <= aExp ) {
4838 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4839 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4840 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4841 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4842 }
4843 else {
4844 float_raise( float_flag_invalid STATUS_VAR);
4845 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4846 return LIT64( 0x7FFFFFFFFFFFFFFF );
4847 }
4848 }
4849 return (sbits64) LIT64( 0x8000000000000000 );
4850 }
4851 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4852 if ( (bits64) ( aSig1<<shiftCount ) ) {
4853 STATUS(float_exception_flags) |= float_flag_inexact;
4854 }
4855 }
4856 else {
4857 if ( aExp < 0x3FFF ) {
4858 if ( aExp | aSig0 | aSig1 ) {
4859 STATUS(float_exception_flags) |= float_flag_inexact;
4860 }
4861 return 0;
4862 }
4863 z = aSig0>>( - shiftCount );
4864 if ( aSig1
4865 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4866 STATUS(float_exception_flags) |= float_flag_inexact;
4867 }
4868 }
4869 if ( aSign ) z = - z;
4870 return z;
4871
4872 }
4873
4874 /*----------------------------------------------------------------------------
4875 | Returns the result of converting the quadruple-precision floating-point
4876 | value `a' to the single-precision floating-point format. The conversion
4877 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4878 | Arithmetic.
4879 *----------------------------------------------------------------------------*/
4880
4881 float32 float128_to_float32( float128 a STATUS_PARAM )
4882 {
4883 flag aSign;
4884 int32 aExp;
4885 bits64 aSig0, aSig1;
4886 bits32 zSig;
4887
4888 aSig1 = extractFloat128Frac1( a );
4889 aSig0 = extractFloat128Frac0( a );
4890 aExp = extractFloat128Exp( a );
4891 aSign = extractFloat128Sign( a );
4892 if ( aExp == 0x7FFF ) {
4893 if ( aSig0 | aSig1 ) {
4894 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4895 }
4896 return packFloat32( aSign, 0xFF, 0 );
4897 }
4898 aSig0 |= ( aSig1 != 0 );
4899 shift64RightJamming( aSig0, 18, &aSig0 );
4900 zSig = aSig0;
4901 if ( aExp || zSig ) {
4902 zSig |= 0x40000000;
4903 aExp -= 0x3F81;
4904 }
4905 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4906
4907 }
4908
4909 /*----------------------------------------------------------------------------
4910 | Returns the result of converting the quadruple-precision floating-point
4911 | value `a' to the double-precision floating-point format. The conversion
4912 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4913 | Arithmetic.
4914 *----------------------------------------------------------------------------*/
4915
4916 float64 float128_to_float64( float128 a STATUS_PARAM )
4917 {
4918 flag aSign;
4919 int32 aExp;
4920 bits64 aSig0, aSig1;
4921
4922 aSig1 = extractFloat128Frac1( a );
4923 aSig0 = extractFloat128Frac0( a );
4924 aExp = extractFloat128Exp( a );
4925 aSign = extractFloat128Sign( a );
4926 if ( aExp == 0x7FFF ) {
4927 if ( aSig0 | aSig1 ) {
4928 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4929 }
4930 return packFloat64( aSign, 0x7FF, 0 );
4931 }
4932 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4933 aSig0 |= ( aSig1 != 0 );
4934 if ( aExp || aSig0 ) {
4935 aSig0 |= LIT64( 0x4000000000000000 );
4936 aExp -= 0x3C01;
4937 }
4938 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4939
4940 }
4941
4942 #ifdef FLOATX80
4943
4944 /*----------------------------------------------------------------------------
4945 | Returns the result of converting the quadruple-precision floating-point
4946 | value `a' to the extended double-precision floating-point format. The
4947 | conversion is performed according to the IEC/IEEE Standard for Binary
4948 | Floating-Point Arithmetic.
4949 *----------------------------------------------------------------------------*/
4950
4951 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4952 {
4953 flag aSign;
4954 int32 aExp;
4955 bits64 aSig0, aSig1;
4956
4957 aSig1 = extractFloat128Frac1( a );
4958 aSig0 = extractFloat128Frac0( a );
4959 aExp = extractFloat128Exp( a );
4960 aSign = extractFloat128Sign( a );
4961 if ( aExp == 0x7FFF ) {
4962 if ( aSig0 | aSig1 ) {
4963 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4964 }
4965 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4966 }
4967 if ( aExp == 0 ) {
4968 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4969 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4970 }
4971 else {
4972 aSig0 |= LIT64( 0x0001000000000000 );
4973 }
4974 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4975 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4976
4977 }
4978
4979 #endif
4980
4981 /*----------------------------------------------------------------------------
4982 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4983 | returns the result as a quadruple-precision floating-point value. The
4984 | operation is performed according to the IEC/IEEE Standard for Binary
4985 | Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4987
4988 float128 float128_round_to_int( float128 a STATUS_PARAM )
4989 {
4990 flag aSign;
4991 int32 aExp;
4992 bits64 lastBitMask, roundBitsMask;
4993 int8 roundingMode;
4994 float128 z;
4995
4996 aExp = extractFloat128Exp( a );
4997 if ( 0x402F <= aExp ) {
4998 if ( 0x406F <= aExp ) {
4999 if ( ( aExp == 0x7FFF )
5000 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5001 ) {
5002 return propagateFloat128NaN( a, a STATUS_VAR );
5003 }
5004 return a;
5005 }
5006 lastBitMask = 1;
5007 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5008 roundBitsMask = lastBitMask - 1;
5009 z = a;
5010 roundingMode = STATUS(float_rounding_mode);
5011 if ( roundingMode == float_round_nearest_even ) {
5012 if ( lastBitMask ) {
5013 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5014 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5015 }
5016 else {
5017 if ( (sbits64) z.low < 0 ) {
5018 ++z.high;
5019 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
5020 }
5021 }
5022 }
5023 else if ( roundingMode != float_round_to_zero ) {
5024 if ( extractFloat128Sign( z )
5025 ^ ( roundingMode == float_round_up ) ) {
5026 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5027 }
5028 }
5029 z.low &= ~ roundBitsMask;
5030 }
5031 else {
5032 if ( aExp < 0x3FFF ) {
5033 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5034 STATUS(float_exception_flags) |= float_flag_inexact;
5035 aSign = extractFloat128Sign( a );
5036 switch ( STATUS(float_rounding_mode) ) {
5037 case float_round_nearest_even:
5038 if ( ( aExp == 0x3FFE )
5039 && ( extractFloat128Frac0( a )
5040 | extractFloat128Frac1( a ) )
5041 ) {
5042 return packFloat128( aSign, 0x3FFF, 0, 0 );
5043 }
5044 break;
5045 case float_round_down:
5046 return
5047 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5048 : packFloat128( 0, 0, 0, 0 );
5049 case float_round_up:
5050 return
5051 aSign ? packFloat128( 1, 0, 0, 0 )
5052 : packFloat128( 0, 0x3FFF, 0, 0 );
5053 }
5054 return packFloat128( aSign, 0, 0, 0 );
5055 }
5056 lastBitMask = 1;
5057 lastBitMask <<= 0x402F - aExp;
5058 roundBitsMask = lastBitMask - 1;
5059 z.low = 0;
5060 z.high = a.high;
5061 roundingMode = STATUS(float_rounding_mode);
5062 if ( roundingMode == float_round_nearest_even ) {
5063 z.high += lastBitMask>>1;
5064 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5065 z.high &= ~ lastBitMask;
5066 }
5067 }
5068 else if ( roundingMode != float_round_to_zero ) {
5069 if ( extractFloat128Sign( z )
5070 ^ ( roundingMode == float_round_up ) ) {
5071 z.high |= ( a.low != 0 );
5072 z.high += roundBitsMask;
5073 }
5074 }
5075 z.high &= ~ roundBitsMask;
5076 }
5077 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5078 STATUS(float_exception_flags) |= float_flag_inexact;
5079 }
5080 return z;
5081
5082 }
5083
5084 /*----------------------------------------------------------------------------
5085 | Returns the result of adding the absolute values of the quadruple-precision
5086 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5087 | before being returned. `zSign' is ignored if the result is a NaN.
5088 | The addition is performed according to the IEC/IEEE Standard for Binary
5089 | Floating-Point Arithmetic.
5090 *----------------------------------------------------------------------------*/
5091
5092 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5093 {
5094 int32 aExp, bExp, zExp;
5095 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5096 int32 expDiff;
5097
5098 aSig1 = extractFloat128Frac1( a );
5099 aSig0 = extractFloat128Frac0( a );
5100 aExp = extractFloat128Exp( a );
5101 bSig1 = extractFloat128Frac1( b );
5102 bSig0 = extractFloat128Frac0( b );
5103 bExp = extractFloat128Exp( b );
5104 expDiff = aExp - bExp;
5105 if ( 0 < expDiff ) {
5106 if ( aExp == 0x7FFF ) {
5107 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5108 return a;
5109 }
5110 if ( bExp == 0 ) {
5111 --expDiff;
5112 }
5113 else {
5114 bSig0 |= LIT64( 0x0001000000000000 );
5115 }
5116 shift128ExtraRightJamming(
5117 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5118 zExp = aExp;
5119 }
5120 else if ( expDiff < 0 ) {
5121 if ( bExp == 0x7FFF ) {
5122 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5123 return packFloat128( zSign, 0x7FFF, 0, 0 );
5124 }
5125 if ( aExp == 0 ) {
5126 ++expDiff;
5127 }
5128 else {
5129 aSig0 |= LIT64( 0x0001000000000000 );
5130 }
5131 shift128ExtraRightJamming(
5132 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5133 zExp = bExp;
5134 }
5135 else {
5136 if ( aExp == 0x7FFF ) {
5137 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5138 return propagateFloat128NaN( a, b STATUS_VAR );
5139 }
5140 return a;
5141 }
5142 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5143 if ( aExp == 0 ) {
5144 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5145 return packFloat128( zSign, 0, zSig0, zSig1 );
5146 }
5147 zSig2 = 0;
5148 zSig0 |= LIT64( 0x0002000000000000 );
5149 zExp = aExp;
5150 goto shiftRight1;
5151 }
5152 aSig0 |= LIT64( 0x0001000000000000 );
5153 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5154 --zExp;
5155 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5156 ++zExp;
5157 shiftRight1:
5158 shift128ExtraRightJamming(
5159 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5160 roundAndPack:
5161 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5162
5163 }
5164
5165 /*----------------------------------------------------------------------------
5166 | Returns the result of subtracting the absolute values of the quadruple-
5167 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5168 | difference is negated before being returned. `zSign' is ignored if the
5169 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5170 | Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5172
5173 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5174 {
5175 int32 aExp, bExp, zExp;
5176 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5177 int32 expDiff;
5178 float128 z;
5179
5180 aSig1 = extractFloat128Frac1( a );
5181 aSig0 = extractFloat128Frac0( a );
5182 aExp = extractFloat128Exp( a );
5183 bSig1 = extractFloat128Frac1( b );
5184 bSig0 = extractFloat128Frac0( b );
5185 bExp = extractFloat128Exp( b );
5186 expDiff = aExp - bExp;
5187 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5188 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5189 if ( 0 < expDiff ) goto aExpBigger;
5190 if ( expDiff < 0 ) goto bExpBigger;
5191 if ( aExp == 0x7FFF ) {
5192 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5193 return propagateFloat128NaN( a, b STATUS_VAR );
5194 }
5195 float_raise( float_flag_invalid STATUS_VAR);
5196 z.low = float128_default_nan_low;
5197 z.high = float128_default_nan_high;
5198 return z;
5199 }
5200 if ( aExp == 0 ) {
5201 aExp = 1;
5202 bExp = 1;
5203 }
5204 if ( bSig0 < aSig0 ) goto aBigger;
5205 if ( aSig0 < bSig0 ) goto bBigger;
5206 if ( bSig1 < aSig1 ) goto aBigger;
5207 if ( aSig1 < bSig1 ) goto bBigger;
5208 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5209 bExpBigger:
5210 if ( bExp == 0x7FFF ) {
5211 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5212 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5213 }
5214 if ( aExp == 0 ) {
5215 ++expDiff;
5216 }
5217 else {
5218 aSig0 |= LIT64( 0x4000000000000000 );
5219 }
5220 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5221 bSig0 |= LIT64( 0x4000000000000000 );
5222 bBigger:
5223 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5224 zExp = bExp;
5225 zSign ^= 1;
5226 goto normalizeRoundAndPack;
5227 aExpBigger:
5228 if ( aExp == 0x7FFF ) {
5229 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5230 return a;
5231 }
5232 if ( bExp == 0 ) {
5233 --expDiff;
5234 }
5235 else {
5236 bSig0 |= LIT64( 0x4000000000000000 );
5237 }
5238 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5239 aSig0 |= LIT64( 0x4000000000000000 );
5240 aBigger:
5241 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5242 zExp = aExp;
5243 normalizeRoundAndPack:
5244 --zExp;
5245 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5246
5247 }
5248
5249 /*----------------------------------------------------------------------------
5250 | Returns the result of adding the quadruple-precision floating-point values
5251 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5252 | for Binary Floating-Point Arithmetic.
5253 *----------------------------------------------------------------------------*/
5254
5255 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5256 {
5257 flag aSign, bSign;
5258
5259 aSign = extractFloat128Sign( a );
5260 bSign = extractFloat128Sign( b );
5261 if ( aSign == bSign ) {
5262 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5263 }
5264 else {
5265 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5266 }
5267
5268 }
5269
5270 /*----------------------------------------------------------------------------
5271 | Returns the result of subtracting the quadruple-precision floating-point
5272 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5273 | Standard for Binary Floating-Point Arithmetic.
5274 *----------------------------------------------------------------------------*/
5275
5276 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5277 {
5278 flag aSign, bSign;
5279
5280 aSign = extractFloat128Sign( a );
5281 bSign = extractFloat128Sign( b );
5282 if ( aSign == bSign ) {
5283 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5284 }
5285 else {
5286 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5287 }
5288
5289 }
5290
5291 /*----------------------------------------------------------------------------
5292 | Returns the result of multiplying the quadruple-precision floating-point
5293 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5294 | Standard for Binary Floating-Point Arithmetic.
5295 *----------------------------------------------------------------------------*/
5296
5297 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5298 {
5299 flag aSign, bSign, zSign;
5300 int32 aExp, bExp, zExp;
5301 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5302 float128 z;
5303
5304 aSig1 = extractFloat128Frac1( a );
5305 aSig0 = extractFloat128Frac0( a );
5306 aExp = extractFloat128Exp( a );
5307 aSign = extractFloat128Sign( a );
5308 bSig1 = extractFloat128Frac1( b );
5309 bSig0 = extractFloat128Frac0( b );
5310 bExp = extractFloat128Exp( b );
5311 bSign = extractFloat128Sign( b );
5312 zSign = aSign ^ bSign;
5313 if ( aExp == 0x7FFF ) {
5314 if ( ( aSig0 | aSig1 )
5315 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5316 return propagateFloat128NaN( a, b STATUS_VAR );
5317 }
5318 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5319 return packFloat128( zSign, 0x7FFF, 0, 0 );
5320 }
5321 if ( bExp == 0x7FFF ) {
5322 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5323 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5324 invalid:
5325 float_raise( float_flag_invalid STATUS_VAR);
5326 z.low = float128_default_nan_low;
5327 z.high = float128_default_nan_high;
5328 return z;
5329 }
5330 return packFloat128( zSign, 0x7FFF, 0, 0 );
5331 }
5332 if ( aExp == 0 ) {
5333 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5334 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5335 }
5336 if ( bExp == 0 ) {
5337 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5338 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5339 }
5340 zExp = aExp + bExp - 0x4000;
5341 aSig0 |= LIT64( 0x0001000000000000 );
5342 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5343 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5344 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5345 zSig2 |= ( zSig3 != 0 );
5346 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5347 shift128ExtraRightJamming(
5348 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5349 ++zExp;
5350 }
5351 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5352
5353 }
5354
5355 /*----------------------------------------------------------------------------
5356 | Returns the result of dividing the quadruple-precision floating-point value
5357 | `a' by the corresponding value `b'. The operation is performed according to
5358 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5359 *----------------------------------------------------------------------------*/
5360
5361 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5362 {
5363 flag aSign, bSign, zSign;
5364 int32 aExp, bExp, zExp;
5365 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5366 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5367 float128 z;
5368
5369 aSig1 = extractFloat128Frac1( a );
5370 aSig0 = extractFloat128Frac0( a );
5371 aExp = extractFloat128Exp( a );
5372 aSign = extractFloat128Sign( a );
5373 bSig1 = extractFloat128Frac1( b );
5374 bSig0 = extractFloat128Frac0( b );
5375 bExp = extractFloat128Exp( b );
5376 bSign = extractFloat128Sign( b );
5377 zSign = aSign ^ bSign;
5378 if ( aExp == 0x7FFF ) {
5379 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5380 if ( bExp == 0x7FFF ) {
5381 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5382 goto invalid;
5383 }
5384 return packFloat128( zSign, 0x7FFF, 0, 0 );
5385 }
5386 if ( bExp == 0x7FFF ) {
5387 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5388 return packFloat128( zSign, 0, 0, 0 );
5389 }
5390 if ( bExp == 0 ) {
5391 if ( ( bSig0 | bSig1 ) == 0 ) {
5392 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5393 invalid:
5394 float_raise( float_flag_invalid STATUS_VAR);
5395 z.low = float128_default_nan_low;
5396 z.high = float128_default_nan_high;
5397 return z;
5398 }
5399 float_raise( float_flag_divbyzero STATUS_VAR);
5400 return packFloat128( zSign, 0x7FFF, 0, 0 );
5401 }
5402 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5403 }
5404 if ( aExp == 0 ) {
5405 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5406 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5407 }
5408 zExp = aExp - bExp + 0x3FFD;
5409 shortShift128Left(
5410 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5411 shortShift128Left(
5412 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5413 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5414 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5415 ++zExp;
5416 }
5417 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5418 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5419 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5420 while ( (sbits64) rem0 < 0 ) {
5421 --zSig0;
5422 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5423 }
5424 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5425 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5426 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5427 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5428 while ( (sbits64) rem1 < 0 ) {
5429 --zSig1;
5430 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5431 }
5432 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5433 }
5434 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5435 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5436
5437 }
5438
5439 /*----------------------------------------------------------------------------
5440 | Returns the remainder of the quadruple-precision floating-point value `a'
5441 | with respect to the corresponding value `b'. The operation is performed
5442 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5443 *----------------------------------------------------------------------------*/
5444
5445 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5446 {
5447 flag aSign, zSign;
5448 int32 aExp, bExp, expDiff;
5449 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5450 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5451 sbits64 sigMean0;
5452 float128 z;
5453
5454 aSig1 = extractFloat128Frac1( a );
5455 aSig0 = extractFloat128Frac0( a );
5456 aExp = extractFloat128Exp( a );
5457 aSign = extractFloat128Sign( a );
5458 bSig1 = extractFloat128Frac1( b );
5459 bSig0 = extractFloat128Frac0( b );
5460 bExp = extractFloat128Exp( b );
5461 if ( aExp == 0x7FFF ) {
5462 if ( ( aSig0 | aSig1 )
5463 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5464 return propagateFloat128NaN( a, b STATUS_VAR );
5465 }
5466 goto invalid;
5467 }
5468 if ( bExp == 0x7FFF ) {
5469 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5470 return a;
5471 }
5472 if ( bExp == 0 ) {
5473 if ( ( bSig0 | bSig1 ) == 0 ) {
5474 invalid:
5475 float_raise( float_flag_invalid STATUS_VAR);
5476 z.low = float128_default_nan_low;
5477 z.high = float128_default_nan_high;
5478 return z;
5479 }
5480 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5481 }
5482 if ( aExp == 0 ) {
5483 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5484 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5485 }
5486 expDiff = aExp - bExp;
5487 if ( expDiff < -1 ) return a;
5488 shortShift128Left(
5489 aSig0 | LIT64( 0x0001000000000000 ),
5490 aSig1,
5491 15 - ( expDiff < 0 ),
5492 &aSig0,
5493 &aSig1
5494 );
5495 shortShift128Left(
5496 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5497 q = le128( bSig0, bSig1, aSig0, aSig1 );
5498 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5499 expDiff -= 64;
5500 while ( 0 < expDiff ) {
5501 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5502 q = ( 4 < q ) ? q - 4 : 0;
5503 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5504 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5505 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5506 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5507 expDiff -= 61;
5508 }
5509 if ( -64 < expDiff ) {
5510 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5511 q = ( 4 < q ) ? q - 4 : 0;
5512 q >>= - expDiff;
5513 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5514 expDiff += 52;
5515 if ( expDiff < 0 ) {
5516 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5517 }
5518 else {
5519 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5520 }
5521 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5522 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5523 }
5524 else {
5525 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5526 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5527 }
5528 do {
5529 alternateASig0 = aSig0;
5530 alternateASig1 = aSig1;
5531 ++q;
5532 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5533 } while ( 0 <= (sbits64) aSig0 );
5534 add128(
5535 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5536 if ( ( sigMean0 < 0 )
5537 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5538 aSig0 = alternateASig0;
5539 aSig1 = alternateASig1;
5540 }
5541 zSign = ( (sbits64) aSig0 < 0 );
5542 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5543 return
5544 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5545
5546 }
5547
5548 /*----------------------------------------------------------------------------
5549 | Returns the square root of the quadruple-precision floating-point value `a'.
5550 | The operation is performed according to the IEC/IEEE Standard for Binary
5551 | Floating-Point Arithmetic.
5552 *----------------------------------------------------------------------------*/
5553
5554 float128 float128_sqrt( float128 a STATUS_PARAM )
5555 {
5556 flag aSign;
5557 int32 aExp, zExp;
5558 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5559 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5560 float128 z;
5561
5562 aSig1 = extractFloat128Frac1( a );
5563 aSig0 = extractFloat128Frac0( a );
5564 aExp = extractFloat128Exp( a );
5565 aSign = extractFloat128Sign( a );
5566 if ( aExp == 0x7FFF ) {
5567 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5568 if ( ! aSign ) return a;
5569 goto invalid;
5570 }
5571 if ( aSign ) {
5572 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5573 invalid:
5574 float_raise( float_flag_invalid STATUS_VAR);
5575 z.low = float128_default_nan_low;
5576 z.high = float128_default_nan_high;
5577 return z;
5578 }
5579 if ( aExp == 0 ) {
5580 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5581 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5582 }
5583 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5584 aSig0 |= LIT64( 0x0001000000000000 );
5585 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5586 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5587 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5588 doubleZSig0 = zSig0<<1;
5589 mul64To128( zSig0, zSig0, &term0, &term1 );
5590 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5591 while ( (sbits64) rem0 < 0 ) {
5592 --zSig0;
5593 doubleZSig0 -= 2;
5594 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5595 }
5596 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5597 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5598 if ( zSig1 == 0 ) zSig1 = 1;
5599 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5600 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5601 mul64To128( zSig1, zSig1, &term2, &term3 );
5602 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5603 while ( (sbits64) rem1 < 0 ) {
5604 --zSig1;
5605 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5606 term3 |= 1;
5607 term2 |= doubleZSig0;
5608 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5609 }
5610 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5611 }
5612 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5613 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5614
5615 }
5616
5617 /*----------------------------------------------------------------------------
5618 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5619 | the corresponding value `b', and 0 otherwise. The comparison is performed
5620 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5621 *----------------------------------------------------------------------------*/
5622
5623 int float128_eq( float128 a, float128 b STATUS_PARAM )
5624 {
5625
5626 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5627 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5628 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5629 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5630 ) {
5631 if ( float128_is_signaling_nan( a )
5632 || float128_is_signaling_nan( b ) ) {
5633 float_raise( float_flag_invalid STATUS_VAR);
5634 }
5635 return 0;
5636 }
5637 return
5638 ( a.low == b.low )
5639 && ( ( a.high == b.high )
5640 || ( ( a.low == 0 )
5641 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5642 );
5643
5644 }
5645
5646 /*----------------------------------------------------------------------------
5647 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5648 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5649 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5650 | Arithmetic.
5651 *----------------------------------------------------------------------------*/
5652
5653 int float128_le( float128 a, float128 b STATUS_PARAM )
5654 {
5655 flag aSign, bSign;
5656
5657 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5658 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5659 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5660 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5661 ) {
5662 float_raise( float_flag_invalid STATUS_VAR);
5663 return 0;
5664 }
5665 aSign = extractFloat128Sign( a );
5666 bSign = extractFloat128Sign( b );
5667 if ( aSign != bSign ) {
5668 return
5669 aSign
5670 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5671 == 0 );
5672 }
5673 return
5674 aSign ? le128( b.high, b.low, a.high, a.low )
5675 : le128( a.high, a.low, b.high, b.low );
5676
5677 }
5678
5679 /*----------------------------------------------------------------------------
5680 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5681 | the corresponding value `b', and 0 otherwise. The comparison is performed
5682 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5683 *----------------------------------------------------------------------------*/
5684
5685 int float128_lt( float128 a, float128 b STATUS_PARAM )
5686 {
5687 flag aSign, bSign;
5688
5689 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5690 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5691 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5692 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5693 ) {
5694 float_raise( float_flag_invalid STATUS_VAR);
5695 return 0;
5696 }
5697 aSign = extractFloat128Sign( a );
5698 bSign = extractFloat128Sign( b );
5699 if ( aSign != bSign ) {
5700 return
5701 aSign
5702 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5703 != 0 );
5704 }
5705 return
5706 aSign ? lt128( b.high, b.low, a.high, a.low )
5707 : lt128( a.high, a.low, b.high, b.low );
5708
5709 }
5710
5711 /*----------------------------------------------------------------------------
5712 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5713 | the corresponding value `b', and 0 otherwise. The invalid exception is
5714 | raised if either operand is a NaN. Otherwise, the comparison is performed
5715 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5716 *----------------------------------------------------------------------------*/
5717
5718 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5719 {
5720
5721 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5722 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5723 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5724 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5725 ) {
5726 float_raise( float_flag_invalid STATUS_VAR);
5727 return 0;
5728 }
5729 return
5730 ( a.low == b.low )
5731 && ( ( a.high == b.high )
5732 || ( ( a.low == 0 )
5733 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5734 );
5735
5736 }
5737
5738 /*----------------------------------------------------------------------------
5739 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5740 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5741 | cause an exception. Otherwise, the comparison is performed according to the
5742 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5743 *----------------------------------------------------------------------------*/
5744
5745 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5746 {
5747 flag aSign, bSign;
5748
5749 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5750 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5751 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5752 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5753 ) {
5754 if ( float128_is_signaling_nan( a )
5755 || float128_is_signaling_nan( b ) ) {
5756 float_raise( float_flag_invalid STATUS_VAR);
5757 }
5758 return 0;
5759 }
5760 aSign = extractFloat128Sign( a );
5761 bSign = extractFloat128Sign( b );
5762 if ( aSign != bSign ) {
5763 return
5764 aSign
5765 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5766 == 0 );
5767 }
5768 return
5769 aSign ? le128( b.high, b.low, a.high, a.low )
5770 : le128( a.high, a.low, b.high, b.low );
5771
5772 }
5773
5774 /*----------------------------------------------------------------------------
5775 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5776 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5777 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5778 | Standard for Binary Floating-Point Arithmetic.
5779 *----------------------------------------------------------------------------*/
5780
5781 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5782 {
5783 flag aSign, bSign;
5784
5785 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5786 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5787 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5788 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5789 ) {
5790 if ( float128_is_signaling_nan( a )
5791 || float128_is_signaling_nan( b ) ) {
5792 float_raise( float_flag_invalid STATUS_VAR);
5793 }
5794 return 0;
5795 }
5796 aSign = extractFloat128Sign( a );
5797 bSign = extractFloat128Sign( b );
5798 if ( aSign != bSign ) {
5799 return
5800 aSign
5801 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5802 != 0 );
5803 }
5804 return
5805 aSign ? lt128( b.high, b.low, a.high, a.low )
5806 : lt128( a.high, a.low, b.high, b.low );
5807
5808 }
5809
5810 #endif
5811
5812 /* misc functions */
5813 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5814 {
5815 return int64_to_float32(a STATUS_VAR);
5816 }
5817
5818 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5819 {
5820 return int64_to_float64(a STATUS_VAR);
5821 }
5822
5823 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5824 {
5825 int64_t v;
5826 unsigned int res;
5827
5828 v = float32_to_int64(a STATUS_VAR);
5829 if (v < 0) {
5830 res = 0;
5831 float_raise( float_flag_invalid STATUS_VAR);
5832 } else if (v > 0xffffffff) {
5833 res = 0xffffffff;
5834 float_raise( float_flag_invalid STATUS_VAR);
5835 } else {
5836 res = v;
5837 }
5838 return res;
5839 }
5840
5841 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5842 {
5843 int64_t v;
5844 unsigned int res;
5845
5846 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5847 if (v < 0) {
5848 res = 0;
5849 float_raise( float_flag_invalid STATUS_VAR);
5850 } else if (v > 0xffffffff) {
5851 res = 0xffffffff;
5852 float_raise( float_flag_invalid STATUS_VAR);
5853 } else {
5854 res = v;
5855 }
5856 return res;
5857 }
5858
5859 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5860 {
5861 int64_t v;
5862 unsigned int res;
5863
5864 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5865 if (v < 0) {
5866 res = 0;
5867 float_raise( float_flag_invalid STATUS_VAR);
5868 } else if (v > 0xffff) {
5869 res = 0xffff;
5870 float_raise( float_flag_invalid STATUS_VAR);
5871 } else {
5872 res = v;
5873 }
5874 return res;
5875 }
5876
5877 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5878 {
5879 int64_t v;
5880 unsigned int res;
5881
5882 v = float64_to_int64(a STATUS_VAR);
5883 if (v < 0) {
5884 res = 0;
5885 float_raise( float_flag_invalid STATUS_VAR);
5886 } else if (v > 0xffffffff) {
5887 res = 0xffffffff;
5888 float_raise( float_flag_invalid STATUS_VAR);
5889 } else {
5890 res = v;
5891 }
5892 return res;
5893 }
5894
5895 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5896 {
5897 int64_t v;
5898 unsigned int res;
5899
5900 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5901 if (v < 0) {
5902 res = 0;
5903 float_raise( float_flag_invalid STATUS_VAR);
5904 } else if (v > 0xffffffff) {
5905 res = 0xffffffff;
5906 float_raise( float_flag_invalid STATUS_VAR);
5907 } else {
5908 res = v;
5909 }
5910 return res;
5911 }
5912
5913 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5914 {
5915 int64_t v;
5916 unsigned int res;
5917
5918 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5919 if (v < 0) {
5920 res = 0;
5921 float_raise( float_flag_invalid STATUS_VAR);
5922 } else if (v > 0xffff) {
5923 res = 0xffff;
5924 float_raise( float_flag_invalid STATUS_VAR);
5925 } else {
5926 res = v;
5927 }
5928 return res;
5929 }
5930
5931 /* FIXME: This looks broken. */
5932 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5933 {
5934 int64_t v;
5935
5936 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5937 v += float64_val(a);
5938 v = float64_to_int64(make_float64(v) STATUS_VAR);
5939
5940 return v - INT64_MIN;
5941 }
5942
5943 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5944 {
5945 int64_t v;
5946
5947 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5948 v += float64_val(a);
5949 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5950
5951 return v - INT64_MIN;
5952 }
5953
5954 #define COMPARE(s, nan_exp) \
5955 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5956 int is_quiet STATUS_PARAM ) \
5957 { \
5958 flag aSign, bSign; \
5959 bits ## s av, bv; \
5960 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
5961 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
5962 \
5963 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5964 extractFloat ## s ## Frac( a ) ) || \
5965 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5966 extractFloat ## s ## Frac( b ) )) { \
5967 if (!is_quiet || \
5968 float ## s ## _is_signaling_nan( a ) || \
5969 float ## s ## _is_signaling_nan( b ) ) { \
5970 float_raise( float_flag_invalid STATUS_VAR); \
5971 } \
5972 return float_relation_unordered; \
5973 } \
5974 aSign = extractFloat ## s ## Sign( a ); \
5975 bSign = extractFloat ## s ## Sign( b ); \
5976 av = float ## s ## _val(a); \
5977 bv = float ## s ## _val(b); \
5978 if ( aSign != bSign ) { \
5979 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5980 /* zero case */ \
5981 return float_relation_equal; \
5982 } else { \
5983 return 1 - (2 * aSign); \
5984 } \
5985 } else { \
5986 if (av == bv) { \
5987 return float_relation_equal; \
5988 } else { \
5989 return 1 - 2 * (aSign ^ ( av < bv )); \
5990 } \
5991 } \
5992 } \
5993 \
5994 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5995 { \
5996 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5997 } \
5998 \
5999 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6000 { \
6001 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6002 }
6003
6004 COMPARE(32, 0xff)
6005 COMPARE(64, 0x7ff)
6006
6007 INLINE int float128_compare_internal( float128 a, float128 b,
6008 int is_quiet STATUS_PARAM )
6009 {
6010 flag aSign, bSign;
6011
6012 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6013 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6014 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6015 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6016 if (!is_quiet ||
6017 float128_is_signaling_nan( a ) ||
6018 float128_is_signaling_nan( b ) ) {
6019 float_raise( float_flag_invalid STATUS_VAR);
6020 }
6021 return float_relation_unordered;
6022 }
6023 aSign = extractFloat128Sign( a );
6024 bSign = extractFloat128Sign( b );
6025 if ( aSign != bSign ) {
6026 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6027 /* zero case */
6028 return float_relation_equal;
6029 } else {
6030 return 1 - (2 * aSign);
6031 }
6032 } else {
6033 if (a.low == b.low && a.high == b.high) {
6034 return float_relation_equal;
6035 } else {
6036 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6037 }
6038 }
6039 }
6040
6041 int float128_compare( float128 a, float128 b STATUS_PARAM )
6042 {
6043 return float128_compare_internal(a, b, 0 STATUS_VAR);
6044 }
6045
6046 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6047 {
6048 return float128_compare_internal(a, b, 1 STATUS_VAR);
6049 }
6050
6051 /* Multiply A by 2 raised to the power N. */
6052 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6053 {
6054 flag aSign;
6055 int16 aExp;
6056 bits32 aSig;
6057
6058 a = float32_squash_input_denormal(a STATUS_VAR);
6059 aSig = extractFloat32Frac( a );
6060 aExp = extractFloat32Exp( a );
6061 aSign = extractFloat32Sign( a );
6062
6063 if ( aExp == 0xFF ) {
6064 return a;
6065 }
6066 if ( aExp != 0 )
6067 aSig |= 0x00800000;
6068 else if ( aSig == 0 )
6069 return a;
6070
6071 aExp += n - 1;
6072 aSig <<= 7;
6073 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6074 }
6075
6076 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6077 {
6078 flag aSign;
6079 int16 aExp;
6080 bits64 aSig;
6081
6082 a = float64_squash_input_denormal(a STATUS_VAR);
6083 aSig = extractFloat64Frac( a );
6084 aExp = extractFloat64Exp( a );
6085 aSign = extractFloat64Sign( a );
6086
6087 if ( aExp == 0x7FF ) {
6088 return a;
6089 }
6090 if ( aExp != 0 )
6091 aSig |= LIT64( 0x0010000000000000 );
6092 else if ( aSig == 0 )
6093 return a;
6094
6095 aExp += n - 1;
6096 aSig <<= 10;
6097 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6098 }
6099
6100 #ifdef FLOATX80
6101 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6102 {
6103 flag aSign;
6104 int16 aExp;
6105 bits64 aSig;
6106
6107 aSig = extractFloatx80Frac( a );
6108 aExp = extractFloatx80Exp( a );
6109 aSign = extractFloatx80Sign( a );
6110
6111 if ( aExp == 0x7FF ) {
6112 return a;
6113 }
6114 if (aExp == 0 && aSig == 0)
6115 return a;
6116
6117 aExp += n;
6118 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6119 aSign, aExp, aSig, 0 STATUS_VAR );
6120 }
6121 #endif
6122
6123 #ifdef FLOAT128
6124 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6125 {
6126 flag aSign;
6127 int32 aExp;
6128 bits64 aSig0, aSig1;
6129
6130 aSig1 = extractFloat128Frac1( a );
6131 aSig0 = extractFloat128Frac0( a );
6132 aExp = extractFloat128Exp( a );
6133 aSign = extractFloat128Sign( a );
6134 if ( aExp == 0x7FFF ) {
6135 return a;
6136 }
6137 if ( aExp != 0 )
6138 aSig0 |= LIT64( 0x0001000000000000 );
6139 else if ( aSig0 == 0 && aSig1 == 0 )
6140 return a;
6141
6142 aExp += n - 1;
6143 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6144 STATUS_VAR );
6145
6146 }
6147 #endif