]> git.proxmox.com Git - qemu.git/blob - fpu/softfloat.c
Merge branch 'spice.v23.pull' of git://anongit.freedesktop.org/spice/qemu
[qemu.git] / fpu / softfloat.c
1
2 /*============================================================================
3
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
6
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
16
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
30
31 =============================================================================*/
32
33 /* FIXME: Flush-To-Zero only effects results. Denormal inputs should also
34 be flushed to zero. */
35 #include "softfloat.h"
36
37 /*----------------------------------------------------------------------------
38 | Primitive arithmetic functions, including multi-word arithmetic, and
39 | division and square root approximations. (Can be specialized to target if
40 | desired.)
41 *----------------------------------------------------------------------------*/
42 #include "softfloat-macros.h"
43
44 /*----------------------------------------------------------------------------
45 | Functions and definitions to determine: (1) whether tininess for underflow
46 | is detected before or after rounding by default, (2) what (if anything)
47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
49 | are propagated from function inputs to output. These details are target-
50 | specific.
51 *----------------------------------------------------------------------------*/
52 #include "softfloat-specialize.h"
53
54 void set_float_rounding_mode(int val STATUS_PARAM)
55 {
56 STATUS(float_rounding_mode) = val;
57 }
58
59 void set_float_exception_flags(int val STATUS_PARAM)
60 {
61 STATUS(float_exception_flags) = val;
62 }
63
64 #ifdef FLOATX80
65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
66 {
67 STATUS(floatx80_rounding_precision) = val;
68 }
69 #endif
70
71 /*----------------------------------------------------------------------------
72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
74 | input. If `zSign' is 1, the input is negated before being converted to an
75 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
76 | is simply rounded to an integer, with the inexact exception raised if the
77 | input cannot be represented exactly as an integer. However, if the fixed-
78 | point input is too large, the invalid exception is raised and the largest
79 | positive or negative integer is returned.
80 *----------------------------------------------------------------------------*/
81
82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
83 {
84 int8 roundingMode;
85 flag roundNearestEven;
86 int8 roundIncrement, roundBits;
87 int32 z;
88
89 roundingMode = STATUS(float_rounding_mode);
90 roundNearestEven = ( roundingMode == float_round_nearest_even );
91 roundIncrement = 0x40;
92 if ( ! roundNearestEven ) {
93 if ( roundingMode == float_round_to_zero ) {
94 roundIncrement = 0;
95 }
96 else {
97 roundIncrement = 0x7F;
98 if ( zSign ) {
99 if ( roundingMode == float_round_up ) roundIncrement = 0;
100 }
101 else {
102 if ( roundingMode == float_round_down ) roundIncrement = 0;
103 }
104 }
105 }
106 roundBits = absZ & 0x7F;
107 absZ = ( absZ + roundIncrement )>>7;
108 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
109 z = absZ;
110 if ( zSign ) z = - z;
111 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
112 float_raise( float_flag_invalid STATUS_VAR);
113 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
114 }
115 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
116 return z;
117
118 }
119
120 /*----------------------------------------------------------------------------
121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
123 | and returns the properly rounded 64-bit integer corresponding to the input.
124 | If `zSign' is 1, the input is negated before being converted to an integer.
125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
126 | the inexact exception raised if the input cannot be represented exactly as
127 | an integer. However, if the fixed-point input is too large, the invalid
128 | exception is raised and the largest positive or negative integer is
129 | returned.
130 *----------------------------------------------------------------------------*/
131
132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
133 {
134 int8 roundingMode;
135 flag roundNearestEven, increment;
136 int64 z;
137
138 roundingMode = STATUS(float_rounding_mode);
139 roundNearestEven = ( roundingMode == float_round_nearest_even );
140 increment = ( (sbits64) absZ1 < 0 );
141 if ( ! roundNearestEven ) {
142 if ( roundingMode == float_round_to_zero ) {
143 increment = 0;
144 }
145 else {
146 if ( zSign ) {
147 increment = ( roundingMode == float_round_down ) && absZ1;
148 }
149 else {
150 increment = ( roundingMode == float_round_up ) && absZ1;
151 }
152 }
153 }
154 if ( increment ) {
155 ++absZ0;
156 if ( absZ0 == 0 ) goto overflow;
157 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158 }
159 z = absZ0;
160 if ( zSign ) z = - z;
161 if ( z && ( ( z < 0 ) ^ zSign ) ) {
162 overflow:
163 float_raise( float_flag_invalid STATUS_VAR);
164 return
165 zSign ? (sbits64) LIT64( 0x8000000000000000 )
166 : LIT64( 0x7FFFFFFFFFFFFFFF );
167 }
168 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
169 return z;
170
171 }
172
173 /*----------------------------------------------------------------------------
174 | Returns the fraction bits of the single-precision floating-point value `a'.
175 *----------------------------------------------------------------------------*/
176
177 INLINE bits32 extractFloat32Frac( float32 a )
178 {
179
180 return float32_val(a) & 0x007FFFFF;
181
182 }
183
184 /*----------------------------------------------------------------------------
185 | Returns the exponent bits of the single-precision floating-point value `a'.
186 *----------------------------------------------------------------------------*/
187
188 INLINE int16 extractFloat32Exp( float32 a )
189 {
190
191 return ( float32_val(a)>>23 ) & 0xFF;
192
193 }
194
195 /*----------------------------------------------------------------------------
196 | Returns the sign bit of the single-precision floating-point value `a'.
197 *----------------------------------------------------------------------------*/
198
199 INLINE flag extractFloat32Sign( float32 a )
200 {
201
202 return float32_val(a)>>31;
203
204 }
205
206 /*----------------------------------------------------------------------------
207 | Normalizes the subnormal single-precision floating-point value represented
208 | by the denormalized significand `aSig'. The normalized exponent and
209 | significand are stored at the locations pointed to by `zExpPtr' and
210 | `zSigPtr', respectively.
211 *----------------------------------------------------------------------------*/
212
213 static void
214 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
215 {
216 int8 shiftCount;
217
218 shiftCount = countLeadingZeros32( aSig ) - 8;
219 *zSigPtr = aSig<<shiftCount;
220 *zExpPtr = 1 - shiftCount;
221
222 }
223
224 /*----------------------------------------------------------------------------
225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226 | single-precision floating-point value, returning the result. After being
227 | shifted into the proper positions, the three fields are simply added
228 | together to form the result. This means that any integer portion of `zSig'
229 | will be added into the exponent. Since a properly normalized significand
230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
231 | than the desired result exponent whenever `zSig' is a complete, normalized
232 | significand.
233 *----------------------------------------------------------------------------*/
234
235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
236 {
237
238 return make_float32(
239 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
240
241 }
242
243 /*----------------------------------------------------------------------------
244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245 | and significand `zSig', and returns the proper single-precision floating-
246 | point value corresponding to the abstract input. Ordinarily, the abstract
247 | value is simply rounded and packed into the single-precision format, with
248 | the inexact exception raised if the abstract input cannot be represented
249 | exactly. However, if the abstract value is too large, the overflow and
250 | inexact exceptions are raised and an infinity or maximal finite value is
251 | returned. If the abstract value is too small, the input value is rounded to
252 | a subnormal number, and the underflow and inexact exceptions are raised if
253 | the abstract input cannot be represented exactly as a subnormal single-
254 | precision floating-point number.
255 | The input significand `zSig' has its binary point between bits 30
256 | and 29, which is 7 bits to the left of the usual location. This shifted
257 | significand must be normalized or smaller. If `zSig' is not normalized,
258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
259 | and it must not require rounding. In the usual case that `zSig' is
260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
262 | Binary Floating-Point Arithmetic.
263 *----------------------------------------------------------------------------*/
264
265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
266 {
267 int8 roundingMode;
268 flag roundNearestEven;
269 int8 roundIncrement, roundBits;
270 flag isTiny;
271
272 roundingMode = STATUS(float_rounding_mode);
273 roundNearestEven = ( roundingMode == float_round_nearest_even );
274 roundIncrement = 0x40;
275 if ( ! roundNearestEven ) {
276 if ( roundingMode == float_round_to_zero ) {
277 roundIncrement = 0;
278 }
279 else {
280 roundIncrement = 0x7F;
281 if ( zSign ) {
282 if ( roundingMode == float_round_up ) roundIncrement = 0;
283 }
284 else {
285 if ( roundingMode == float_round_down ) roundIncrement = 0;
286 }
287 }
288 }
289 roundBits = zSig & 0x7F;
290 if ( 0xFD <= (bits16) zExp ) {
291 if ( ( 0xFD < zExp )
292 || ( ( zExp == 0xFD )
293 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
294 ) {
295 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
296 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
297 }
298 if ( zExp < 0 ) {
299 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
300 isTiny =
301 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
302 || ( zExp < -1 )
303 || ( zSig + roundIncrement < 0x80000000 );
304 shift32RightJamming( zSig, - zExp, &zSig );
305 zExp = 0;
306 roundBits = zSig & 0x7F;
307 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
308 }
309 }
310 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
311 zSig = ( zSig + roundIncrement )>>7;
312 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
313 if ( zSig == 0 ) zExp = 0;
314 return packFloat32( zSign, zExp, zSig );
315
316 }
317
318 /*----------------------------------------------------------------------------
319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320 | and significand `zSig', and returns the proper single-precision floating-
321 | point value corresponding to the abstract input. This routine is just like
322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324 | floating-point exponent.
325 *----------------------------------------------------------------------------*/
326
327 static float32
328 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
329 {
330 int8 shiftCount;
331
332 shiftCount = countLeadingZeros32( zSig ) - 1;
333 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
334
335 }
336
337 /*----------------------------------------------------------------------------
338 | Returns the fraction bits of the double-precision floating-point value `a'.
339 *----------------------------------------------------------------------------*/
340
341 INLINE bits64 extractFloat64Frac( float64 a )
342 {
343
344 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
345
346 }
347
348 /*----------------------------------------------------------------------------
349 | Returns the exponent bits of the double-precision floating-point value `a'.
350 *----------------------------------------------------------------------------*/
351
352 INLINE int16 extractFloat64Exp( float64 a )
353 {
354
355 return ( float64_val(a)>>52 ) & 0x7FF;
356
357 }
358
359 /*----------------------------------------------------------------------------
360 | Returns the sign bit of the double-precision floating-point value `a'.
361 *----------------------------------------------------------------------------*/
362
363 INLINE flag extractFloat64Sign( float64 a )
364 {
365
366 return float64_val(a)>>63;
367
368 }
369
370 /*----------------------------------------------------------------------------
371 | Normalizes the subnormal double-precision floating-point value represented
372 | by the denormalized significand `aSig'. The normalized exponent and
373 | significand are stored at the locations pointed to by `zExpPtr' and
374 | `zSigPtr', respectively.
375 *----------------------------------------------------------------------------*/
376
377 static void
378 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
379 {
380 int8 shiftCount;
381
382 shiftCount = countLeadingZeros64( aSig ) - 11;
383 *zSigPtr = aSig<<shiftCount;
384 *zExpPtr = 1 - shiftCount;
385
386 }
387
388 /*----------------------------------------------------------------------------
389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390 | double-precision floating-point value, returning the result. After being
391 | shifted into the proper positions, the three fields are simply added
392 | together to form the result. This means that any integer portion of `zSig'
393 | will be added into the exponent. Since a properly normalized significand
394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
395 | than the desired result exponent whenever `zSig' is a complete, normalized
396 | significand.
397 *----------------------------------------------------------------------------*/
398
399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
400 {
401
402 return make_float64(
403 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
404
405 }
406
407 /*----------------------------------------------------------------------------
408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409 | and significand `zSig', and returns the proper double-precision floating-
410 | point value corresponding to the abstract input. Ordinarily, the abstract
411 | value is simply rounded and packed into the double-precision format, with
412 | the inexact exception raised if the abstract input cannot be represented
413 | exactly. However, if the abstract value is too large, the overflow and
414 | inexact exceptions are raised and an infinity or maximal finite value is
415 | returned. If the abstract value is too small, the input value is rounded
416 | to a subnormal number, and the underflow and inexact exceptions are raised
417 | if the abstract input cannot be represented exactly as a subnormal double-
418 | precision floating-point number.
419 | The input significand `zSig' has its binary point between bits 62
420 | and 61, which is 10 bits to the left of the usual location. This shifted
421 | significand must be normalized or smaller. If `zSig' is not normalized,
422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
423 | and it must not require rounding. In the usual case that `zSig' is
424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
426 | Binary Floating-Point Arithmetic.
427 *----------------------------------------------------------------------------*/
428
429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
430 {
431 int8 roundingMode;
432 flag roundNearestEven;
433 int16 roundIncrement, roundBits;
434 flag isTiny;
435
436 roundingMode = STATUS(float_rounding_mode);
437 roundNearestEven = ( roundingMode == float_round_nearest_even );
438 roundIncrement = 0x200;
439 if ( ! roundNearestEven ) {
440 if ( roundingMode == float_round_to_zero ) {
441 roundIncrement = 0;
442 }
443 else {
444 roundIncrement = 0x3FF;
445 if ( zSign ) {
446 if ( roundingMode == float_round_up ) roundIncrement = 0;
447 }
448 else {
449 if ( roundingMode == float_round_down ) roundIncrement = 0;
450 }
451 }
452 }
453 roundBits = zSig & 0x3FF;
454 if ( 0x7FD <= (bits16) zExp ) {
455 if ( ( 0x7FD < zExp )
456 || ( ( zExp == 0x7FD )
457 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
458 ) {
459 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
460 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
461 }
462 if ( zExp < 0 ) {
463 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
464 isTiny =
465 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
466 || ( zExp < -1 )
467 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
468 shift64RightJamming( zSig, - zExp, &zSig );
469 zExp = 0;
470 roundBits = zSig & 0x3FF;
471 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
472 }
473 }
474 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
475 zSig = ( zSig + roundIncrement )>>10;
476 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
477 if ( zSig == 0 ) zExp = 0;
478 return packFloat64( zSign, zExp, zSig );
479
480 }
481
482 /*----------------------------------------------------------------------------
483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484 | and significand `zSig', and returns the proper double-precision floating-
485 | point value corresponding to the abstract input. This routine is just like
486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488 | floating-point exponent.
489 *----------------------------------------------------------------------------*/
490
491 static float64
492 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
493 {
494 int8 shiftCount;
495
496 shiftCount = countLeadingZeros64( zSig ) - 1;
497 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
498
499 }
500
501 #ifdef FLOATX80
502
503 /*----------------------------------------------------------------------------
504 | Returns the fraction bits of the extended double-precision floating-point
505 | value `a'.
506 *----------------------------------------------------------------------------*/
507
508 INLINE bits64 extractFloatx80Frac( floatx80 a )
509 {
510
511 return a.low;
512
513 }
514
515 /*----------------------------------------------------------------------------
516 | Returns the exponent bits of the extended double-precision floating-point
517 | value `a'.
518 *----------------------------------------------------------------------------*/
519
520 INLINE int32 extractFloatx80Exp( floatx80 a )
521 {
522
523 return a.high & 0x7FFF;
524
525 }
526
527 /*----------------------------------------------------------------------------
528 | Returns the sign bit of the extended double-precision floating-point value
529 | `a'.
530 *----------------------------------------------------------------------------*/
531
532 INLINE flag extractFloatx80Sign( floatx80 a )
533 {
534
535 return a.high>>15;
536
537 }
538
539 /*----------------------------------------------------------------------------
540 | Normalizes the subnormal extended double-precision floating-point value
541 | represented by the denormalized significand `aSig'. The normalized exponent
542 | and significand are stored at the locations pointed to by `zExpPtr' and
543 | `zSigPtr', respectively.
544 *----------------------------------------------------------------------------*/
545
546 static void
547 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
548 {
549 int8 shiftCount;
550
551 shiftCount = countLeadingZeros64( aSig );
552 *zSigPtr = aSig<<shiftCount;
553 *zExpPtr = 1 - shiftCount;
554
555 }
556
557 /*----------------------------------------------------------------------------
558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559 | extended double-precision floating-point value, returning the result.
560 *----------------------------------------------------------------------------*/
561
562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
563 {
564 floatx80 z;
565
566 z.low = zSig;
567 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
568 return z;
569
570 }
571
572 /*----------------------------------------------------------------------------
573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
575 | and returns the proper extended double-precision floating-point value
576 | corresponding to the abstract input. Ordinarily, the abstract value is
577 | rounded and packed into the extended double-precision format, with the
578 | inexact exception raised if the abstract input cannot be represented
579 | exactly. However, if the abstract value is too large, the overflow and
580 | inexact exceptions are raised and an infinity or maximal finite value is
581 | returned. If the abstract value is too small, the input value is rounded to
582 | a subnormal number, and the underflow and inexact exceptions are raised if
583 | the abstract input cannot be represented exactly as a subnormal extended
584 | double-precision floating-point number.
585 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
586 | number of bits as single or double precision, respectively. Otherwise, the
587 | result is rounded to the full precision of the extended double-precision
588 | format.
589 | The input significand must be normalized or smaller. If the input
590 | significand is not normalized, `zExp' must be 0; in that case, the result
591 | returned is a subnormal number, and it must not require rounding. The
592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593 | Floating-Point Arithmetic.
594 *----------------------------------------------------------------------------*/
595
596 static floatx80
597 roundAndPackFloatx80(
598 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
599 STATUS_PARAM)
600 {
601 int8 roundingMode;
602 flag roundNearestEven, increment, isTiny;
603 int64 roundIncrement, roundMask, roundBits;
604
605 roundingMode = STATUS(float_rounding_mode);
606 roundNearestEven = ( roundingMode == float_round_nearest_even );
607 if ( roundingPrecision == 80 ) goto precision80;
608 if ( roundingPrecision == 64 ) {
609 roundIncrement = LIT64( 0x0000000000000400 );
610 roundMask = LIT64( 0x00000000000007FF );
611 }
612 else if ( roundingPrecision == 32 ) {
613 roundIncrement = LIT64( 0x0000008000000000 );
614 roundMask = LIT64( 0x000000FFFFFFFFFF );
615 }
616 else {
617 goto precision80;
618 }
619 zSig0 |= ( zSig1 != 0 );
620 if ( ! roundNearestEven ) {
621 if ( roundingMode == float_round_to_zero ) {
622 roundIncrement = 0;
623 }
624 else {
625 roundIncrement = roundMask;
626 if ( zSign ) {
627 if ( roundingMode == float_round_up ) roundIncrement = 0;
628 }
629 else {
630 if ( roundingMode == float_round_down ) roundIncrement = 0;
631 }
632 }
633 }
634 roundBits = zSig0 & roundMask;
635 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
636 if ( ( 0x7FFE < zExp )
637 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
638 ) {
639 goto overflow;
640 }
641 if ( zExp <= 0 ) {
642 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
643 isTiny =
644 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
645 || ( zExp < 0 )
646 || ( zSig0 <= zSig0 + roundIncrement );
647 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
648 zExp = 0;
649 roundBits = zSig0 & roundMask;
650 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
651 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
652 zSig0 += roundIncrement;
653 if ( (sbits64) zSig0 < 0 ) zExp = 1;
654 roundIncrement = roundMask + 1;
655 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
656 roundMask |= roundIncrement;
657 }
658 zSig0 &= ~ roundMask;
659 return packFloatx80( zSign, zExp, zSig0 );
660 }
661 }
662 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
663 zSig0 += roundIncrement;
664 if ( zSig0 < roundIncrement ) {
665 ++zExp;
666 zSig0 = LIT64( 0x8000000000000000 );
667 }
668 roundIncrement = roundMask + 1;
669 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
670 roundMask |= roundIncrement;
671 }
672 zSig0 &= ~ roundMask;
673 if ( zSig0 == 0 ) zExp = 0;
674 return packFloatx80( zSign, zExp, zSig0 );
675 precision80:
676 increment = ( (sbits64) zSig1 < 0 );
677 if ( ! roundNearestEven ) {
678 if ( roundingMode == float_round_to_zero ) {
679 increment = 0;
680 }
681 else {
682 if ( zSign ) {
683 increment = ( roundingMode == float_round_down ) && zSig1;
684 }
685 else {
686 increment = ( roundingMode == float_round_up ) && zSig1;
687 }
688 }
689 }
690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691 if ( ( 0x7FFE < zExp )
692 || ( ( zExp == 0x7FFE )
693 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
694 && increment
695 )
696 ) {
697 roundMask = 0;
698 overflow:
699 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
700 if ( ( roundingMode == float_round_to_zero )
701 || ( zSign && ( roundingMode == float_round_up ) )
702 || ( ! zSign && ( roundingMode == float_round_down ) )
703 ) {
704 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
705 }
706 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
707 }
708 if ( zExp <= 0 ) {
709 isTiny =
710 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
711 || ( zExp < 0 )
712 || ! increment
713 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
714 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
715 zExp = 0;
716 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
717 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
718 if ( roundNearestEven ) {
719 increment = ( (sbits64) zSig1 < 0 );
720 }
721 else {
722 if ( zSign ) {
723 increment = ( roundingMode == float_round_down ) && zSig1;
724 }
725 else {
726 increment = ( roundingMode == float_round_up ) && zSig1;
727 }
728 }
729 if ( increment ) {
730 ++zSig0;
731 zSig0 &=
732 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
733 if ( (sbits64) zSig0 < 0 ) zExp = 1;
734 }
735 return packFloatx80( zSign, zExp, zSig0 );
736 }
737 }
738 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
739 if ( increment ) {
740 ++zSig0;
741 if ( zSig0 == 0 ) {
742 ++zExp;
743 zSig0 = LIT64( 0x8000000000000000 );
744 }
745 else {
746 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
747 }
748 }
749 else {
750 if ( zSig0 == 0 ) zExp = 0;
751 }
752 return packFloatx80( zSign, zExp, zSig0 );
753
754 }
755
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent
758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input. This routine is just like
761 | `roundAndPackFloatx80' except that the input significand does not have to be
762 | normalized.
763 *----------------------------------------------------------------------------*/
764
765 static floatx80
766 normalizeRoundAndPackFloatx80(
767 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
768 STATUS_PARAM)
769 {
770 int8 shiftCount;
771
772 if ( zSig0 == 0 ) {
773 zSig0 = zSig1;
774 zSig1 = 0;
775 zExp -= 64;
776 }
777 shiftCount = countLeadingZeros64( zSig0 );
778 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
779 zExp -= shiftCount;
780 return
781 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
782
783 }
784
785 #endif
786
787 #ifdef FLOAT128
788
789 /*----------------------------------------------------------------------------
790 | Returns the least-significant 64 fraction bits of the quadruple-precision
791 | floating-point value `a'.
792 *----------------------------------------------------------------------------*/
793
794 INLINE bits64 extractFloat128Frac1( float128 a )
795 {
796
797 return a.low;
798
799 }
800
801 /*----------------------------------------------------------------------------
802 | Returns the most-significant 48 fraction bits of the quadruple-precision
803 | floating-point value `a'.
804 *----------------------------------------------------------------------------*/
805
806 INLINE bits64 extractFloat128Frac0( float128 a )
807 {
808
809 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
810
811 }
812
813 /*----------------------------------------------------------------------------
814 | Returns the exponent bits of the quadruple-precision floating-point value
815 | `a'.
816 *----------------------------------------------------------------------------*/
817
818 INLINE int32 extractFloat128Exp( float128 a )
819 {
820
821 return ( a.high>>48 ) & 0x7FFF;
822
823 }
824
825 /*----------------------------------------------------------------------------
826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
827 *----------------------------------------------------------------------------*/
828
829 INLINE flag extractFloat128Sign( float128 a )
830 {
831
832 return a.high>>63;
833
834 }
835
836 /*----------------------------------------------------------------------------
837 | Normalizes the subnormal quadruple-precision floating-point value
838 | represented by the denormalized significand formed by the concatenation of
839 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
840 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
841 | significand are stored at the location pointed to by `zSig0Ptr', and the
842 | least significant 64 bits of the normalized significand are stored at the
843 | location pointed to by `zSig1Ptr'.
844 *----------------------------------------------------------------------------*/
845
846 static void
847 normalizeFloat128Subnormal(
848 bits64 aSig0,
849 bits64 aSig1,
850 int32 *zExpPtr,
851 bits64 *zSig0Ptr,
852 bits64 *zSig1Ptr
853 )
854 {
855 int8 shiftCount;
856
857 if ( aSig0 == 0 ) {
858 shiftCount = countLeadingZeros64( aSig1 ) - 15;
859 if ( shiftCount < 0 ) {
860 *zSig0Ptr = aSig1>>( - shiftCount );
861 *zSig1Ptr = aSig1<<( shiftCount & 63 );
862 }
863 else {
864 *zSig0Ptr = aSig1<<shiftCount;
865 *zSig1Ptr = 0;
866 }
867 *zExpPtr = - shiftCount - 63;
868 }
869 else {
870 shiftCount = countLeadingZeros64( aSig0 ) - 15;
871 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
872 *zExpPtr = 1 - shiftCount;
873 }
874
875 }
876
877 /*----------------------------------------------------------------------------
878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880 | floating-point value, returning the result. After being shifted into the
881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882 | added together to form the most significant 32 bits of the result. This
883 | means that any integer portion of `zSig0' will be added into the exponent.
884 | Since a properly normalized significand will have an integer portion equal
885 | to 1, the `zExp' input should be 1 less than the desired result exponent
886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
887 | significand.
888 *----------------------------------------------------------------------------*/
889
890 INLINE float128
891 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
892 {
893 float128 z;
894
895 z.low = zSig1;
896 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
897 return z;
898
899 }
900
901 /*----------------------------------------------------------------------------
902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
904 | and `zSig2', and returns the proper quadruple-precision floating-point value
905 | corresponding to the abstract input. Ordinarily, the abstract value is
906 | simply rounded and packed into the quadruple-precision format, with the
907 | inexact exception raised if the abstract input cannot be represented
908 | exactly. However, if the abstract value is too large, the overflow and
909 | inexact exceptions are raised and an infinity or maximal finite value is
910 | returned. If the abstract value is too small, the input value is rounded to
911 | a subnormal number, and the underflow and inexact exceptions are raised if
912 | the abstract input cannot be represented exactly as a subnormal quadruple-
913 | precision floating-point number.
914 | The input significand must be normalized or smaller. If the input
915 | significand is not normalized, `zExp' must be 0; in that case, the result
916 | returned is a subnormal number, and it must not require rounding. In the
917 | usual case that the input significand is normalized, `zExp' must be 1 less
918 | than the ``true'' floating-point exponent. The handling of underflow and
919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920 *----------------------------------------------------------------------------*/
921
922 static float128
923 roundAndPackFloat128(
924 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
925 {
926 int8 roundingMode;
927 flag roundNearestEven, increment, isTiny;
928
929 roundingMode = STATUS(float_rounding_mode);
930 roundNearestEven = ( roundingMode == float_round_nearest_even );
931 increment = ( (sbits64) zSig2 < 0 );
932 if ( ! roundNearestEven ) {
933 if ( roundingMode == float_round_to_zero ) {
934 increment = 0;
935 }
936 else {
937 if ( zSign ) {
938 increment = ( roundingMode == float_round_down ) && zSig2;
939 }
940 else {
941 increment = ( roundingMode == float_round_up ) && zSig2;
942 }
943 }
944 }
945 if ( 0x7FFD <= (bits32) zExp ) {
946 if ( ( 0x7FFD < zExp )
947 || ( ( zExp == 0x7FFD )
948 && eq128(
949 LIT64( 0x0001FFFFFFFFFFFF ),
950 LIT64( 0xFFFFFFFFFFFFFFFF ),
951 zSig0,
952 zSig1
953 )
954 && increment
955 )
956 ) {
957 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
958 if ( ( roundingMode == float_round_to_zero )
959 || ( zSign && ( roundingMode == float_round_up ) )
960 || ( ! zSign && ( roundingMode == float_round_down ) )
961 ) {
962 return
963 packFloat128(
964 zSign,
965 0x7FFE,
966 LIT64( 0x0000FFFFFFFFFFFF ),
967 LIT64( 0xFFFFFFFFFFFFFFFF )
968 );
969 }
970 return packFloat128( zSign, 0x7FFF, 0, 0 );
971 }
972 if ( zExp < 0 ) {
973 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
974 isTiny =
975 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
976 || ( zExp < -1 )
977 || ! increment
978 || lt128(
979 zSig0,
980 zSig1,
981 LIT64( 0x0001FFFFFFFFFFFF ),
982 LIT64( 0xFFFFFFFFFFFFFFFF )
983 );
984 shift128ExtraRightJamming(
985 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
986 zExp = 0;
987 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
988 if ( roundNearestEven ) {
989 increment = ( (sbits64) zSig2 < 0 );
990 }
991 else {
992 if ( zSign ) {
993 increment = ( roundingMode == float_round_down ) && zSig2;
994 }
995 else {
996 increment = ( roundingMode == float_round_up ) && zSig2;
997 }
998 }
999 }
1000 }
1001 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002 if ( increment ) {
1003 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005 }
1006 else {
1007 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1008 }
1009 return packFloat128( zSign, zExp, zSig0, zSig1 );
1010
1011 }
1012
1013 /*----------------------------------------------------------------------------
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1016 | returns the proper quadruple-precision floating-point value corresponding
1017 | to the abstract input. This routine is just like `roundAndPackFloat128'
1018 | except that the input significand has fewer bits and does not have to be
1019 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1020 | point exponent.
1021 *----------------------------------------------------------------------------*/
1022
1023 static float128
1024 normalizeRoundAndPackFloat128(
1025 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026 {
1027 int8 shiftCount;
1028 bits64 zSig2;
1029
1030 if ( zSig0 == 0 ) {
1031 zSig0 = zSig1;
1032 zSig1 = 0;
1033 zExp -= 64;
1034 }
1035 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036 if ( 0 <= shiftCount ) {
1037 zSig2 = 0;
1038 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039 }
1040 else {
1041 shift128ExtraRightJamming(
1042 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043 }
1044 zExp -= shiftCount;
1045 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046
1047 }
1048
1049 #endif
1050
1051 /*----------------------------------------------------------------------------
1052 | Returns the result of converting the 32-bit two's complement integer `a'
1053 | to the single-precision floating-point format. The conversion is performed
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055 *----------------------------------------------------------------------------*/
1056
1057 float32 int32_to_float32( int32 a STATUS_PARAM )
1058 {
1059 flag zSign;
1060
1061 if ( a == 0 ) return float32_zero;
1062 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063 zSign = ( a < 0 );
1064 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065
1066 }
1067
1068 /*----------------------------------------------------------------------------
1069 | Returns the result of converting the 32-bit two's complement integer `a'
1070 | to the double-precision floating-point format. The conversion is performed
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072 *----------------------------------------------------------------------------*/
1073
1074 float64 int32_to_float64( int32 a STATUS_PARAM )
1075 {
1076 flag zSign;
1077 uint32 absA;
1078 int8 shiftCount;
1079 bits64 zSig;
1080
1081 if ( a == 0 ) return float64_zero;
1082 zSign = ( a < 0 );
1083 absA = zSign ? - a : a;
1084 shiftCount = countLeadingZeros32( absA ) + 21;
1085 zSig = absA;
1086 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087
1088 }
1089
1090 #ifdef FLOATX80
1091
1092 /*----------------------------------------------------------------------------
1093 | Returns the result of converting the 32-bit two's complement integer `a'
1094 | to the extended double-precision floating-point format. The conversion
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096 | Arithmetic.
1097 *----------------------------------------------------------------------------*/
1098
1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100 {
1101 flag zSign;
1102 uint32 absA;
1103 int8 shiftCount;
1104 bits64 zSig;
1105
1106 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107 zSign = ( a < 0 );
1108 absA = zSign ? - a : a;
1109 shiftCount = countLeadingZeros32( absA ) + 32;
1110 zSig = absA;
1111 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112
1113 }
1114
1115 #endif
1116
1117 #ifdef FLOAT128
1118
1119 /*----------------------------------------------------------------------------
1120 | Returns the result of converting the 32-bit two's complement integer `a' to
1121 | the quadruple-precision floating-point format. The conversion is performed
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 *----------------------------------------------------------------------------*/
1124
1125 float128 int32_to_float128( int32 a STATUS_PARAM )
1126 {
1127 flag zSign;
1128 uint32 absA;
1129 int8 shiftCount;
1130 bits64 zSig0;
1131
1132 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133 zSign = ( a < 0 );
1134 absA = zSign ? - a : a;
1135 shiftCount = countLeadingZeros32( absA ) + 17;
1136 zSig0 = absA;
1137 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138
1139 }
1140
1141 #endif
1142
1143 /*----------------------------------------------------------------------------
1144 | Returns the result of converting the 64-bit two's complement integer `a'
1145 | to the single-precision floating-point format. The conversion is performed
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147 *----------------------------------------------------------------------------*/
1148
1149 float32 int64_to_float32( int64 a STATUS_PARAM )
1150 {
1151 flag zSign;
1152 uint64 absA;
1153 int8 shiftCount;
1154
1155 if ( a == 0 ) return float32_zero;
1156 zSign = ( a < 0 );
1157 absA = zSign ? - a : a;
1158 shiftCount = countLeadingZeros64( absA ) - 40;
1159 if ( 0 <= shiftCount ) {
1160 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161 }
1162 else {
1163 shiftCount += 7;
1164 if ( shiftCount < 0 ) {
1165 shift64RightJamming( absA, - shiftCount, &absA );
1166 }
1167 else {
1168 absA <<= shiftCount;
1169 }
1170 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171 }
1172
1173 }
1174
1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176 {
1177 int8 shiftCount;
1178
1179 if ( a == 0 ) return float32_zero;
1180 shiftCount = countLeadingZeros64( a ) - 40;
1181 if ( 0 <= shiftCount ) {
1182 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183 }
1184 else {
1185 shiftCount += 7;
1186 if ( shiftCount < 0 ) {
1187 shift64RightJamming( a, - shiftCount, &a );
1188 }
1189 else {
1190 a <<= shiftCount;
1191 }
1192 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193 }
1194 }
1195
1196 /*----------------------------------------------------------------------------
1197 | Returns the result of converting the 64-bit two's complement integer `a'
1198 | to the double-precision floating-point format. The conversion is performed
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200 *----------------------------------------------------------------------------*/
1201
1202 float64 int64_to_float64( int64 a STATUS_PARAM )
1203 {
1204 flag zSign;
1205
1206 if ( a == 0 ) return float64_zero;
1207 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208 return packFloat64( 1, 0x43E, 0 );
1209 }
1210 zSign = ( a < 0 );
1211 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212
1213 }
1214
1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216 {
1217 if ( a == 0 ) return float64_zero;
1218 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219
1220 }
1221
1222 #ifdef FLOATX80
1223
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the extended double-precision floating-point format. The conversion
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228 | Arithmetic.
1229 *----------------------------------------------------------------------------*/
1230
1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232 {
1233 flag zSign;
1234 uint64 absA;
1235 int8 shiftCount;
1236
1237 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238 zSign = ( a < 0 );
1239 absA = zSign ? - a : a;
1240 shiftCount = countLeadingZeros64( absA );
1241 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242
1243 }
1244
1245 #endif
1246
1247 #ifdef FLOAT128
1248
1249 /*----------------------------------------------------------------------------
1250 | Returns the result of converting the 64-bit two's complement integer `a' to
1251 | the quadruple-precision floating-point format. The conversion is performed
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 *----------------------------------------------------------------------------*/
1254
1255 float128 int64_to_float128( int64 a STATUS_PARAM )
1256 {
1257 flag zSign;
1258 uint64 absA;
1259 int8 shiftCount;
1260 int32 zExp;
1261 bits64 zSig0, zSig1;
1262
1263 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264 zSign = ( a < 0 );
1265 absA = zSign ? - a : a;
1266 shiftCount = countLeadingZeros64( absA ) + 49;
1267 zExp = 0x406E - shiftCount;
1268 if ( 64 <= shiftCount ) {
1269 zSig1 = 0;
1270 zSig0 = absA;
1271 shiftCount -= 64;
1272 }
1273 else {
1274 zSig1 = absA;
1275 zSig0 = 0;
1276 }
1277 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278 return packFloat128( zSign, zExp, zSig0, zSig1 );
1279
1280 }
1281
1282 #endif
1283
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the single-precision floating-point value
1286 | `a' to the 32-bit two's complement integer format. The conversion is
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic---which means in particular that the conversion is rounded
1289 | according to the current rounding mode. If `a' is a NaN, the largest
1290 | positive integer is returned. Otherwise, if the conversion overflows, the
1291 | largest integer with the same sign as `a' is returned.
1292 *----------------------------------------------------------------------------*/
1293
1294 int32 float32_to_int32( float32 a STATUS_PARAM )
1295 {
1296 flag aSign;
1297 int16 aExp, shiftCount;
1298 bits32 aSig;
1299 bits64 aSig64;
1300
1301 aSig = extractFloat32Frac( a );
1302 aExp = extractFloat32Exp( a );
1303 aSign = extractFloat32Sign( a );
1304 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305 if ( aExp ) aSig |= 0x00800000;
1306 shiftCount = 0xAF - aExp;
1307 aSig64 = aSig;
1308 aSig64 <<= 32;
1309 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311
1312 }
1313
1314 /*----------------------------------------------------------------------------
1315 | Returns the result of converting the single-precision floating-point value
1316 | `a' to the 32-bit two's complement integer format. The conversion is
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 | Arithmetic, except that the conversion is always rounded toward zero.
1319 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1320 | the conversion overflows, the largest integer with the same sign as `a' is
1321 | returned.
1322 *----------------------------------------------------------------------------*/
1323
1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325 {
1326 flag aSign;
1327 int16 aExp, shiftCount;
1328 bits32 aSig;
1329 int32 z;
1330
1331 aSig = extractFloat32Frac( a );
1332 aExp = extractFloat32Exp( a );
1333 aSign = extractFloat32Sign( a );
1334 shiftCount = aExp - 0x9E;
1335 if ( 0 <= shiftCount ) {
1336 if ( float32_val(a) != 0xCF000000 ) {
1337 float_raise( float_flag_invalid STATUS_VAR);
1338 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339 }
1340 return (sbits32) 0x80000000;
1341 }
1342 else if ( aExp <= 0x7E ) {
1343 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344 return 0;
1345 }
1346 aSig = ( aSig | 0x00800000 )<<8;
1347 z = aSig>>( - shiftCount );
1348 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349 STATUS(float_exception_flags) |= float_flag_inexact;
1350 }
1351 if ( aSign ) z = - z;
1352 return z;
1353
1354 }
1355
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 16-bit two's complement integer format. The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic, except that the conversion is always rounded toward zero.
1361 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1362 | the conversion overflows, the largest integer with the same sign as `a' is
1363 | returned.
1364 *----------------------------------------------------------------------------*/
1365
1366 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1367 {
1368 flag aSign;
1369 int16 aExp, shiftCount;
1370 bits32 aSig;
1371 int32 z;
1372
1373 aSig = extractFloat32Frac( a );
1374 aExp = extractFloat32Exp( a );
1375 aSign = extractFloat32Sign( a );
1376 shiftCount = aExp - 0x8E;
1377 if ( 0 <= shiftCount ) {
1378 if ( float32_val(a) != 0xC7000000 ) {
1379 float_raise( float_flag_invalid STATUS_VAR);
1380 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1381 return 0x7FFF;
1382 }
1383 }
1384 return (sbits32) 0xffff8000;
1385 }
1386 else if ( aExp <= 0x7E ) {
1387 if ( aExp | aSig ) {
1388 STATUS(float_exception_flags) |= float_flag_inexact;
1389 }
1390 return 0;
1391 }
1392 shiftCount -= 0x10;
1393 aSig = ( aSig | 0x00800000 )<<8;
1394 z = aSig>>( - shiftCount );
1395 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1396 STATUS(float_exception_flags) |= float_flag_inexact;
1397 }
1398 if ( aSign ) {
1399 z = - z;
1400 }
1401 return z;
1402
1403 }
1404
1405 /*----------------------------------------------------------------------------
1406 | Returns the result of converting the single-precision floating-point value
1407 | `a' to the 64-bit two's complement integer format. The conversion is
1408 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1409 | Arithmetic---which means in particular that the conversion is rounded
1410 | according to the current rounding mode. If `a' is a NaN, the largest
1411 | positive integer is returned. Otherwise, if the conversion overflows, the
1412 | largest integer with the same sign as `a' is returned.
1413 *----------------------------------------------------------------------------*/
1414
1415 int64 float32_to_int64( float32 a STATUS_PARAM )
1416 {
1417 flag aSign;
1418 int16 aExp, shiftCount;
1419 bits32 aSig;
1420 bits64 aSig64, aSigExtra;
1421
1422 aSig = extractFloat32Frac( a );
1423 aExp = extractFloat32Exp( a );
1424 aSign = extractFloat32Sign( a );
1425 shiftCount = 0xBE - aExp;
1426 if ( shiftCount < 0 ) {
1427 float_raise( float_flag_invalid STATUS_VAR);
1428 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1429 return LIT64( 0x7FFFFFFFFFFFFFFF );
1430 }
1431 return (sbits64) LIT64( 0x8000000000000000 );
1432 }
1433 if ( aExp ) aSig |= 0x00800000;
1434 aSig64 = aSig;
1435 aSig64 <<= 40;
1436 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1437 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1438
1439 }
1440
1441 /*----------------------------------------------------------------------------
1442 | Returns the result of converting the single-precision floating-point value
1443 | `a' to the 64-bit two's complement integer format. The conversion is
1444 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1445 | Arithmetic, except that the conversion is always rounded toward zero. If
1446 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1447 | conversion overflows, the largest integer with the same sign as `a' is
1448 | returned.
1449 *----------------------------------------------------------------------------*/
1450
1451 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1452 {
1453 flag aSign;
1454 int16 aExp, shiftCount;
1455 bits32 aSig;
1456 bits64 aSig64;
1457 int64 z;
1458
1459 aSig = extractFloat32Frac( a );
1460 aExp = extractFloat32Exp( a );
1461 aSign = extractFloat32Sign( a );
1462 shiftCount = aExp - 0xBE;
1463 if ( 0 <= shiftCount ) {
1464 if ( float32_val(a) != 0xDF000000 ) {
1465 float_raise( float_flag_invalid STATUS_VAR);
1466 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1467 return LIT64( 0x7FFFFFFFFFFFFFFF );
1468 }
1469 }
1470 return (sbits64) LIT64( 0x8000000000000000 );
1471 }
1472 else if ( aExp <= 0x7E ) {
1473 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1474 return 0;
1475 }
1476 aSig64 = aSig | 0x00800000;
1477 aSig64 <<= 40;
1478 z = aSig64>>( - shiftCount );
1479 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1480 STATUS(float_exception_flags) |= float_flag_inexact;
1481 }
1482 if ( aSign ) z = - z;
1483 return z;
1484
1485 }
1486
1487 /*----------------------------------------------------------------------------
1488 | Returns the result of converting the single-precision floating-point value
1489 | `a' to the double-precision floating-point format. The conversion is
1490 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1491 | Arithmetic.
1492 *----------------------------------------------------------------------------*/
1493
1494 float64 float32_to_float64( float32 a STATUS_PARAM )
1495 {
1496 flag aSign;
1497 int16 aExp;
1498 bits32 aSig;
1499
1500 aSig = extractFloat32Frac( a );
1501 aExp = extractFloat32Exp( a );
1502 aSign = extractFloat32Sign( a );
1503 if ( aExp == 0xFF ) {
1504 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1505 return packFloat64( aSign, 0x7FF, 0 );
1506 }
1507 if ( aExp == 0 ) {
1508 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1509 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1510 --aExp;
1511 }
1512 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1513
1514 }
1515
1516 #ifdef FLOATX80
1517
1518 /*----------------------------------------------------------------------------
1519 | Returns the result of converting the single-precision floating-point value
1520 | `a' to the extended double-precision floating-point format. The conversion
1521 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1522 | Arithmetic.
1523 *----------------------------------------------------------------------------*/
1524
1525 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1526 {
1527 flag aSign;
1528 int16 aExp;
1529 bits32 aSig;
1530
1531 aSig = extractFloat32Frac( a );
1532 aExp = extractFloat32Exp( a );
1533 aSign = extractFloat32Sign( a );
1534 if ( aExp == 0xFF ) {
1535 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1536 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1537 }
1538 if ( aExp == 0 ) {
1539 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1540 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1541 }
1542 aSig |= 0x00800000;
1543 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1544
1545 }
1546
1547 #endif
1548
1549 #ifdef FLOAT128
1550
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the double-precision floating-point format. The conversion is
1554 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1557
1558 float128 float32_to_float128( float32 a STATUS_PARAM )
1559 {
1560 flag aSign;
1561 int16 aExp;
1562 bits32 aSig;
1563
1564 aSig = extractFloat32Frac( a );
1565 aExp = extractFloat32Exp( a );
1566 aSign = extractFloat32Sign( a );
1567 if ( aExp == 0xFF ) {
1568 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1569 return packFloat128( aSign, 0x7FFF, 0, 0 );
1570 }
1571 if ( aExp == 0 ) {
1572 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1573 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1574 --aExp;
1575 }
1576 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1577
1578 }
1579
1580 #endif
1581
1582 /*----------------------------------------------------------------------------
1583 | Rounds the single-precision floating-point value `a' to an integer, and
1584 | returns the result as a single-precision floating-point value. The
1585 | operation is performed according to the IEC/IEEE Standard for Binary
1586 | Floating-Point Arithmetic.
1587 *----------------------------------------------------------------------------*/
1588
1589 float32 float32_round_to_int( float32 a STATUS_PARAM)
1590 {
1591 flag aSign;
1592 int16 aExp;
1593 bits32 lastBitMask, roundBitsMask;
1594 int8 roundingMode;
1595 bits32 z;
1596
1597 aExp = extractFloat32Exp( a );
1598 if ( 0x96 <= aExp ) {
1599 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1600 return propagateFloat32NaN( a, a STATUS_VAR );
1601 }
1602 return a;
1603 }
1604 if ( aExp <= 0x7E ) {
1605 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1606 STATUS(float_exception_flags) |= float_flag_inexact;
1607 aSign = extractFloat32Sign( a );
1608 switch ( STATUS(float_rounding_mode) ) {
1609 case float_round_nearest_even:
1610 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1611 return packFloat32( aSign, 0x7F, 0 );
1612 }
1613 break;
1614 case float_round_down:
1615 return make_float32(aSign ? 0xBF800000 : 0);
1616 case float_round_up:
1617 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1618 }
1619 return packFloat32( aSign, 0, 0 );
1620 }
1621 lastBitMask = 1;
1622 lastBitMask <<= 0x96 - aExp;
1623 roundBitsMask = lastBitMask - 1;
1624 z = float32_val(a);
1625 roundingMode = STATUS(float_rounding_mode);
1626 if ( roundingMode == float_round_nearest_even ) {
1627 z += lastBitMask>>1;
1628 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1629 }
1630 else if ( roundingMode != float_round_to_zero ) {
1631 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1632 z += roundBitsMask;
1633 }
1634 }
1635 z &= ~ roundBitsMask;
1636 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1637 return make_float32(z);
1638
1639 }
1640
1641 /*----------------------------------------------------------------------------
1642 | Returns the result of adding the absolute values of the single-precision
1643 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1644 | before being returned. `zSign' is ignored if the result is a NaN.
1645 | The addition is performed according to the IEC/IEEE Standard for Binary
1646 | Floating-Point Arithmetic.
1647 *----------------------------------------------------------------------------*/
1648
1649 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1650 {
1651 int16 aExp, bExp, zExp;
1652 bits32 aSig, bSig, zSig;
1653 int16 expDiff;
1654
1655 aSig = extractFloat32Frac( a );
1656 aExp = extractFloat32Exp( a );
1657 bSig = extractFloat32Frac( b );
1658 bExp = extractFloat32Exp( b );
1659 expDiff = aExp - bExp;
1660 aSig <<= 6;
1661 bSig <<= 6;
1662 if ( 0 < expDiff ) {
1663 if ( aExp == 0xFF ) {
1664 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1665 return a;
1666 }
1667 if ( bExp == 0 ) {
1668 --expDiff;
1669 }
1670 else {
1671 bSig |= 0x20000000;
1672 }
1673 shift32RightJamming( bSig, expDiff, &bSig );
1674 zExp = aExp;
1675 }
1676 else if ( expDiff < 0 ) {
1677 if ( bExp == 0xFF ) {
1678 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1679 return packFloat32( zSign, 0xFF, 0 );
1680 }
1681 if ( aExp == 0 ) {
1682 ++expDiff;
1683 }
1684 else {
1685 aSig |= 0x20000000;
1686 }
1687 shift32RightJamming( aSig, - expDiff, &aSig );
1688 zExp = bExp;
1689 }
1690 else {
1691 if ( aExp == 0xFF ) {
1692 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1693 return a;
1694 }
1695 if ( aExp == 0 ) {
1696 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1697 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1698 }
1699 zSig = 0x40000000 + aSig + bSig;
1700 zExp = aExp;
1701 goto roundAndPack;
1702 }
1703 aSig |= 0x20000000;
1704 zSig = ( aSig + bSig )<<1;
1705 --zExp;
1706 if ( (sbits32) zSig < 0 ) {
1707 zSig = aSig + bSig;
1708 ++zExp;
1709 }
1710 roundAndPack:
1711 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1712
1713 }
1714
1715 /*----------------------------------------------------------------------------
1716 | Returns the result of subtracting the absolute values of the single-
1717 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1718 | difference is negated before being returned. `zSign' is ignored if the
1719 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1720 | Standard for Binary Floating-Point Arithmetic.
1721 *----------------------------------------------------------------------------*/
1722
1723 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1724 {
1725 int16 aExp, bExp, zExp;
1726 bits32 aSig, bSig, zSig;
1727 int16 expDiff;
1728
1729 aSig = extractFloat32Frac( a );
1730 aExp = extractFloat32Exp( a );
1731 bSig = extractFloat32Frac( b );
1732 bExp = extractFloat32Exp( b );
1733 expDiff = aExp - bExp;
1734 aSig <<= 7;
1735 bSig <<= 7;
1736 if ( 0 < expDiff ) goto aExpBigger;
1737 if ( expDiff < 0 ) goto bExpBigger;
1738 if ( aExp == 0xFF ) {
1739 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1740 float_raise( float_flag_invalid STATUS_VAR);
1741 return float32_default_nan;
1742 }
1743 if ( aExp == 0 ) {
1744 aExp = 1;
1745 bExp = 1;
1746 }
1747 if ( bSig < aSig ) goto aBigger;
1748 if ( aSig < bSig ) goto bBigger;
1749 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1750 bExpBigger:
1751 if ( bExp == 0xFF ) {
1752 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1753 return packFloat32( zSign ^ 1, 0xFF, 0 );
1754 }
1755 if ( aExp == 0 ) {
1756 ++expDiff;
1757 }
1758 else {
1759 aSig |= 0x40000000;
1760 }
1761 shift32RightJamming( aSig, - expDiff, &aSig );
1762 bSig |= 0x40000000;
1763 bBigger:
1764 zSig = bSig - aSig;
1765 zExp = bExp;
1766 zSign ^= 1;
1767 goto normalizeRoundAndPack;
1768 aExpBigger:
1769 if ( aExp == 0xFF ) {
1770 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1771 return a;
1772 }
1773 if ( bExp == 0 ) {
1774 --expDiff;
1775 }
1776 else {
1777 bSig |= 0x40000000;
1778 }
1779 shift32RightJamming( bSig, expDiff, &bSig );
1780 aSig |= 0x40000000;
1781 aBigger:
1782 zSig = aSig - bSig;
1783 zExp = aExp;
1784 normalizeRoundAndPack:
1785 --zExp;
1786 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1787
1788 }
1789
1790 /*----------------------------------------------------------------------------
1791 | Returns the result of adding the single-precision floating-point values `a'
1792 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1793 | Binary Floating-Point Arithmetic.
1794 *----------------------------------------------------------------------------*/
1795
1796 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1797 {
1798 flag aSign, bSign;
1799
1800 aSign = extractFloat32Sign( a );
1801 bSign = extractFloat32Sign( b );
1802 if ( aSign == bSign ) {
1803 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1804 }
1805 else {
1806 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1807 }
1808
1809 }
1810
1811 /*----------------------------------------------------------------------------
1812 | Returns the result of subtracting the single-precision floating-point values
1813 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1814 | for Binary Floating-Point Arithmetic.
1815 *----------------------------------------------------------------------------*/
1816
1817 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1818 {
1819 flag aSign, bSign;
1820
1821 aSign = extractFloat32Sign( a );
1822 bSign = extractFloat32Sign( b );
1823 if ( aSign == bSign ) {
1824 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1825 }
1826 else {
1827 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1828 }
1829
1830 }
1831
1832 /*----------------------------------------------------------------------------
1833 | Returns the result of multiplying the single-precision floating-point values
1834 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1835 | for Binary Floating-Point Arithmetic.
1836 *----------------------------------------------------------------------------*/
1837
1838 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1839 {
1840 flag aSign, bSign, zSign;
1841 int16 aExp, bExp, zExp;
1842 bits32 aSig, bSig;
1843 bits64 zSig64;
1844 bits32 zSig;
1845
1846 aSig = extractFloat32Frac( a );
1847 aExp = extractFloat32Exp( a );
1848 aSign = extractFloat32Sign( a );
1849 bSig = extractFloat32Frac( b );
1850 bExp = extractFloat32Exp( b );
1851 bSign = extractFloat32Sign( b );
1852 zSign = aSign ^ bSign;
1853 if ( aExp == 0xFF ) {
1854 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1855 return propagateFloat32NaN( a, b STATUS_VAR );
1856 }
1857 if ( ( bExp | bSig ) == 0 ) {
1858 float_raise( float_flag_invalid STATUS_VAR);
1859 return float32_default_nan;
1860 }
1861 return packFloat32( zSign, 0xFF, 0 );
1862 }
1863 if ( bExp == 0xFF ) {
1864 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1865 if ( ( aExp | aSig ) == 0 ) {
1866 float_raise( float_flag_invalid STATUS_VAR);
1867 return float32_default_nan;
1868 }
1869 return packFloat32( zSign, 0xFF, 0 );
1870 }
1871 if ( aExp == 0 ) {
1872 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1873 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1874 }
1875 if ( bExp == 0 ) {
1876 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1877 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1878 }
1879 zExp = aExp + bExp - 0x7F;
1880 aSig = ( aSig | 0x00800000 )<<7;
1881 bSig = ( bSig | 0x00800000 )<<8;
1882 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1883 zSig = zSig64;
1884 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1885 zSig <<= 1;
1886 --zExp;
1887 }
1888 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1889
1890 }
1891
1892 /*----------------------------------------------------------------------------
1893 | Returns the result of dividing the single-precision floating-point value `a'
1894 | by the corresponding value `b'. The operation is performed according to the
1895 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1896 *----------------------------------------------------------------------------*/
1897
1898 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1899 {
1900 flag aSign, bSign, zSign;
1901 int16 aExp, bExp, zExp;
1902 bits32 aSig, bSig, zSig;
1903
1904 aSig = extractFloat32Frac( a );
1905 aExp = extractFloat32Exp( a );
1906 aSign = extractFloat32Sign( a );
1907 bSig = extractFloat32Frac( b );
1908 bExp = extractFloat32Exp( b );
1909 bSign = extractFloat32Sign( b );
1910 zSign = aSign ^ bSign;
1911 if ( aExp == 0xFF ) {
1912 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1913 if ( bExp == 0xFF ) {
1914 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1915 float_raise( float_flag_invalid STATUS_VAR);
1916 return float32_default_nan;
1917 }
1918 return packFloat32( zSign, 0xFF, 0 );
1919 }
1920 if ( bExp == 0xFF ) {
1921 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1922 return packFloat32( zSign, 0, 0 );
1923 }
1924 if ( bExp == 0 ) {
1925 if ( bSig == 0 ) {
1926 if ( ( aExp | aSig ) == 0 ) {
1927 float_raise( float_flag_invalid STATUS_VAR);
1928 return float32_default_nan;
1929 }
1930 float_raise( float_flag_divbyzero STATUS_VAR);
1931 return packFloat32( zSign, 0xFF, 0 );
1932 }
1933 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1934 }
1935 if ( aExp == 0 ) {
1936 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1937 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1938 }
1939 zExp = aExp - bExp + 0x7D;
1940 aSig = ( aSig | 0x00800000 )<<7;
1941 bSig = ( bSig | 0x00800000 )<<8;
1942 if ( bSig <= ( aSig + aSig ) ) {
1943 aSig >>= 1;
1944 ++zExp;
1945 }
1946 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1947 if ( ( zSig & 0x3F ) == 0 ) {
1948 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1949 }
1950 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1951
1952 }
1953
1954 /*----------------------------------------------------------------------------
1955 | Returns the remainder of the single-precision floating-point value `a'
1956 | with respect to the corresponding value `b'. The operation is performed
1957 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1958 *----------------------------------------------------------------------------*/
1959
1960 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1961 {
1962 flag aSign, zSign;
1963 int16 aExp, bExp, expDiff;
1964 bits32 aSig, bSig;
1965 bits32 q;
1966 bits64 aSig64, bSig64, q64;
1967 bits32 alternateASig;
1968 sbits32 sigMean;
1969
1970 aSig = extractFloat32Frac( a );
1971 aExp = extractFloat32Exp( a );
1972 aSign = extractFloat32Sign( a );
1973 bSig = extractFloat32Frac( b );
1974 bExp = extractFloat32Exp( b );
1975 if ( aExp == 0xFF ) {
1976 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1977 return propagateFloat32NaN( a, b STATUS_VAR );
1978 }
1979 float_raise( float_flag_invalid STATUS_VAR);
1980 return float32_default_nan;
1981 }
1982 if ( bExp == 0xFF ) {
1983 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1984 return a;
1985 }
1986 if ( bExp == 0 ) {
1987 if ( bSig == 0 ) {
1988 float_raise( float_flag_invalid STATUS_VAR);
1989 return float32_default_nan;
1990 }
1991 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1992 }
1993 if ( aExp == 0 ) {
1994 if ( aSig == 0 ) return a;
1995 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1996 }
1997 expDiff = aExp - bExp;
1998 aSig |= 0x00800000;
1999 bSig |= 0x00800000;
2000 if ( expDiff < 32 ) {
2001 aSig <<= 8;
2002 bSig <<= 8;
2003 if ( expDiff < 0 ) {
2004 if ( expDiff < -1 ) return a;
2005 aSig >>= 1;
2006 }
2007 q = ( bSig <= aSig );
2008 if ( q ) aSig -= bSig;
2009 if ( 0 < expDiff ) {
2010 q = ( ( (bits64) aSig )<<32 ) / bSig;
2011 q >>= 32 - expDiff;
2012 bSig >>= 2;
2013 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2014 }
2015 else {
2016 aSig >>= 2;
2017 bSig >>= 2;
2018 }
2019 }
2020 else {
2021 if ( bSig <= aSig ) aSig -= bSig;
2022 aSig64 = ( (bits64) aSig )<<40;
2023 bSig64 = ( (bits64) bSig )<<40;
2024 expDiff -= 64;
2025 while ( 0 < expDiff ) {
2026 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2027 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2028 aSig64 = - ( ( bSig * q64 )<<38 );
2029 expDiff -= 62;
2030 }
2031 expDiff += 64;
2032 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2033 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2034 q = q64>>( 64 - expDiff );
2035 bSig <<= 6;
2036 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2037 }
2038 do {
2039 alternateASig = aSig;
2040 ++q;
2041 aSig -= bSig;
2042 } while ( 0 <= (sbits32) aSig );
2043 sigMean = aSig + alternateASig;
2044 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2045 aSig = alternateASig;
2046 }
2047 zSign = ( (sbits32) aSig < 0 );
2048 if ( zSign ) aSig = - aSig;
2049 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2050
2051 }
2052
2053 /*----------------------------------------------------------------------------
2054 | Returns the square root of the single-precision floating-point value `a'.
2055 | The operation is performed according to the IEC/IEEE Standard for Binary
2056 | Floating-Point Arithmetic.
2057 *----------------------------------------------------------------------------*/
2058
2059 float32 float32_sqrt( float32 a STATUS_PARAM )
2060 {
2061 flag aSign;
2062 int16 aExp, zExp;
2063 bits32 aSig, zSig;
2064 bits64 rem, term;
2065
2066 aSig = extractFloat32Frac( a );
2067 aExp = extractFloat32Exp( a );
2068 aSign = extractFloat32Sign( a );
2069 if ( aExp == 0xFF ) {
2070 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2071 if ( ! aSign ) return a;
2072 float_raise( float_flag_invalid STATUS_VAR);
2073 return float32_default_nan;
2074 }
2075 if ( aSign ) {
2076 if ( ( aExp | aSig ) == 0 ) return a;
2077 float_raise( float_flag_invalid STATUS_VAR);
2078 return float32_default_nan;
2079 }
2080 if ( aExp == 0 ) {
2081 if ( aSig == 0 ) return float32_zero;
2082 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2083 }
2084 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2085 aSig = ( aSig | 0x00800000 )<<8;
2086 zSig = estimateSqrt32( aExp, aSig ) + 2;
2087 if ( ( zSig & 0x7F ) <= 5 ) {
2088 if ( zSig < 2 ) {
2089 zSig = 0x7FFFFFFF;
2090 goto roundAndPack;
2091 }
2092 aSig >>= aExp & 1;
2093 term = ( (bits64) zSig ) * zSig;
2094 rem = ( ( (bits64) aSig )<<32 ) - term;
2095 while ( (sbits64) rem < 0 ) {
2096 --zSig;
2097 rem += ( ( (bits64) zSig )<<1 ) | 1;
2098 }
2099 zSig |= ( rem != 0 );
2100 }
2101 shift32RightJamming( zSig, 1, &zSig );
2102 roundAndPack:
2103 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2104
2105 }
2106
2107 /*----------------------------------------------------------------------------
2108 | Returns the binary exponential of the single-precision floating-point value
2109 | `a'. The operation is performed according to the IEC/IEEE Standard for
2110 | Binary Floating-Point Arithmetic.
2111 |
2112 | Uses the following identities:
2113 |
2114 | 1. -------------------------------------------------------------------------
2115 | x x*ln(2)
2116 | 2 = e
2117 |
2118 | 2. -------------------------------------------------------------------------
2119 | 2 3 4 5 n
2120 | x x x x x x x
2121 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2122 | 1! 2! 3! 4! 5! n!
2123 *----------------------------------------------------------------------------*/
2124
2125 static const float64 float32_exp2_coefficients[15] =
2126 {
2127 make_float64( 0x3ff0000000000000ll ), /* 1 */
2128 make_float64( 0x3fe0000000000000ll ), /* 2 */
2129 make_float64( 0x3fc5555555555555ll ), /* 3 */
2130 make_float64( 0x3fa5555555555555ll ), /* 4 */
2131 make_float64( 0x3f81111111111111ll ), /* 5 */
2132 make_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2133 make_float64( 0x3f2a01a01a01a01all ), /* 7 */
2134 make_float64( 0x3efa01a01a01a01all ), /* 8 */
2135 make_float64( 0x3ec71de3a556c734ll ), /* 9 */
2136 make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2137 make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2138 make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2139 make_float64( 0x3de6124613a86d09ll ), /* 13 */
2140 make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2141 make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2142 };
2143
2144 float32 float32_exp2( float32 a STATUS_PARAM )
2145 {
2146 flag aSign;
2147 int16 aExp;
2148 bits32 aSig;
2149 float64 r, x, xn;
2150 int i;
2151
2152 aSig = extractFloat32Frac( a );
2153 aExp = extractFloat32Exp( a );
2154 aSign = extractFloat32Sign( a );
2155
2156 if ( aExp == 0xFF) {
2157 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2158 return (aSign) ? float32_zero : a;
2159 }
2160 if (aExp == 0) {
2161 if (aSig == 0) return float32_one;
2162 }
2163
2164 float_raise( float_flag_inexact STATUS_VAR);
2165
2166 /* ******************************* */
2167 /* using float64 for approximation */
2168 /* ******************************* */
2169 x = float32_to_float64(a STATUS_VAR);
2170 x = float64_mul(x, float64_ln2 STATUS_VAR);
2171
2172 xn = x;
2173 r = float64_one;
2174 for (i = 0 ; i < 15 ; i++) {
2175 float64 f;
2176
2177 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2178 r = float64_add(r, f STATUS_VAR);
2179
2180 xn = float64_mul(xn, x STATUS_VAR);
2181 }
2182
2183 return float64_to_float32(r, status);
2184 }
2185
2186 /*----------------------------------------------------------------------------
2187 | Returns the binary log of the single-precision floating-point value `a'.
2188 | The operation is performed according to the IEC/IEEE Standard for Binary
2189 | Floating-Point Arithmetic.
2190 *----------------------------------------------------------------------------*/
2191 float32 float32_log2( float32 a STATUS_PARAM )
2192 {
2193 flag aSign, zSign;
2194 int16 aExp;
2195 bits32 aSig, zSig, i;
2196
2197 aSig = extractFloat32Frac( a );
2198 aExp = extractFloat32Exp( a );
2199 aSign = extractFloat32Sign( a );
2200
2201 if ( aExp == 0 ) {
2202 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2203 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2204 }
2205 if ( aSign ) {
2206 float_raise( float_flag_invalid STATUS_VAR);
2207 return float32_default_nan;
2208 }
2209 if ( aExp == 0xFF ) {
2210 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2211 return a;
2212 }
2213
2214 aExp -= 0x7F;
2215 aSig |= 0x00800000;
2216 zSign = aExp < 0;
2217 zSig = aExp << 23;
2218
2219 for (i = 1 << 22; i > 0; i >>= 1) {
2220 aSig = ( (bits64)aSig * aSig ) >> 23;
2221 if ( aSig & 0x01000000 ) {
2222 aSig >>= 1;
2223 zSig |= i;
2224 }
2225 }
2226
2227 if ( zSign )
2228 zSig = -zSig;
2229
2230 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2231 }
2232
2233 /*----------------------------------------------------------------------------
2234 | Returns 1 if the single-precision floating-point value `a' is equal to
2235 | the corresponding value `b', and 0 otherwise. The comparison is performed
2236 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2237 *----------------------------------------------------------------------------*/
2238
2239 int float32_eq( float32 a, float32 b STATUS_PARAM )
2240 {
2241
2242 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2243 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2244 ) {
2245 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2246 float_raise( float_flag_invalid STATUS_VAR);
2247 }
2248 return 0;
2249 }
2250 return ( float32_val(a) == float32_val(b) ) ||
2251 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2252
2253 }
2254
2255 /*----------------------------------------------------------------------------
2256 | Returns 1 if the single-precision floating-point value `a' is less than
2257 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2258 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2259 | Arithmetic.
2260 *----------------------------------------------------------------------------*/
2261
2262 int float32_le( float32 a, float32 b STATUS_PARAM )
2263 {
2264 flag aSign, bSign;
2265 bits32 av, bv;
2266
2267 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2268 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2269 ) {
2270 float_raise( float_flag_invalid STATUS_VAR);
2271 return 0;
2272 }
2273 aSign = extractFloat32Sign( a );
2274 bSign = extractFloat32Sign( b );
2275 av = float32_val(a);
2276 bv = float32_val(b);
2277 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2278 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2279
2280 }
2281
2282 /*----------------------------------------------------------------------------
2283 | Returns 1 if the single-precision floating-point value `a' is less than
2284 | the corresponding value `b', and 0 otherwise. The comparison is performed
2285 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2286 *----------------------------------------------------------------------------*/
2287
2288 int float32_lt( float32 a, float32 b STATUS_PARAM )
2289 {
2290 flag aSign, bSign;
2291 bits32 av, bv;
2292
2293 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2294 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2295 ) {
2296 float_raise( float_flag_invalid STATUS_VAR);
2297 return 0;
2298 }
2299 aSign = extractFloat32Sign( a );
2300 bSign = extractFloat32Sign( b );
2301 av = float32_val(a);
2302 bv = float32_val(b);
2303 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2304 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2305
2306 }
2307
2308 /*----------------------------------------------------------------------------
2309 | Returns 1 if the single-precision floating-point value `a' is equal to
2310 | the corresponding value `b', and 0 otherwise. The invalid exception is
2311 | raised if either operand is a NaN. Otherwise, the comparison is performed
2312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2313 *----------------------------------------------------------------------------*/
2314
2315 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2316 {
2317 bits32 av, bv;
2318
2319 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2320 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2321 ) {
2322 float_raise( float_flag_invalid STATUS_VAR);
2323 return 0;
2324 }
2325 av = float32_val(a);
2326 bv = float32_val(b);
2327 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2328
2329 }
2330
2331 /*----------------------------------------------------------------------------
2332 | Returns 1 if the single-precision floating-point value `a' is less than or
2333 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2334 | cause an exception. Otherwise, the comparison is performed according to the
2335 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2336 *----------------------------------------------------------------------------*/
2337
2338 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2339 {
2340 flag aSign, bSign;
2341 bits32 av, bv;
2342
2343 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2344 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2345 ) {
2346 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2347 float_raise( float_flag_invalid STATUS_VAR);
2348 }
2349 return 0;
2350 }
2351 aSign = extractFloat32Sign( a );
2352 bSign = extractFloat32Sign( b );
2353 av = float32_val(a);
2354 bv = float32_val(b);
2355 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2356 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2357
2358 }
2359
2360 /*----------------------------------------------------------------------------
2361 | Returns 1 if the single-precision floating-point value `a' is less than
2362 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2363 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2364 | Standard for Binary Floating-Point Arithmetic.
2365 *----------------------------------------------------------------------------*/
2366
2367 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2368 {
2369 flag aSign, bSign;
2370 bits32 av, bv;
2371
2372 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2373 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2374 ) {
2375 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2376 float_raise( float_flag_invalid STATUS_VAR);
2377 }
2378 return 0;
2379 }
2380 aSign = extractFloat32Sign( a );
2381 bSign = extractFloat32Sign( b );
2382 av = float32_val(a);
2383 bv = float32_val(b);
2384 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2385 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2386
2387 }
2388
2389 /*----------------------------------------------------------------------------
2390 | Returns the result of converting the double-precision floating-point value
2391 | `a' to the 32-bit two's complement integer format. The conversion is
2392 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2393 | Arithmetic---which means in particular that the conversion is rounded
2394 | according to the current rounding mode. If `a' is a NaN, the largest
2395 | positive integer is returned. Otherwise, if the conversion overflows, the
2396 | largest integer with the same sign as `a' is returned.
2397 *----------------------------------------------------------------------------*/
2398
2399 int32 float64_to_int32( float64 a STATUS_PARAM )
2400 {
2401 flag aSign;
2402 int16 aExp, shiftCount;
2403 bits64 aSig;
2404
2405 aSig = extractFloat64Frac( a );
2406 aExp = extractFloat64Exp( a );
2407 aSign = extractFloat64Sign( a );
2408 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2409 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2410 shiftCount = 0x42C - aExp;
2411 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2412 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2413
2414 }
2415
2416 /*----------------------------------------------------------------------------
2417 | Returns the result of converting the double-precision floating-point value
2418 | `a' to the 32-bit two's complement integer format. The conversion is
2419 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2420 | Arithmetic, except that the conversion is always rounded toward zero.
2421 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2422 | the conversion overflows, the largest integer with the same sign as `a' is
2423 | returned.
2424 *----------------------------------------------------------------------------*/
2425
2426 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2427 {
2428 flag aSign;
2429 int16 aExp, shiftCount;
2430 bits64 aSig, savedASig;
2431 int32 z;
2432
2433 aSig = extractFloat64Frac( a );
2434 aExp = extractFloat64Exp( a );
2435 aSign = extractFloat64Sign( a );
2436 if ( 0x41E < aExp ) {
2437 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2438 goto invalid;
2439 }
2440 else if ( aExp < 0x3FF ) {
2441 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2442 return 0;
2443 }
2444 aSig |= LIT64( 0x0010000000000000 );
2445 shiftCount = 0x433 - aExp;
2446 savedASig = aSig;
2447 aSig >>= shiftCount;
2448 z = aSig;
2449 if ( aSign ) z = - z;
2450 if ( ( z < 0 ) ^ aSign ) {
2451 invalid:
2452 float_raise( float_flag_invalid STATUS_VAR);
2453 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2454 }
2455 if ( ( aSig<<shiftCount ) != savedASig ) {
2456 STATUS(float_exception_flags) |= float_flag_inexact;
2457 }
2458 return z;
2459
2460 }
2461
2462 /*----------------------------------------------------------------------------
2463 | Returns the result of converting the double-precision floating-point value
2464 | `a' to the 16-bit two's complement integer format. The conversion is
2465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2466 | Arithmetic, except that the conversion is always rounded toward zero.
2467 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2468 | the conversion overflows, the largest integer with the same sign as `a' is
2469 | returned.
2470 *----------------------------------------------------------------------------*/
2471
2472 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2473 {
2474 flag aSign;
2475 int16 aExp, shiftCount;
2476 bits64 aSig, savedASig;
2477 int32 z;
2478
2479 aSig = extractFloat64Frac( a );
2480 aExp = extractFloat64Exp( a );
2481 aSign = extractFloat64Sign( a );
2482 if ( 0x40E < aExp ) {
2483 if ( ( aExp == 0x7FF ) && aSig ) {
2484 aSign = 0;
2485 }
2486 goto invalid;
2487 }
2488 else if ( aExp < 0x3FF ) {
2489 if ( aExp || aSig ) {
2490 STATUS(float_exception_flags) |= float_flag_inexact;
2491 }
2492 return 0;
2493 }
2494 aSig |= LIT64( 0x0010000000000000 );
2495 shiftCount = 0x433 - aExp;
2496 savedASig = aSig;
2497 aSig >>= shiftCount;
2498 z = aSig;
2499 if ( aSign ) {
2500 z = - z;
2501 }
2502 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2503 invalid:
2504 float_raise( float_flag_invalid STATUS_VAR);
2505 return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2506 }
2507 if ( ( aSig<<shiftCount ) != savedASig ) {
2508 STATUS(float_exception_flags) |= float_flag_inexact;
2509 }
2510 return z;
2511 }
2512
2513 /*----------------------------------------------------------------------------
2514 | Returns the result of converting the double-precision floating-point value
2515 | `a' to the 64-bit two's complement integer format. The conversion is
2516 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2517 | Arithmetic---which means in particular that the conversion is rounded
2518 | according to the current rounding mode. If `a' is a NaN, the largest
2519 | positive integer is returned. Otherwise, if the conversion overflows, the
2520 | largest integer with the same sign as `a' is returned.
2521 *----------------------------------------------------------------------------*/
2522
2523 int64 float64_to_int64( float64 a STATUS_PARAM )
2524 {
2525 flag aSign;
2526 int16 aExp, shiftCount;
2527 bits64 aSig, aSigExtra;
2528
2529 aSig = extractFloat64Frac( a );
2530 aExp = extractFloat64Exp( a );
2531 aSign = extractFloat64Sign( a );
2532 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2533 shiftCount = 0x433 - aExp;
2534 if ( shiftCount <= 0 ) {
2535 if ( 0x43E < aExp ) {
2536 float_raise( float_flag_invalid STATUS_VAR);
2537 if ( ! aSign
2538 || ( ( aExp == 0x7FF )
2539 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2540 ) {
2541 return LIT64( 0x7FFFFFFFFFFFFFFF );
2542 }
2543 return (sbits64) LIT64( 0x8000000000000000 );
2544 }
2545 aSigExtra = 0;
2546 aSig <<= - shiftCount;
2547 }
2548 else {
2549 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2550 }
2551 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2552
2553 }
2554
2555 /*----------------------------------------------------------------------------
2556 | Returns the result of converting the double-precision floating-point value
2557 | `a' to the 64-bit two's complement integer format. The conversion is
2558 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2559 | Arithmetic, except that the conversion is always rounded toward zero.
2560 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2561 | the conversion overflows, the largest integer with the same sign as `a' is
2562 | returned.
2563 *----------------------------------------------------------------------------*/
2564
2565 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2566 {
2567 flag aSign;
2568 int16 aExp, shiftCount;
2569 bits64 aSig;
2570 int64 z;
2571
2572 aSig = extractFloat64Frac( a );
2573 aExp = extractFloat64Exp( a );
2574 aSign = extractFloat64Sign( a );
2575 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2576 shiftCount = aExp - 0x433;
2577 if ( 0 <= shiftCount ) {
2578 if ( 0x43E <= aExp ) {
2579 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2580 float_raise( float_flag_invalid STATUS_VAR);
2581 if ( ! aSign
2582 || ( ( aExp == 0x7FF )
2583 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2584 ) {
2585 return LIT64( 0x7FFFFFFFFFFFFFFF );
2586 }
2587 }
2588 return (sbits64) LIT64( 0x8000000000000000 );
2589 }
2590 z = aSig<<shiftCount;
2591 }
2592 else {
2593 if ( aExp < 0x3FE ) {
2594 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2595 return 0;
2596 }
2597 z = aSig>>( - shiftCount );
2598 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2599 STATUS(float_exception_flags) |= float_flag_inexact;
2600 }
2601 }
2602 if ( aSign ) z = - z;
2603 return z;
2604
2605 }
2606
2607 /*----------------------------------------------------------------------------
2608 | Returns the result of converting the double-precision floating-point value
2609 | `a' to the single-precision floating-point format. The conversion is
2610 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2611 | Arithmetic.
2612 *----------------------------------------------------------------------------*/
2613
2614 float32 float64_to_float32( float64 a STATUS_PARAM )
2615 {
2616 flag aSign;
2617 int16 aExp;
2618 bits64 aSig;
2619 bits32 zSig;
2620
2621 aSig = extractFloat64Frac( a );
2622 aExp = extractFloat64Exp( a );
2623 aSign = extractFloat64Sign( a );
2624 if ( aExp == 0x7FF ) {
2625 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2626 return packFloat32( aSign, 0xFF, 0 );
2627 }
2628 shift64RightJamming( aSig, 22, &aSig );
2629 zSig = aSig;
2630 if ( aExp || zSig ) {
2631 zSig |= 0x40000000;
2632 aExp -= 0x381;
2633 }
2634 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2635
2636 }
2637
2638
2639 /*----------------------------------------------------------------------------
2640 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2641 | half-precision floating-point value, returning the result. After being
2642 | shifted into the proper positions, the three fields are simply added
2643 | together to form the result. This means that any integer portion of `zSig'
2644 | will be added into the exponent. Since a properly normalized significand
2645 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2646 | than the desired result exponent whenever `zSig' is a complete, normalized
2647 | significand.
2648 *----------------------------------------------------------------------------*/
2649 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2650 {
2651 return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2652 }
2653
2654 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2655 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2656
2657 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2658 {
2659 flag aSign;
2660 int16 aExp;
2661 bits32 aSig;
2662
2663 aSign = a >> 15;
2664 aExp = (a >> 10) & 0x1f;
2665 aSig = a & 0x3ff;
2666
2667 if (aExp == 0x1f && ieee) {
2668 if (aSig) {
2669 /* Make sure correct exceptions are raised. */
2670 float32ToCommonNaN(a STATUS_VAR);
2671 aSig |= 0x200;
2672 }
2673 return packFloat32(aSign, 0xff, aSig << 13);
2674 }
2675 if (aExp == 0) {
2676 int8 shiftCount;
2677
2678 if (aSig == 0) {
2679 return packFloat32(aSign, 0, 0);
2680 }
2681
2682 shiftCount = countLeadingZeros32( aSig ) - 21;
2683 aSig = aSig << shiftCount;
2684 aExp = -shiftCount;
2685 }
2686 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2687 }
2688
2689 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2690 {
2691 flag aSign;
2692 int16 aExp;
2693 bits32 aSig;
2694 bits32 mask;
2695 bits32 increment;
2696 int8 roundingMode;
2697
2698 aSig = extractFloat32Frac( a );
2699 aExp = extractFloat32Exp( a );
2700 aSign = extractFloat32Sign( a );
2701 if ( aExp == 0xFF ) {
2702 if (aSig) {
2703 /* Make sure correct exceptions are raised. */
2704 float32ToCommonNaN(a STATUS_VAR);
2705 aSig |= 0x00400000;
2706 }
2707 return packFloat16(aSign, 0x1f, aSig >> 13);
2708 }
2709 if (aExp == 0 && aSign == 0) {
2710 return packFloat16(aSign, 0, 0);
2711 }
2712 /* Decimal point between bits 22 and 23. */
2713 aSig |= 0x00800000;
2714 aExp -= 0x7f;
2715 if (aExp < -14) {
2716 mask = 0x007fffff;
2717 if (aExp < -24) {
2718 aExp = -25;
2719 } else {
2720 mask >>= 24 + aExp;
2721 }
2722 } else {
2723 mask = 0x00001fff;
2724 }
2725 if (aSig & mask) {
2726 float_raise( float_flag_underflow STATUS_VAR );
2727 roundingMode = STATUS(float_rounding_mode);
2728 switch (roundingMode) {
2729 case float_round_nearest_even:
2730 increment = (mask + 1) >> 1;
2731 if ((aSig & mask) == increment) {
2732 increment = aSig & (increment << 1);
2733 }
2734 break;
2735 case float_round_up:
2736 increment = aSign ? 0 : mask;
2737 break;
2738 case float_round_down:
2739 increment = aSign ? mask : 0;
2740 break;
2741 default: /* round_to_zero */
2742 increment = 0;
2743 break;
2744 }
2745 aSig += increment;
2746 if (aSig >= 0x01000000) {
2747 aSig >>= 1;
2748 aExp++;
2749 }
2750 } else if (aExp < -14
2751 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2752 float_raise( float_flag_underflow STATUS_VAR);
2753 }
2754
2755 if (ieee) {
2756 if (aExp > 15) {
2757 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2758 return packFloat16(aSign, 0x1f, 0);
2759 }
2760 } else {
2761 if (aExp > 16) {
2762 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2763 return packFloat16(aSign, 0x1f, 0x3ff);
2764 }
2765 }
2766 if (aExp < -24) {
2767 return packFloat16(aSign, 0, 0);
2768 }
2769 if (aExp < -14) {
2770 aSig >>= -14 - aExp;
2771 aExp = -14;
2772 }
2773 return packFloat16(aSign, aExp + 14, aSig >> 13);
2774 }
2775
2776 #ifdef FLOATX80
2777
2778 /*----------------------------------------------------------------------------
2779 | Returns the result of converting the double-precision floating-point value
2780 | `a' to the extended double-precision floating-point format. The conversion
2781 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2782 | Arithmetic.
2783 *----------------------------------------------------------------------------*/
2784
2785 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2786 {
2787 flag aSign;
2788 int16 aExp;
2789 bits64 aSig;
2790
2791 aSig = extractFloat64Frac( a );
2792 aExp = extractFloat64Exp( a );
2793 aSign = extractFloat64Sign( a );
2794 if ( aExp == 0x7FF ) {
2795 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2796 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2797 }
2798 if ( aExp == 0 ) {
2799 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2800 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2801 }
2802 return
2803 packFloatx80(
2804 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2805
2806 }
2807
2808 #endif
2809
2810 #ifdef FLOAT128
2811
2812 /*----------------------------------------------------------------------------
2813 | Returns the result of converting the double-precision floating-point value
2814 | `a' to the quadruple-precision floating-point format. The conversion is
2815 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2816 | Arithmetic.
2817 *----------------------------------------------------------------------------*/
2818
2819 float128 float64_to_float128( float64 a STATUS_PARAM )
2820 {
2821 flag aSign;
2822 int16 aExp;
2823 bits64 aSig, zSig0, zSig1;
2824
2825 aSig = extractFloat64Frac( a );
2826 aExp = extractFloat64Exp( a );
2827 aSign = extractFloat64Sign( a );
2828 if ( aExp == 0x7FF ) {
2829 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2830 return packFloat128( aSign, 0x7FFF, 0, 0 );
2831 }
2832 if ( aExp == 0 ) {
2833 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2834 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2835 --aExp;
2836 }
2837 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2838 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2839
2840 }
2841
2842 #endif
2843
2844 /*----------------------------------------------------------------------------
2845 | Rounds the double-precision floating-point value `a' to an integer, and
2846 | returns the result as a double-precision floating-point value. The
2847 | operation is performed according to the IEC/IEEE Standard for Binary
2848 | Floating-Point Arithmetic.
2849 *----------------------------------------------------------------------------*/
2850
2851 float64 float64_round_to_int( float64 a STATUS_PARAM )
2852 {
2853 flag aSign;
2854 int16 aExp;
2855 bits64 lastBitMask, roundBitsMask;
2856 int8 roundingMode;
2857 bits64 z;
2858
2859 aExp = extractFloat64Exp( a );
2860 if ( 0x433 <= aExp ) {
2861 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2862 return propagateFloat64NaN( a, a STATUS_VAR );
2863 }
2864 return a;
2865 }
2866 if ( aExp < 0x3FF ) {
2867 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2868 STATUS(float_exception_flags) |= float_flag_inexact;
2869 aSign = extractFloat64Sign( a );
2870 switch ( STATUS(float_rounding_mode) ) {
2871 case float_round_nearest_even:
2872 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2873 return packFloat64( aSign, 0x3FF, 0 );
2874 }
2875 break;
2876 case float_round_down:
2877 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2878 case float_round_up:
2879 return make_float64(
2880 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2881 }
2882 return packFloat64( aSign, 0, 0 );
2883 }
2884 lastBitMask = 1;
2885 lastBitMask <<= 0x433 - aExp;
2886 roundBitsMask = lastBitMask - 1;
2887 z = float64_val(a);
2888 roundingMode = STATUS(float_rounding_mode);
2889 if ( roundingMode == float_round_nearest_even ) {
2890 z += lastBitMask>>1;
2891 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2892 }
2893 else if ( roundingMode != float_round_to_zero ) {
2894 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2895 z += roundBitsMask;
2896 }
2897 }
2898 z &= ~ roundBitsMask;
2899 if ( z != float64_val(a) )
2900 STATUS(float_exception_flags) |= float_flag_inexact;
2901 return make_float64(z);
2902
2903 }
2904
2905 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2906 {
2907 int oldmode;
2908 float64 res;
2909 oldmode = STATUS(float_rounding_mode);
2910 STATUS(float_rounding_mode) = float_round_to_zero;
2911 res = float64_round_to_int(a STATUS_VAR);
2912 STATUS(float_rounding_mode) = oldmode;
2913 return res;
2914 }
2915
2916 /*----------------------------------------------------------------------------
2917 | Returns the result of adding the absolute values of the double-precision
2918 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2919 | before being returned. `zSign' is ignored if the result is a NaN.
2920 | The addition is performed according to the IEC/IEEE Standard for Binary
2921 | Floating-Point Arithmetic.
2922 *----------------------------------------------------------------------------*/
2923
2924 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2925 {
2926 int16 aExp, bExp, zExp;
2927 bits64 aSig, bSig, zSig;
2928 int16 expDiff;
2929
2930 aSig = extractFloat64Frac( a );
2931 aExp = extractFloat64Exp( a );
2932 bSig = extractFloat64Frac( b );
2933 bExp = extractFloat64Exp( b );
2934 expDiff = aExp - bExp;
2935 aSig <<= 9;
2936 bSig <<= 9;
2937 if ( 0 < expDiff ) {
2938 if ( aExp == 0x7FF ) {
2939 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2940 return a;
2941 }
2942 if ( bExp == 0 ) {
2943 --expDiff;
2944 }
2945 else {
2946 bSig |= LIT64( 0x2000000000000000 );
2947 }
2948 shift64RightJamming( bSig, expDiff, &bSig );
2949 zExp = aExp;
2950 }
2951 else if ( expDiff < 0 ) {
2952 if ( bExp == 0x7FF ) {
2953 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2954 return packFloat64( zSign, 0x7FF, 0 );
2955 }
2956 if ( aExp == 0 ) {
2957 ++expDiff;
2958 }
2959 else {
2960 aSig |= LIT64( 0x2000000000000000 );
2961 }
2962 shift64RightJamming( aSig, - expDiff, &aSig );
2963 zExp = bExp;
2964 }
2965 else {
2966 if ( aExp == 0x7FF ) {
2967 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2968 return a;
2969 }
2970 if ( aExp == 0 ) {
2971 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2972 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2973 }
2974 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2975 zExp = aExp;
2976 goto roundAndPack;
2977 }
2978 aSig |= LIT64( 0x2000000000000000 );
2979 zSig = ( aSig + bSig )<<1;
2980 --zExp;
2981 if ( (sbits64) zSig < 0 ) {
2982 zSig = aSig + bSig;
2983 ++zExp;
2984 }
2985 roundAndPack:
2986 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2987
2988 }
2989
2990 /*----------------------------------------------------------------------------
2991 | Returns the result of subtracting the absolute values of the double-
2992 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2993 | difference is negated before being returned. `zSign' is ignored if the
2994 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2995 | Standard for Binary Floating-Point Arithmetic.
2996 *----------------------------------------------------------------------------*/
2997
2998 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2999 {
3000 int16 aExp, bExp, zExp;
3001 bits64 aSig, bSig, zSig;
3002 int16 expDiff;
3003
3004 aSig = extractFloat64Frac( a );
3005 aExp = extractFloat64Exp( a );
3006 bSig = extractFloat64Frac( b );
3007 bExp = extractFloat64Exp( b );
3008 expDiff = aExp - bExp;
3009 aSig <<= 10;
3010 bSig <<= 10;
3011 if ( 0 < expDiff ) goto aExpBigger;
3012 if ( expDiff < 0 ) goto bExpBigger;
3013 if ( aExp == 0x7FF ) {
3014 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3015 float_raise( float_flag_invalid STATUS_VAR);
3016 return float64_default_nan;
3017 }
3018 if ( aExp == 0 ) {
3019 aExp = 1;
3020 bExp = 1;
3021 }
3022 if ( bSig < aSig ) goto aBigger;
3023 if ( aSig < bSig ) goto bBigger;
3024 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3025 bExpBigger:
3026 if ( bExp == 0x7FF ) {
3027 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3028 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3029 }
3030 if ( aExp == 0 ) {
3031 ++expDiff;
3032 }
3033 else {
3034 aSig |= LIT64( 0x4000000000000000 );
3035 }
3036 shift64RightJamming( aSig, - expDiff, &aSig );
3037 bSig |= LIT64( 0x4000000000000000 );
3038 bBigger:
3039 zSig = bSig - aSig;
3040 zExp = bExp;
3041 zSign ^= 1;
3042 goto normalizeRoundAndPack;
3043 aExpBigger:
3044 if ( aExp == 0x7FF ) {
3045 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3046 return a;
3047 }
3048 if ( bExp == 0 ) {
3049 --expDiff;
3050 }
3051 else {
3052 bSig |= LIT64( 0x4000000000000000 );
3053 }
3054 shift64RightJamming( bSig, expDiff, &bSig );
3055 aSig |= LIT64( 0x4000000000000000 );
3056 aBigger:
3057 zSig = aSig - bSig;
3058 zExp = aExp;
3059 normalizeRoundAndPack:
3060 --zExp;
3061 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3062
3063 }
3064
3065 /*----------------------------------------------------------------------------
3066 | Returns the result of adding the double-precision floating-point values `a'
3067 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3068 | Binary Floating-Point Arithmetic.
3069 *----------------------------------------------------------------------------*/
3070
3071 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3072 {
3073 flag aSign, bSign;
3074
3075 aSign = extractFloat64Sign( a );
3076 bSign = extractFloat64Sign( b );
3077 if ( aSign == bSign ) {
3078 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3079 }
3080 else {
3081 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3082 }
3083
3084 }
3085
3086 /*----------------------------------------------------------------------------
3087 | Returns the result of subtracting the double-precision floating-point values
3088 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3089 | for Binary Floating-Point Arithmetic.
3090 *----------------------------------------------------------------------------*/
3091
3092 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3093 {
3094 flag aSign, bSign;
3095
3096 aSign = extractFloat64Sign( a );
3097 bSign = extractFloat64Sign( b );
3098 if ( aSign == bSign ) {
3099 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3100 }
3101 else {
3102 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3103 }
3104
3105 }
3106
3107 /*----------------------------------------------------------------------------
3108 | Returns the result of multiplying the double-precision floating-point values
3109 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3110 | for Binary Floating-Point Arithmetic.
3111 *----------------------------------------------------------------------------*/
3112
3113 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3114 {
3115 flag aSign, bSign, zSign;
3116 int16 aExp, bExp, zExp;
3117 bits64 aSig, bSig, zSig0, zSig1;
3118
3119 aSig = extractFloat64Frac( a );
3120 aExp = extractFloat64Exp( a );
3121 aSign = extractFloat64Sign( a );
3122 bSig = extractFloat64Frac( b );
3123 bExp = extractFloat64Exp( b );
3124 bSign = extractFloat64Sign( b );
3125 zSign = aSign ^ bSign;
3126 if ( aExp == 0x7FF ) {
3127 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3128 return propagateFloat64NaN( a, b STATUS_VAR );
3129 }
3130 if ( ( bExp | bSig ) == 0 ) {
3131 float_raise( float_flag_invalid STATUS_VAR);
3132 return float64_default_nan;
3133 }
3134 return packFloat64( zSign, 0x7FF, 0 );
3135 }
3136 if ( bExp == 0x7FF ) {
3137 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3138 if ( ( aExp | aSig ) == 0 ) {
3139 float_raise( float_flag_invalid STATUS_VAR);
3140 return float64_default_nan;
3141 }
3142 return packFloat64( zSign, 0x7FF, 0 );
3143 }
3144 if ( aExp == 0 ) {
3145 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3146 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3147 }
3148 if ( bExp == 0 ) {
3149 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3150 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3151 }
3152 zExp = aExp + bExp - 0x3FF;
3153 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3154 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3155 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3156 zSig0 |= ( zSig1 != 0 );
3157 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3158 zSig0 <<= 1;
3159 --zExp;
3160 }
3161 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3162
3163 }
3164
3165 /*----------------------------------------------------------------------------
3166 | Returns the result of dividing the double-precision floating-point value `a'
3167 | by the corresponding value `b'. The operation is performed according to
3168 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3169 *----------------------------------------------------------------------------*/
3170
3171 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3172 {
3173 flag aSign, bSign, zSign;
3174 int16 aExp, bExp, zExp;
3175 bits64 aSig, bSig, zSig;
3176 bits64 rem0, rem1;
3177 bits64 term0, term1;
3178
3179 aSig = extractFloat64Frac( a );
3180 aExp = extractFloat64Exp( a );
3181 aSign = extractFloat64Sign( a );
3182 bSig = extractFloat64Frac( b );
3183 bExp = extractFloat64Exp( b );
3184 bSign = extractFloat64Sign( b );
3185 zSign = aSign ^ bSign;
3186 if ( aExp == 0x7FF ) {
3187 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3188 if ( bExp == 0x7FF ) {
3189 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3190 float_raise( float_flag_invalid STATUS_VAR);
3191 return float64_default_nan;
3192 }
3193 return packFloat64( zSign, 0x7FF, 0 );
3194 }
3195 if ( bExp == 0x7FF ) {
3196 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3197 return packFloat64( zSign, 0, 0 );
3198 }
3199 if ( bExp == 0 ) {
3200 if ( bSig == 0 ) {
3201 if ( ( aExp | aSig ) == 0 ) {
3202 float_raise( float_flag_invalid STATUS_VAR);
3203 return float64_default_nan;
3204 }
3205 float_raise( float_flag_divbyzero STATUS_VAR);
3206 return packFloat64( zSign, 0x7FF, 0 );
3207 }
3208 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3209 }
3210 if ( aExp == 0 ) {
3211 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3212 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3213 }
3214 zExp = aExp - bExp + 0x3FD;
3215 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3216 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3217 if ( bSig <= ( aSig + aSig ) ) {
3218 aSig >>= 1;
3219 ++zExp;
3220 }
3221 zSig = estimateDiv128To64( aSig, 0, bSig );
3222 if ( ( zSig & 0x1FF ) <= 2 ) {
3223 mul64To128( bSig, zSig, &term0, &term1 );
3224 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3225 while ( (sbits64) rem0 < 0 ) {
3226 --zSig;
3227 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3228 }
3229 zSig |= ( rem1 != 0 );
3230 }
3231 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3232
3233 }
3234
3235 /*----------------------------------------------------------------------------
3236 | Returns the remainder of the double-precision floating-point value `a'
3237 | with respect to the corresponding value `b'. The operation is performed
3238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3239 *----------------------------------------------------------------------------*/
3240
3241 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3242 {
3243 flag aSign, zSign;
3244 int16 aExp, bExp, expDiff;
3245 bits64 aSig, bSig;
3246 bits64 q, alternateASig;
3247 sbits64 sigMean;
3248
3249 aSig = extractFloat64Frac( a );
3250 aExp = extractFloat64Exp( a );
3251 aSign = extractFloat64Sign( a );
3252 bSig = extractFloat64Frac( b );
3253 bExp = extractFloat64Exp( b );
3254 if ( aExp == 0x7FF ) {
3255 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3256 return propagateFloat64NaN( a, b STATUS_VAR );
3257 }
3258 float_raise( float_flag_invalid STATUS_VAR);
3259 return float64_default_nan;
3260 }
3261 if ( bExp == 0x7FF ) {
3262 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3263 return a;
3264 }
3265 if ( bExp == 0 ) {
3266 if ( bSig == 0 ) {
3267 float_raise( float_flag_invalid STATUS_VAR);
3268 return float64_default_nan;
3269 }
3270 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3271 }
3272 if ( aExp == 0 ) {
3273 if ( aSig == 0 ) return a;
3274 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3275 }
3276 expDiff = aExp - bExp;
3277 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3278 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3279 if ( expDiff < 0 ) {
3280 if ( expDiff < -1 ) return a;
3281 aSig >>= 1;
3282 }
3283 q = ( bSig <= aSig );
3284 if ( q ) aSig -= bSig;
3285 expDiff -= 64;
3286 while ( 0 < expDiff ) {
3287 q = estimateDiv128To64( aSig, 0, bSig );
3288 q = ( 2 < q ) ? q - 2 : 0;
3289 aSig = - ( ( bSig>>2 ) * q );
3290 expDiff -= 62;
3291 }
3292 expDiff += 64;
3293 if ( 0 < expDiff ) {
3294 q = estimateDiv128To64( aSig, 0, bSig );
3295 q = ( 2 < q ) ? q - 2 : 0;
3296 q >>= 64 - expDiff;
3297 bSig >>= 2;
3298 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3299 }
3300 else {
3301 aSig >>= 2;
3302 bSig >>= 2;
3303 }
3304 do {
3305 alternateASig = aSig;
3306 ++q;
3307 aSig -= bSig;
3308 } while ( 0 <= (sbits64) aSig );
3309 sigMean = aSig + alternateASig;
3310 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3311 aSig = alternateASig;
3312 }
3313 zSign = ( (sbits64) aSig < 0 );
3314 if ( zSign ) aSig = - aSig;
3315 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3316
3317 }
3318
3319 /*----------------------------------------------------------------------------
3320 | Returns the square root of the double-precision floating-point value `a'.
3321 | The operation is performed according to the IEC/IEEE Standard for Binary
3322 | Floating-Point Arithmetic.
3323 *----------------------------------------------------------------------------*/
3324
3325 float64 float64_sqrt( float64 a STATUS_PARAM )
3326 {
3327 flag aSign;
3328 int16 aExp, zExp;
3329 bits64 aSig, zSig, doubleZSig;
3330 bits64 rem0, rem1, term0, term1;
3331
3332 aSig = extractFloat64Frac( a );
3333 aExp = extractFloat64Exp( a );
3334 aSign = extractFloat64Sign( a );
3335 if ( aExp == 0x7FF ) {
3336 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3337 if ( ! aSign ) return a;
3338 float_raise( float_flag_invalid STATUS_VAR);
3339 return float64_default_nan;
3340 }
3341 if ( aSign ) {
3342 if ( ( aExp | aSig ) == 0 ) return a;
3343 float_raise( float_flag_invalid STATUS_VAR);
3344 return float64_default_nan;
3345 }
3346 if ( aExp == 0 ) {
3347 if ( aSig == 0 ) return float64_zero;
3348 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3349 }
3350 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3351 aSig |= LIT64( 0x0010000000000000 );
3352 zSig = estimateSqrt32( aExp, aSig>>21 );
3353 aSig <<= 9 - ( aExp & 1 );
3354 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3355 if ( ( zSig & 0x1FF ) <= 5 ) {
3356 doubleZSig = zSig<<1;
3357 mul64To128( zSig, zSig, &term0, &term1 );
3358 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3359 while ( (sbits64) rem0 < 0 ) {
3360 --zSig;
3361 doubleZSig -= 2;
3362 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3363 }
3364 zSig |= ( ( rem0 | rem1 ) != 0 );
3365 }
3366 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3367
3368 }
3369
3370 /*----------------------------------------------------------------------------
3371 | Returns the binary log of the double-precision floating-point value `a'.
3372 | The operation is performed according to the IEC/IEEE Standard for Binary
3373 | Floating-Point Arithmetic.
3374 *----------------------------------------------------------------------------*/
3375 float64 float64_log2( float64 a STATUS_PARAM )
3376 {
3377 flag aSign, zSign;
3378 int16 aExp;
3379 bits64 aSig, aSig0, aSig1, zSig, i;
3380
3381 aSig = extractFloat64Frac( a );
3382 aExp = extractFloat64Exp( a );
3383 aSign = extractFloat64Sign( a );
3384
3385 if ( aExp == 0 ) {
3386 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3387 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3388 }
3389 if ( aSign ) {
3390 float_raise( float_flag_invalid STATUS_VAR);
3391 return float64_default_nan;
3392 }
3393 if ( aExp == 0x7FF ) {
3394 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3395 return a;
3396 }
3397
3398 aExp -= 0x3FF;
3399 aSig |= LIT64( 0x0010000000000000 );
3400 zSign = aExp < 0;
3401 zSig = (bits64)aExp << 52;
3402 for (i = 1LL << 51; i > 0; i >>= 1) {
3403 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3404 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3405 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3406 aSig >>= 1;
3407 zSig |= i;
3408 }
3409 }
3410
3411 if ( zSign )
3412 zSig = -zSig;
3413 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3414 }
3415
3416 /*----------------------------------------------------------------------------
3417 | Returns 1 if the double-precision floating-point value `a' is equal to the
3418 | corresponding value `b', and 0 otherwise. The comparison is performed
3419 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3420 *----------------------------------------------------------------------------*/
3421
3422 int float64_eq( float64 a, float64 b STATUS_PARAM )
3423 {
3424 bits64 av, bv;
3425
3426 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3427 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3428 ) {
3429 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3430 float_raise( float_flag_invalid STATUS_VAR);
3431 }
3432 return 0;
3433 }
3434 av = float64_val(a);
3435 bv = float64_val(b);
3436 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3437
3438 }
3439
3440 /*----------------------------------------------------------------------------
3441 | Returns 1 if the double-precision floating-point value `a' is less than or
3442 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3443 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3444 | Arithmetic.
3445 *----------------------------------------------------------------------------*/
3446
3447 int float64_le( float64 a, float64 b STATUS_PARAM )
3448 {
3449 flag aSign, bSign;
3450 bits64 av, bv;
3451
3452 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3453 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3454 ) {
3455 float_raise( float_flag_invalid STATUS_VAR);
3456 return 0;
3457 }
3458 aSign = extractFloat64Sign( a );
3459 bSign = extractFloat64Sign( b );
3460 av = float64_val(a);
3461 bv = float64_val(b);
3462 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3463 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3464
3465 }
3466
3467 /*----------------------------------------------------------------------------
3468 | Returns 1 if the double-precision floating-point value `a' is less than
3469 | the corresponding value `b', and 0 otherwise. The comparison is performed
3470 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3471 *----------------------------------------------------------------------------*/
3472
3473 int float64_lt( float64 a, float64 b STATUS_PARAM )
3474 {
3475 flag aSign, bSign;
3476 bits64 av, bv;
3477
3478 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3479 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3480 ) {
3481 float_raise( float_flag_invalid STATUS_VAR);
3482 return 0;
3483 }
3484 aSign = extractFloat64Sign( a );
3485 bSign = extractFloat64Sign( b );
3486 av = float64_val(a);
3487 bv = float64_val(b);
3488 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3489 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3490
3491 }
3492
3493 /*----------------------------------------------------------------------------
3494 | Returns 1 if the double-precision floating-point value `a' is equal to the
3495 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3496 | if either operand is a NaN. Otherwise, the comparison is performed
3497 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3498 *----------------------------------------------------------------------------*/
3499
3500 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3501 {
3502 bits64 av, bv;
3503
3504 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3505 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3506 ) {
3507 float_raise( float_flag_invalid STATUS_VAR);
3508 return 0;
3509 }
3510 av = float64_val(a);
3511 bv = float64_val(b);
3512 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3513
3514 }
3515
3516 /*----------------------------------------------------------------------------
3517 | Returns 1 if the double-precision floating-point value `a' is less than or
3518 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3519 | cause an exception. Otherwise, the comparison is performed according to the
3520 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3521 *----------------------------------------------------------------------------*/
3522
3523 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3524 {
3525 flag aSign, bSign;
3526 bits64 av, bv;
3527
3528 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3529 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3530 ) {
3531 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3532 float_raise( float_flag_invalid STATUS_VAR);
3533 }
3534 return 0;
3535 }
3536 aSign = extractFloat64Sign( a );
3537 bSign = extractFloat64Sign( b );
3538 av = float64_val(a);
3539 bv = float64_val(b);
3540 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3541 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3542
3543 }
3544
3545 /*----------------------------------------------------------------------------
3546 | Returns 1 if the double-precision floating-point value `a' is less than
3547 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3548 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3549 | Standard for Binary Floating-Point Arithmetic.
3550 *----------------------------------------------------------------------------*/
3551
3552 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3553 {
3554 flag aSign, bSign;
3555 bits64 av, bv;
3556
3557 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3558 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3559 ) {
3560 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3561 float_raise( float_flag_invalid STATUS_VAR);
3562 }
3563 return 0;
3564 }
3565 aSign = extractFloat64Sign( a );
3566 bSign = extractFloat64Sign( b );
3567 av = float64_val(a);
3568 bv = float64_val(b);
3569 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3570 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3571
3572 }
3573
3574 #ifdef FLOATX80
3575
3576 /*----------------------------------------------------------------------------
3577 | Returns the result of converting the extended double-precision floating-
3578 | point value `a' to the 32-bit two's complement integer format. The
3579 | conversion is performed according to the IEC/IEEE Standard for Binary
3580 | Floating-Point Arithmetic---which means in particular that the conversion
3581 | is rounded according to the current rounding mode. If `a' is a NaN, the
3582 | largest positive integer is returned. Otherwise, if the conversion
3583 | overflows, the largest integer with the same sign as `a' is returned.
3584 *----------------------------------------------------------------------------*/
3585
3586 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3587 {
3588 flag aSign;
3589 int32 aExp, shiftCount;
3590 bits64 aSig;
3591
3592 aSig = extractFloatx80Frac( a );
3593 aExp = extractFloatx80Exp( a );
3594 aSign = extractFloatx80Sign( a );
3595 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3596 shiftCount = 0x4037 - aExp;
3597 if ( shiftCount <= 0 ) shiftCount = 1;
3598 shift64RightJamming( aSig, shiftCount, &aSig );
3599 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3600
3601 }
3602
3603 /*----------------------------------------------------------------------------
3604 | Returns the result of converting the extended double-precision floating-
3605 | point value `a' to the 32-bit two's complement integer format. The
3606 | conversion is performed according to the IEC/IEEE Standard for Binary
3607 | Floating-Point Arithmetic, except that the conversion is always rounded
3608 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3609 | Otherwise, if the conversion overflows, the largest integer with the same
3610 | sign as `a' is returned.
3611 *----------------------------------------------------------------------------*/
3612
3613 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3614 {
3615 flag aSign;
3616 int32 aExp, shiftCount;
3617 bits64 aSig, savedASig;
3618 int32 z;
3619
3620 aSig = extractFloatx80Frac( a );
3621 aExp = extractFloatx80Exp( a );
3622 aSign = extractFloatx80Sign( a );
3623 if ( 0x401E < aExp ) {
3624 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3625 goto invalid;
3626 }
3627 else if ( aExp < 0x3FFF ) {
3628 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3629 return 0;
3630 }
3631 shiftCount = 0x403E - aExp;
3632 savedASig = aSig;
3633 aSig >>= shiftCount;
3634 z = aSig;
3635 if ( aSign ) z = - z;
3636 if ( ( z < 0 ) ^ aSign ) {
3637 invalid:
3638 float_raise( float_flag_invalid STATUS_VAR);
3639 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3640 }
3641 if ( ( aSig<<shiftCount ) != savedASig ) {
3642 STATUS(float_exception_flags) |= float_flag_inexact;
3643 }
3644 return z;
3645
3646 }
3647
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of converting the extended double-precision floating-
3650 | point value `a' to the 64-bit two's complement integer format. The
3651 | conversion is performed according to the IEC/IEEE Standard for Binary
3652 | Floating-Point Arithmetic---which means in particular that the conversion
3653 | is rounded according to the current rounding mode. If `a' is a NaN,
3654 | the largest positive integer is returned. Otherwise, if the conversion
3655 | overflows, the largest integer with the same sign as `a' is returned.
3656 *----------------------------------------------------------------------------*/
3657
3658 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3659 {
3660 flag aSign;
3661 int32 aExp, shiftCount;
3662 bits64 aSig, aSigExtra;
3663
3664 aSig = extractFloatx80Frac( a );
3665 aExp = extractFloatx80Exp( a );
3666 aSign = extractFloatx80Sign( a );
3667 shiftCount = 0x403E - aExp;
3668 if ( shiftCount <= 0 ) {
3669 if ( shiftCount ) {
3670 float_raise( float_flag_invalid STATUS_VAR);
3671 if ( ! aSign
3672 || ( ( aExp == 0x7FFF )
3673 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3674 ) {
3675 return LIT64( 0x7FFFFFFFFFFFFFFF );
3676 }
3677 return (sbits64) LIT64( 0x8000000000000000 );
3678 }
3679 aSigExtra = 0;
3680 }
3681 else {
3682 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3683 }
3684 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3685
3686 }
3687
3688 /*----------------------------------------------------------------------------
3689 | Returns the result of converting the extended double-precision floating-
3690 | point value `a' to the 64-bit two's complement integer format. The
3691 | conversion is performed according to the IEC/IEEE Standard for Binary
3692 | Floating-Point Arithmetic, except that the conversion is always rounded
3693 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3694 | Otherwise, if the conversion overflows, the largest integer with the same
3695 | sign as `a' is returned.
3696 *----------------------------------------------------------------------------*/
3697
3698 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3699 {
3700 flag aSign;
3701 int32 aExp, shiftCount;
3702 bits64 aSig;
3703 int64 z;
3704
3705 aSig = extractFloatx80Frac( a );
3706 aExp = extractFloatx80Exp( a );
3707 aSign = extractFloatx80Sign( a );
3708 shiftCount = aExp - 0x403E;
3709 if ( 0 <= shiftCount ) {
3710 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3711 if ( ( a.high != 0xC03E ) || aSig ) {
3712 float_raise( float_flag_invalid STATUS_VAR);
3713 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3714 return LIT64( 0x7FFFFFFFFFFFFFFF );
3715 }
3716 }
3717 return (sbits64) LIT64( 0x8000000000000000 );
3718 }
3719 else if ( aExp < 0x3FFF ) {
3720 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3721 return 0;
3722 }
3723 z = aSig>>( - shiftCount );
3724 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3725 STATUS(float_exception_flags) |= float_flag_inexact;
3726 }
3727 if ( aSign ) z = - z;
3728 return z;
3729
3730 }
3731
3732 /*----------------------------------------------------------------------------
3733 | Returns the result of converting the extended double-precision floating-
3734 | point value `a' to the single-precision floating-point format. The
3735 | conversion is performed according to the IEC/IEEE Standard for Binary
3736 | Floating-Point Arithmetic.
3737 *----------------------------------------------------------------------------*/
3738
3739 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3740 {
3741 flag aSign;
3742 int32 aExp;
3743 bits64 aSig;
3744
3745 aSig = extractFloatx80Frac( a );
3746 aExp = extractFloatx80Exp( a );
3747 aSign = extractFloatx80Sign( a );
3748 if ( aExp == 0x7FFF ) {
3749 if ( (bits64) ( aSig<<1 ) ) {
3750 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3751 }
3752 return packFloat32( aSign, 0xFF, 0 );
3753 }
3754 shift64RightJamming( aSig, 33, &aSig );
3755 if ( aExp || aSig ) aExp -= 0x3F81;
3756 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3757
3758 }
3759
3760 /*----------------------------------------------------------------------------
3761 | Returns the result of converting the extended double-precision floating-
3762 | point value `a' to the double-precision floating-point format. The
3763 | conversion is performed according to the IEC/IEEE Standard for Binary
3764 | Floating-Point Arithmetic.
3765 *----------------------------------------------------------------------------*/
3766
3767 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3768 {
3769 flag aSign;
3770 int32 aExp;
3771 bits64 aSig, zSig;
3772
3773 aSig = extractFloatx80Frac( a );
3774 aExp = extractFloatx80Exp( a );
3775 aSign = extractFloatx80Sign( a );
3776 if ( aExp == 0x7FFF ) {
3777 if ( (bits64) ( aSig<<1 ) ) {
3778 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3779 }
3780 return packFloat64( aSign, 0x7FF, 0 );
3781 }
3782 shift64RightJamming( aSig, 1, &zSig );
3783 if ( aExp || aSig ) aExp -= 0x3C01;
3784 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3785
3786 }
3787
3788 #ifdef FLOAT128
3789
3790 /*----------------------------------------------------------------------------
3791 | Returns the result of converting the extended double-precision floating-
3792 | point value `a' to the quadruple-precision floating-point format. The
3793 | conversion is performed according to the IEC/IEEE Standard for Binary
3794 | Floating-Point Arithmetic.
3795 *----------------------------------------------------------------------------*/
3796
3797 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3798 {
3799 flag aSign;
3800 int16 aExp;
3801 bits64 aSig, zSig0, zSig1;
3802
3803 aSig = extractFloatx80Frac( a );
3804 aExp = extractFloatx80Exp( a );
3805 aSign = extractFloatx80Sign( a );
3806 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3807 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3808 }
3809 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3810 return packFloat128( aSign, aExp, zSig0, zSig1 );
3811
3812 }
3813
3814 #endif
3815
3816 /*----------------------------------------------------------------------------
3817 | Rounds the extended double-precision floating-point value `a' to an integer,
3818 | and returns the result as an extended quadruple-precision floating-point
3819 | value. The operation is performed according to the IEC/IEEE Standard for
3820 | Binary Floating-Point Arithmetic.
3821 *----------------------------------------------------------------------------*/
3822
3823 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3824 {
3825 flag aSign;
3826 int32 aExp;
3827 bits64 lastBitMask, roundBitsMask;
3828 int8 roundingMode;
3829 floatx80 z;
3830
3831 aExp = extractFloatx80Exp( a );
3832 if ( 0x403E <= aExp ) {
3833 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3834 return propagateFloatx80NaN( a, a STATUS_VAR );
3835 }
3836 return a;
3837 }
3838 if ( aExp < 0x3FFF ) {
3839 if ( ( aExp == 0 )
3840 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3841 return a;
3842 }
3843 STATUS(float_exception_flags) |= float_flag_inexact;
3844 aSign = extractFloatx80Sign( a );
3845 switch ( STATUS(float_rounding_mode) ) {
3846 case float_round_nearest_even:
3847 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3848 ) {
3849 return
3850 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3851 }
3852 break;
3853 case float_round_down:
3854 return
3855 aSign ?
3856 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3857 : packFloatx80( 0, 0, 0 );
3858 case float_round_up:
3859 return
3860 aSign ? packFloatx80( 1, 0, 0 )
3861 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3862 }
3863 return packFloatx80( aSign, 0, 0 );
3864 }
3865 lastBitMask = 1;
3866 lastBitMask <<= 0x403E - aExp;
3867 roundBitsMask = lastBitMask - 1;
3868 z = a;
3869 roundingMode = STATUS(float_rounding_mode);
3870 if ( roundingMode == float_round_nearest_even ) {
3871 z.low += lastBitMask>>1;
3872 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3873 }
3874 else if ( roundingMode != float_round_to_zero ) {
3875 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3876 z.low += roundBitsMask;
3877 }
3878 }
3879 z.low &= ~ roundBitsMask;
3880 if ( z.low == 0 ) {
3881 ++z.high;
3882 z.low = LIT64( 0x8000000000000000 );
3883 }
3884 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3885 return z;
3886
3887 }
3888
3889 /*----------------------------------------------------------------------------
3890 | Returns the result of adding the absolute values of the extended double-
3891 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3892 | negated before being returned. `zSign' is ignored if the result is a NaN.
3893 | The addition is performed according to the IEC/IEEE Standard for Binary
3894 | Floating-Point Arithmetic.
3895 *----------------------------------------------------------------------------*/
3896
3897 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3898 {
3899 int32 aExp, bExp, zExp;
3900 bits64 aSig, bSig, zSig0, zSig1;
3901 int32 expDiff;
3902
3903 aSig = extractFloatx80Frac( a );
3904 aExp = extractFloatx80Exp( a );
3905 bSig = extractFloatx80Frac( b );
3906 bExp = extractFloatx80Exp( b );
3907 expDiff = aExp - bExp;
3908 if ( 0 < expDiff ) {
3909 if ( aExp == 0x7FFF ) {
3910 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3911 return a;
3912 }
3913 if ( bExp == 0 ) --expDiff;
3914 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3915 zExp = aExp;
3916 }
3917 else if ( expDiff < 0 ) {
3918 if ( bExp == 0x7FFF ) {
3919 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3920 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3921 }
3922 if ( aExp == 0 ) ++expDiff;
3923 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3924 zExp = bExp;
3925 }
3926 else {
3927 if ( aExp == 0x7FFF ) {
3928 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3929 return propagateFloatx80NaN( a, b STATUS_VAR );
3930 }
3931 return a;
3932 }
3933 zSig1 = 0;
3934 zSig0 = aSig + bSig;
3935 if ( aExp == 0 ) {
3936 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3937 goto roundAndPack;
3938 }
3939 zExp = aExp;
3940 goto shiftRight1;
3941 }
3942 zSig0 = aSig + bSig;
3943 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3944 shiftRight1:
3945 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3946 zSig0 |= LIT64( 0x8000000000000000 );
3947 ++zExp;
3948 roundAndPack:
3949 return
3950 roundAndPackFloatx80(
3951 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3952
3953 }
3954
3955 /*----------------------------------------------------------------------------
3956 | Returns the result of subtracting the absolute values of the extended
3957 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3958 | difference is negated before being returned. `zSign' is ignored if the
3959 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3960 | Standard for Binary Floating-Point Arithmetic.
3961 *----------------------------------------------------------------------------*/
3962
3963 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3964 {
3965 int32 aExp, bExp, zExp;
3966 bits64 aSig, bSig, zSig0, zSig1;
3967 int32 expDiff;
3968 floatx80 z;
3969
3970 aSig = extractFloatx80Frac( a );
3971 aExp = extractFloatx80Exp( a );
3972 bSig = extractFloatx80Frac( b );
3973 bExp = extractFloatx80Exp( b );
3974 expDiff = aExp - bExp;
3975 if ( 0 < expDiff ) goto aExpBigger;
3976 if ( expDiff < 0 ) goto bExpBigger;
3977 if ( aExp == 0x7FFF ) {
3978 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3979 return propagateFloatx80NaN( a, b STATUS_VAR );
3980 }
3981 float_raise( float_flag_invalid STATUS_VAR);
3982 z.low = floatx80_default_nan_low;
3983 z.high = floatx80_default_nan_high;
3984 return z;
3985 }
3986 if ( aExp == 0 ) {
3987 aExp = 1;
3988 bExp = 1;
3989 }
3990 zSig1 = 0;
3991 if ( bSig < aSig ) goto aBigger;
3992 if ( aSig < bSig ) goto bBigger;
3993 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3994 bExpBigger:
3995 if ( bExp == 0x7FFF ) {
3996 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3997 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3998 }
3999 if ( aExp == 0 ) ++expDiff;
4000 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4001 bBigger:
4002 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4003 zExp = bExp;
4004 zSign ^= 1;
4005 goto normalizeRoundAndPack;
4006 aExpBigger:
4007 if ( aExp == 0x7FFF ) {
4008 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4009 return a;
4010 }
4011 if ( bExp == 0 ) --expDiff;
4012 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4013 aBigger:
4014 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4015 zExp = aExp;
4016 normalizeRoundAndPack:
4017 return
4018 normalizeRoundAndPackFloatx80(
4019 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4020
4021 }
4022
4023 /*----------------------------------------------------------------------------
4024 | Returns the result of adding the extended double-precision floating-point
4025 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4026 | Standard for Binary Floating-Point Arithmetic.
4027 *----------------------------------------------------------------------------*/
4028
4029 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4030 {
4031 flag aSign, bSign;
4032
4033 aSign = extractFloatx80Sign( a );
4034 bSign = extractFloatx80Sign( b );
4035 if ( aSign == bSign ) {
4036 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4037 }
4038 else {
4039 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4040 }
4041
4042 }
4043
4044 /*----------------------------------------------------------------------------
4045 | Returns the result of subtracting the extended double-precision floating-
4046 | point values `a' and `b'. The operation is performed according to the
4047 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4048 *----------------------------------------------------------------------------*/
4049
4050 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4051 {
4052 flag aSign, bSign;
4053
4054 aSign = extractFloatx80Sign( a );
4055 bSign = extractFloatx80Sign( b );
4056 if ( aSign == bSign ) {
4057 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4058 }
4059 else {
4060 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4061 }
4062
4063 }
4064
4065 /*----------------------------------------------------------------------------
4066 | Returns the result of multiplying the extended double-precision floating-
4067 | point values `a' and `b'. The operation is performed according to the
4068 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4069 *----------------------------------------------------------------------------*/
4070
4071 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4072 {
4073 flag aSign, bSign, zSign;
4074 int32 aExp, bExp, zExp;
4075 bits64 aSig, bSig, zSig0, zSig1;
4076 floatx80 z;
4077
4078 aSig = extractFloatx80Frac( a );
4079 aExp = extractFloatx80Exp( a );
4080 aSign = extractFloatx80Sign( a );
4081 bSig = extractFloatx80Frac( b );
4082 bExp = extractFloatx80Exp( b );
4083 bSign = extractFloatx80Sign( b );
4084 zSign = aSign ^ bSign;
4085 if ( aExp == 0x7FFF ) {
4086 if ( (bits64) ( aSig<<1 )
4087 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4088 return propagateFloatx80NaN( a, b STATUS_VAR );
4089 }
4090 if ( ( bExp | bSig ) == 0 ) goto invalid;
4091 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4092 }
4093 if ( bExp == 0x7FFF ) {
4094 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4095 if ( ( aExp | aSig ) == 0 ) {
4096 invalid:
4097 float_raise( float_flag_invalid STATUS_VAR);
4098 z.low = floatx80_default_nan_low;
4099 z.high = floatx80_default_nan_high;
4100 return z;
4101 }
4102 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4103 }
4104 if ( aExp == 0 ) {
4105 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4106 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4107 }
4108 if ( bExp == 0 ) {
4109 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4110 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4111 }
4112 zExp = aExp + bExp - 0x3FFE;
4113 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4114 if ( 0 < (sbits64) zSig0 ) {
4115 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4116 --zExp;
4117 }
4118 return
4119 roundAndPackFloatx80(
4120 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4121
4122 }
4123
4124 /*----------------------------------------------------------------------------
4125 | Returns the result of dividing the extended double-precision floating-point
4126 | value `a' by the corresponding value `b'. The operation is performed
4127 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4128 *----------------------------------------------------------------------------*/
4129
4130 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4131 {
4132 flag aSign, bSign, zSign;
4133 int32 aExp, bExp, zExp;
4134 bits64 aSig, bSig, zSig0, zSig1;
4135 bits64 rem0, rem1, rem2, term0, term1, term2;
4136 floatx80 z;
4137
4138 aSig = extractFloatx80Frac( a );
4139 aExp = extractFloatx80Exp( a );
4140 aSign = extractFloatx80Sign( a );
4141 bSig = extractFloatx80Frac( b );
4142 bExp = extractFloatx80Exp( b );
4143 bSign = extractFloatx80Sign( b );
4144 zSign = aSign ^ bSign;
4145 if ( aExp == 0x7FFF ) {
4146 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4147 if ( bExp == 0x7FFF ) {
4148 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4149 goto invalid;
4150 }
4151 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4152 }
4153 if ( bExp == 0x7FFF ) {
4154 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4155 return packFloatx80( zSign, 0, 0 );
4156 }
4157 if ( bExp == 0 ) {
4158 if ( bSig == 0 ) {
4159 if ( ( aExp | aSig ) == 0 ) {
4160 invalid:
4161 float_raise( float_flag_invalid STATUS_VAR);
4162 z.low = floatx80_default_nan_low;
4163 z.high = floatx80_default_nan_high;
4164 return z;
4165 }
4166 float_raise( float_flag_divbyzero STATUS_VAR);
4167 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4168 }
4169 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4170 }
4171 if ( aExp == 0 ) {
4172 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4173 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4174 }
4175 zExp = aExp - bExp + 0x3FFE;
4176 rem1 = 0;
4177 if ( bSig <= aSig ) {
4178 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4179 ++zExp;
4180 }
4181 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4182 mul64To128( bSig, zSig0, &term0, &term1 );
4183 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4184 while ( (sbits64) rem0 < 0 ) {
4185 --zSig0;
4186 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4187 }
4188 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4189 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4190 mul64To128( bSig, zSig1, &term1, &term2 );
4191 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4192 while ( (sbits64) rem1 < 0 ) {
4193 --zSig1;
4194 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4195 }
4196 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4197 }
4198 return
4199 roundAndPackFloatx80(
4200 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4201
4202 }
4203
4204 /*----------------------------------------------------------------------------
4205 | Returns the remainder of the extended double-precision floating-point value
4206 | `a' with respect to the corresponding value `b'. The operation is performed
4207 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4208 *----------------------------------------------------------------------------*/
4209
4210 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4211 {
4212 flag aSign, zSign;
4213 int32 aExp, bExp, expDiff;
4214 bits64 aSig0, aSig1, bSig;
4215 bits64 q, term0, term1, alternateASig0, alternateASig1;
4216 floatx80 z;
4217
4218 aSig0 = extractFloatx80Frac( a );
4219 aExp = extractFloatx80Exp( a );
4220 aSign = extractFloatx80Sign( a );
4221 bSig = extractFloatx80Frac( b );
4222 bExp = extractFloatx80Exp( b );
4223 if ( aExp == 0x7FFF ) {
4224 if ( (bits64) ( aSig0<<1 )
4225 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4226 return propagateFloatx80NaN( a, b STATUS_VAR );
4227 }
4228 goto invalid;
4229 }
4230 if ( bExp == 0x7FFF ) {
4231 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4232 return a;
4233 }
4234 if ( bExp == 0 ) {
4235 if ( bSig == 0 ) {
4236 invalid:
4237 float_raise( float_flag_invalid STATUS_VAR);
4238 z.low = floatx80_default_nan_low;
4239 z.high = floatx80_default_nan_high;
4240 return z;
4241 }
4242 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4243 }
4244 if ( aExp == 0 ) {
4245 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4246 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4247 }
4248 bSig |= LIT64( 0x8000000000000000 );
4249 zSign = aSign;
4250 expDiff = aExp - bExp;
4251 aSig1 = 0;
4252 if ( expDiff < 0 ) {
4253 if ( expDiff < -1 ) return a;
4254 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4255 expDiff = 0;
4256 }
4257 q = ( bSig <= aSig0 );
4258 if ( q ) aSig0 -= bSig;
4259 expDiff -= 64;
4260 while ( 0 < expDiff ) {
4261 q = estimateDiv128To64( aSig0, aSig1, bSig );
4262 q = ( 2 < q ) ? q - 2 : 0;
4263 mul64To128( bSig, q, &term0, &term1 );
4264 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4265 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4266 expDiff -= 62;
4267 }
4268 expDiff += 64;
4269 if ( 0 < expDiff ) {
4270 q = estimateDiv128To64( aSig0, aSig1, bSig );
4271 q = ( 2 < q ) ? q - 2 : 0;
4272 q >>= 64 - expDiff;
4273 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4274 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4275 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4276 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4277 ++q;
4278 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4279 }
4280 }
4281 else {
4282 term1 = 0;
4283 term0 = bSig;
4284 }
4285 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4286 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4287 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4288 && ( q & 1 ) )
4289 ) {
4290 aSig0 = alternateASig0;
4291 aSig1 = alternateASig1;
4292 zSign = ! zSign;
4293 }
4294 return
4295 normalizeRoundAndPackFloatx80(
4296 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4297
4298 }
4299
4300 /*----------------------------------------------------------------------------
4301 | Returns the square root of the extended double-precision floating-point
4302 | value `a'. The operation is performed according to the IEC/IEEE Standard
4303 | for Binary Floating-Point Arithmetic.
4304 *----------------------------------------------------------------------------*/
4305
4306 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4307 {
4308 flag aSign;
4309 int32 aExp, zExp;
4310 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4311 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4312 floatx80 z;
4313
4314 aSig0 = extractFloatx80Frac( a );
4315 aExp = extractFloatx80Exp( a );
4316 aSign = extractFloatx80Sign( a );
4317 if ( aExp == 0x7FFF ) {
4318 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4319 if ( ! aSign ) return a;
4320 goto invalid;
4321 }
4322 if ( aSign ) {
4323 if ( ( aExp | aSig0 ) == 0 ) return a;
4324 invalid:
4325 float_raise( float_flag_invalid STATUS_VAR);
4326 z.low = floatx80_default_nan_low;
4327 z.high = floatx80_default_nan_high;
4328 return z;
4329 }
4330 if ( aExp == 0 ) {
4331 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4332 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4333 }
4334 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4335 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4336 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4337 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4338 doubleZSig0 = zSig0<<1;
4339 mul64To128( zSig0, zSig0, &term0, &term1 );
4340 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4341 while ( (sbits64) rem0 < 0 ) {
4342 --zSig0;
4343 doubleZSig0 -= 2;
4344 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4345 }
4346 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4347 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4348 if ( zSig1 == 0 ) zSig1 = 1;
4349 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4350 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4351 mul64To128( zSig1, zSig1, &term2, &term3 );
4352 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4353 while ( (sbits64) rem1 < 0 ) {
4354 --zSig1;
4355 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4356 term3 |= 1;
4357 term2 |= doubleZSig0;
4358 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4359 }
4360 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4361 }
4362 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4363 zSig0 |= doubleZSig0;
4364 return
4365 roundAndPackFloatx80(
4366 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4367
4368 }
4369
4370 /*----------------------------------------------------------------------------
4371 | Returns 1 if the extended double-precision floating-point value `a' is
4372 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4374 | Arithmetic.
4375 *----------------------------------------------------------------------------*/
4376
4377 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4378 {
4379
4380 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4381 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4382 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4383 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4384 ) {
4385 if ( floatx80_is_signaling_nan( a )
4386 || floatx80_is_signaling_nan( b ) ) {
4387 float_raise( float_flag_invalid STATUS_VAR);
4388 }
4389 return 0;
4390 }
4391 return
4392 ( a.low == b.low )
4393 && ( ( a.high == b.high )
4394 || ( ( a.low == 0 )
4395 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4396 );
4397
4398 }
4399
4400 /*----------------------------------------------------------------------------
4401 | Returns 1 if the extended double-precision floating-point value `a' is
4402 | less than or equal to the corresponding value `b', and 0 otherwise. The
4403 | comparison is performed according to the IEC/IEEE Standard for Binary
4404 | Floating-Point Arithmetic.
4405 *----------------------------------------------------------------------------*/
4406
4407 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4408 {
4409 flag aSign, bSign;
4410
4411 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4412 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4413 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4414 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4415 ) {
4416 float_raise( float_flag_invalid STATUS_VAR);
4417 return 0;
4418 }
4419 aSign = extractFloatx80Sign( a );
4420 bSign = extractFloatx80Sign( b );
4421 if ( aSign != bSign ) {
4422 return
4423 aSign
4424 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4425 == 0 );
4426 }
4427 return
4428 aSign ? le128( b.high, b.low, a.high, a.low )
4429 : le128( a.high, a.low, b.high, b.low );
4430
4431 }
4432
4433 /*----------------------------------------------------------------------------
4434 | Returns 1 if the extended double-precision floating-point value `a' is
4435 | less than the corresponding value `b', and 0 otherwise. The comparison
4436 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4437 | Arithmetic.
4438 *----------------------------------------------------------------------------*/
4439
4440 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4441 {
4442 flag aSign, bSign;
4443
4444 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4445 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4446 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4447 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4448 ) {
4449 float_raise( float_flag_invalid STATUS_VAR);
4450 return 0;
4451 }
4452 aSign = extractFloatx80Sign( a );
4453 bSign = extractFloatx80Sign( b );
4454 if ( aSign != bSign ) {
4455 return
4456 aSign
4457 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4458 != 0 );
4459 }
4460 return
4461 aSign ? lt128( b.high, b.low, a.high, a.low )
4462 : lt128( a.high, a.low, b.high, b.low );
4463
4464 }
4465
4466 /*----------------------------------------------------------------------------
4467 | Returns 1 if the extended double-precision floating-point value `a' is equal
4468 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4469 | raised if either operand is a NaN. Otherwise, the comparison is performed
4470 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4471 *----------------------------------------------------------------------------*/
4472
4473 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4474 {
4475
4476 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4477 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4478 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4479 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4480 ) {
4481 float_raise( float_flag_invalid STATUS_VAR);
4482 return 0;
4483 }
4484 return
4485 ( a.low == b.low )
4486 && ( ( a.high == b.high )
4487 || ( ( a.low == 0 )
4488 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4489 );
4490
4491 }
4492
4493 /*----------------------------------------------------------------------------
4494 | Returns 1 if the extended double-precision floating-point value `a' is less
4495 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4496 | do not cause an exception. Otherwise, the comparison is performed according
4497 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4498 *----------------------------------------------------------------------------*/
4499
4500 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4501 {
4502 flag aSign, bSign;
4503
4504 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4505 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4506 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4507 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4508 ) {
4509 if ( floatx80_is_signaling_nan( a )
4510 || floatx80_is_signaling_nan( b ) ) {
4511 float_raise( float_flag_invalid STATUS_VAR);
4512 }
4513 return 0;
4514 }
4515 aSign = extractFloatx80Sign( a );
4516 bSign = extractFloatx80Sign( b );
4517 if ( aSign != bSign ) {
4518 return
4519 aSign
4520 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4521 == 0 );
4522 }
4523 return
4524 aSign ? le128( b.high, b.low, a.high, a.low )
4525 : le128( a.high, a.low, b.high, b.low );
4526
4527 }
4528
4529 /*----------------------------------------------------------------------------
4530 | Returns 1 if the extended double-precision floating-point value `a' is less
4531 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4532 | an exception. Otherwise, the comparison is performed according to the
4533 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4534 *----------------------------------------------------------------------------*/
4535
4536 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4537 {
4538 flag aSign, bSign;
4539
4540 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4541 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4542 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4543 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4544 ) {
4545 if ( floatx80_is_signaling_nan( a )
4546 || floatx80_is_signaling_nan( b ) ) {
4547 float_raise( float_flag_invalid STATUS_VAR);
4548 }
4549 return 0;
4550 }
4551 aSign = extractFloatx80Sign( a );
4552 bSign = extractFloatx80Sign( b );
4553 if ( aSign != bSign ) {
4554 return
4555 aSign
4556 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4557 != 0 );
4558 }
4559 return
4560 aSign ? lt128( b.high, b.low, a.high, a.low )
4561 : lt128( a.high, a.low, b.high, b.low );
4562
4563 }
4564
4565 #endif
4566
4567 #ifdef FLOAT128
4568
4569 /*----------------------------------------------------------------------------
4570 | Returns the result of converting the quadruple-precision floating-point
4571 | value `a' to the 32-bit two's complement integer format. The conversion
4572 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4573 | Arithmetic---which means in particular that the conversion is rounded
4574 | according to the current rounding mode. If `a' is a NaN, the largest
4575 | positive integer is returned. Otherwise, if the conversion overflows, the
4576 | largest integer with the same sign as `a' is returned.
4577 *----------------------------------------------------------------------------*/
4578
4579 int32 float128_to_int32( float128 a STATUS_PARAM )
4580 {
4581 flag aSign;
4582 int32 aExp, shiftCount;
4583 bits64 aSig0, aSig1;
4584
4585 aSig1 = extractFloat128Frac1( a );
4586 aSig0 = extractFloat128Frac0( a );
4587 aExp = extractFloat128Exp( a );
4588 aSign = extractFloat128Sign( a );
4589 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4590 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4591 aSig0 |= ( aSig1 != 0 );
4592 shiftCount = 0x4028 - aExp;
4593 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4594 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4595
4596 }
4597
4598 /*----------------------------------------------------------------------------
4599 | Returns the result of converting the quadruple-precision floating-point
4600 | value `a' to the 32-bit two's complement integer format. The conversion
4601 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4602 | Arithmetic, except that the conversion is always rounded toward zero. If
4603 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4604 | conversion overflows, the largest integer with the same sign as `a' is
4605 | returned.
4606 *----------------------------------------------------------------------------*/
4607
4608 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4609 {
4610 flag aSign;
4611 int32 aExp, shiftCount;
4612 bits64 aSig0, aSig1, savedASig;
4613 int32 z;
4614
4615 aSig1 = extractFloat128Frac1( a );
4616 aSig0 = extractFloat128Frac0( a );
4617 aExp = extractFloat128Exp( a );
4618 aSign = extractFloat128Sign( a );
4619 aSig0 |= ( aSig1 != 0 );
4620 if ( 0x401E < aExp ) {
4621 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4622 goto invalid;
4623 }
4624 else if ( aExp < 0x3FFF ) {
4625 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4626 return 0;
4627 }
4628 aSig0 |= LIT64( 0x0001000000000000 );
4629 shiftCount = 0x402F - aExp;
4630 savedASig = aSig0;
4631 aSig0 >>= shiftCount;
4632 z = aSig0;
4633 if ( aSign ) z = - z;
4634 if ( ( z < 0 ) ^ aSign ) {
4635 invalid:
4636 float_raise( float_flag_invalid STATUS_VAR);
4637 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4638 }
4639 if ( ( aSig0<<shiftCount ) != savedASig ) {
4640 STATUS(float_exception_flags) |= float_flag_inexact;
4641 }
4642 return z;
4643
4644 }
4645
4646 /*----------------------------------------------------------------------------
4647 | Returns the result of converting the quadruple-precision floating-point
4648 | value `a' to the 64-bit two's complement integer format. The conversion
4649 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4650 | Arithmetic---which means in particular that the conversion is rounded
4651 | according to the current rounding mode. If `a' is a NaN, the largest
4652 | positive integer is returned. Otherwise, if the conversion overflows, the
4653 | largest integer with the same sign as `a' is returned.
4654 *----------------------------------------------------------------------------*/
4655
4656 int64 float128_to_int64( float128 a STATUS_PARAM )
4657 {
4658 flag aSign;
4659 int32 aExp, shiftCount;
4660 bits64 aSig0, aSig1;
4661
4662 aSig1 = extractFloat128Frac1( a );
4663 aSig0 = extractFloat128Frac0( a );
4664 aExp = extractFloat128Exp( a );
4665 aSign = extractFloat128Sign( a );
4666 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4667 shiftCount = 0x402F - aExp;
4668 if ( shiftCount <= 0 ) {
4669 if ( 0x403E < aExp ) {
4670 float_raise( float_flag_invalid STATUS_VAR);
4671 if ( ! aSign
4672 || ( ( aExp == 0x7FFF )
4673 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4674 )
4675 ) {
4676 return LIT64( 0x7FFFFFFFFFFFFFFF );
4677 }
4678 return (sbits64) LIT64( 0x8000000000000000 );
4679 }
4680 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4681 }
4682 else {
4683 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4684 }
4685 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4686
4687 }
4688
4689 /*----------------------------------------------------------------------------
4690 | Returns the result of converting the quadruple-precision floating-point
4691 | value `a' to the 64-bit two's complement integer format. The conversion
4692 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4693 | Arithmetic, except that the conversion is always rounded toward zero.
4694 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4695 | the conversion overflows, the largest integer with the same sign as `a' is
4696 | returned.
4697 *----------------------------------------------------------------------------*/
4698
4699 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4700 {
4701 flag aSign;
4702 int32 aExp, shiftCount;
4703 bits64 aSig0, aSig1;
4704 int64 z;
4705
4706 aSig1 = extractFloat128Frac1( a );
4707 aSig0 = extractFloat128Frac0( a );
4708 aExp = extractFloat128Exp( a );
4709 aSign = extractFloat128Sign( a );
4710 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4711 shiftCount = aExp - 0x402F;
4712 if ( 0 < shiftCount ) {
4713 if ( 0x403E <= aExp ) {
4714 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4715 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4716 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4717 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4718 }
4719 else {
4720 float_raise( float_flag_invalid STATUS_VAR);
4721 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4722 return LIT64( 0x7FFFFFFFFFFFFFFF );
4723 }
4724 }
4725 return (sbits64) LIT64( 0x8000000000000000 );
4726 }
4727 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4728 if ( (bits64) ( aSig1<<shiftCount ) ) {
4729 STATUS(float_exception_flags) |= float_flag_inexact;
4730 }
4731 }
4732 else {
4733 if ( aExp < 0x3FFF ) {
4734 if ( aExp | aSig0 | aSig1 ) {
4735 STATUS(float_exception_flags) |= float_flag_inexact;
4736 }
4737 return 0;
4738 }
4739 z = aSig0>>( - shiftCount );
4740 if ( aSig1
4741 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4742 STATUS(float_exception_flags) |= float_flag_inexact;
4743 }
4744 }
4745 if ( aSign ) z = - z;
4746 return z;
4747
4748 }
4749
4750 /*----------------------------------------------------------------------------
4751 | Returns the result of converting the quadruple-precision floating-point
4752 | value `a' to the single-precision floating-point format. The conversion
4753 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4754 | Arithmetic.
4755 *----------------------------------------------------------------------------*/
4756
4757 float32 float128_to_float32( float128 a STATUS_PARAM )
4758 {
4759 flag aSign;
4760 int32 aExp;
4761 bits64 aSig0, aSig1;
4762 bits32 zSig;
4763
4764 aSig1 = extractFloat128Frac1( a );
4765 aSig0 = extractFloat128Frac0( a );
4766 aExp = extractFloat128Exp( a );
4767 aSign = extractFloat128Sign( a );
4768 if ( aExp == 0x7FFF ) {
4769 if ( aSig0 | aSig1 ) {
4770 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4771 }
4772 return packFloat32( aSign, 0xFF, 0 );
4773 }
4774 aSig0 |= ( aSig1 != 0 );
4775 shift64RightJamming( aSig0, 18, &aSig0 );
4776 zSig = aSig0;
4777 if ( aExp || zSig ) {
4778 zSig |= 0x40000000;
4779 aExp -= 0x3F81;
4780 }
4781 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4782
4783 }
4784
4785 /*----------------------------------------------------------------------------
4786 | Returns the result of converting the quadruple-precision floating-point
4787 | value `a' to the double-precision floating-point format. The conversion
4788 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4789 | Arithmetic.
4790 *----------------------------------------------------------------------------*/
4791
4792 float64 float128_to_float64( float128 a STATUS_PARAM )
4793 {
4794 flag aSign;
4795 int32 aExp;
4796 bits64 aSig0, aSig1;
4797
4798 aSig1 = extractFloat128Frac1( a );
4799 aSig0 = extractFloat128Frac0( a );
4800 aExp = extractFloat128Exp( a );
4801 aSign = extractFloat128Sign( a );
4802 if ( aExp == 0x7FFF ) {
4803 if ( aSig0 | aSig1 ) {
4804 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4805 }
4806 return packFloat64( aSign, 0x7FF, 0 );
4807 }
4808 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4809 aSig0 |= ( aSig1 != 0 );
4810 if ( aExp || aSig0 ) {
4811 aSig0 |= LIT64( 0x4000000000000000 );
4812 aExp -= 0x3C01;
4813 }
4814 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4815
4816 }
4817
4818 #ifdef FLOATX80
4819
4820 /*----------------------------------------------------------------------------
4821 | Returns the result of converting the quadruple-precision floating-point
4822 | value `a' to the extended double-precision floating-point format. The
4823 | conversion is performed according to the IEC/IEEE Standard for Binary
4824 | Floating-Point Arithmetic.
4825 *----------------------------------------------------------------------------*/
4826
4827 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4828 {
4829 flag aSign;
4830 int32 aExp;
4831 bits64 aSig0, aSig1;
4832
4833 aSig1 = extractFloat128Frac1( a );
4834 aSig0 = extractFloat128Frac0( a );
4835 aExp = extractFloat128Exp( a );
4836 aSign = extractFloat128Sign( a );
4837 if ( aExp == 0x7FFF ) {
4838 if ( aSig0 | aSig1 ) {
4839 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4840 }
4841 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4842 }
4843 if ( aExp == 0 ) {
4844 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4845 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4846 }
4847 else {
4848 aSig0 |= LIT64( 0x0001000000000000 );
4849 }
4850 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4851 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4852
4853 }
4854
4855 #endif
4856
4857 /*----------------------------------------------------------------------------
4858 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4859 | returns the result as a quadruple-precision floating-point value. The
4860 | operation is performed according to the IEC/IEEE Standard for Binary
4861 | Floating-Point Arithmetic.
4862 *----------------------------------------------------------------------------*/
4863
4864 float128 float128_round_to_int( float128 a STATUS_PARAM )
4865 {
4866 flag aSign;
4867 int32 aExp;
4868 bits64 lastBitMask, roundBitsMask;
4869 int8 roundingMode;
4870 float128 z;
4871
4872 aExp = extractFloat128Exp( a );
4873 if ( 0x402F <= aExp ) {
4874 if ( 0x406F <= aExp ) {
4875 if ( ( aExp == 0x7FFF )
4876 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4877 ) {
4878 return propagateFloat128NaN( a, a STATUS_VAR );
4879 }
4880 return a;
4881 }
4882 lastBitMask = 1;
4883 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4884 roundBitsMask = lastBitMask - 1;
4885 z = a;
4886 roundingMode = STATUS(float_rounding_mode);
4887 if ( roundingMode == float_round_nearest_even ) {
4888 if ( lastBitMask ) {
4889 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4890 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4891 }
4892 else {
4893 if ( (sbits64) z.low < 0 ) {
4894 ++z.high;
4895 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4896 }
4897 }
4898 }
4899 else if ( roundingMode != float_round_to_zero ) {
4900 if ( extractFloat128Sign( z )
4901 ^ ( roundingMode == float_round_up ) ) {
4902 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4903 }
4904 }
4905 z.low &= ~ roundBitsMask;
4906 }
4907 else {
4908 if ( aExp < 0x3FFF ) {
4909 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4910 STATUS(float_exception_flags) |= float_flag_inexact;
4911 aSign = extractFloat128Sign( a );
4912 switch ( STATUS(float_rounding_mode) ) {
4913 case float_round_nearest_even:
4914 if ( ( aExp == 0x3FFE )
4915 && ( extractFloat128Frac0( a )
4916 | extractFloat128Frac1( a ) )
4917 ) {
4918 return packFloat128( aSign, 0x3FFF, 0, 0 );
4919 }
4920 break;
4921 case float_round_down:
4922 return
4923 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4924 : packFloat128( 0, 0, 0, 0 );
4925 case float_round_up:
4926 return
4927 aSign ? packFloat128( 1, 0, 0, 0 )
4928 : packFloat128( 0, 0x3FFF, 0, 0 );
4929 }
4930 return packFloat128( aSign, 0, 0, 0 );
4931 }
4932 lastBitMask = 1;
4933 lastBitMask <<= 0x402F - aExp;
4934 roundBitsMask = lastBitMask - 1;
4935 z.low = 0;
4936 z.high = a.high;
4937 roundingMode = STATUS(float_rounding_mode);
4938 if ( roundingMode == float_round_nearest_even ) {
4939 z.high += lastBitMask>>1;
4940 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4941 z.high &= ~ lastBitMask;
4942 }
4943 }
4944 else if ( roundingMode != float_round_to_zero ) {
4945 if ( extractFloat128Sign( z )
4946 ^ ( roundingMode == float_round_up ) ) {
4947 z.high |= ( a.low != 0 );
4948 z.high += roundBitsMask;
4949 }
4950 }
4951 z.high &= ~ roundBitsMask;
4952 }
4953 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4954 STATUS(float_exception_flags) |= float_flag_inexact;
4955 }
4956 return z;
4957
4958 }
4959
4960 /*----------------------------------------------------------------------------
4961 | Returns the result of adding the absolute values of the quadruple-precision
4962 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4963 | before being returned. `zSign' is ignored if the result is a NaN.
4964 | The addition is performed according to the IEC/IEEE Standard for Binary
4965 | Floating-Point Arithmetic.
4966 *----------------------------------------------------------------------------*/
4967
4968 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4969 {
4970 int32 aExp, bExp, zExp;
4971 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4972 int32 expDiff;
4973
4974 aSig1 = extractFloat128Frac1( a );
4975 aSig0 = extractFloat128Frac0( a );
4976 aExp = extractFloat128Exp( a );
4977 bSig1 = extractFloat128Frac1( b );
4978 bSig0 = extractFloat128Frac0( b );
4979 bExp = extractFloat128Exp( b );
4980 expDiff = aExp - bExp;
4981 if ( 0 < expDiff ) {
4982 if ( aExp == 0x7FFF ) {
4983 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4984 return a;
4985 }
4986 if ( bExp == 0 ) {
4987 --expDiff;
4988 }
4989 else {
4990 bSig0 |= LIT64( 0x0001000000000000 );
4991 }
4992 shift128ExtraRightJamming(
4993 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4994 zExp = aExp;
4995 }
4996 else if ( expDiff < 0 ) {
4997 if ( bExp == 0x7FFF ) {
4998 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4999 return packFloat128( zSign, 0x7FFF, 0, 0 );
5000 }
5001 if ( aExp == 0 ) {
5002 ++expDiff;
5003 }
5004 else {
5005 aSig0 |= LIT64( 0x0001000000000000 );
5006 }
5007 shift128ExtraRightJamming(
5008 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5009 zExp = bExp;
5010 }
5011 else {
5012 if ( aExp == 0x7FFF ) {
5013 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5014 return propagateFloat128NaN( a, b STATUS_VAR );
5015 }
5016 return a;
5017 }
5018 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5019 if ( aExp == 0 ) {
5020 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5021 return packFloat128( zSign, 0, zSig0, zSig1 );
5022 }
5023 zSig2 = 0;
5024 zSig0 |= LIT64( 0x0002000000000000 );
5025 zExp = aExp;
5026 goto shiftRight1;
5027 }
5028 aSig0 |= LIT64( 0x0001000000000000 );
5029 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5030 --zExp;
5031 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5032 ++zExp;
5033 shiftRight1:
5034 shift128ExtraRightJamming(
5035 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5036 roundAndPack:
5037 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5038
5039 }
5040
5041 /*----------------------------------------------------------------------------
5042 | Returns the result of subtracting the absolute values of the quadruple-
5043 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5044 | difference is negated before being returned. `zSign' is ignored if the
5045 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5046 | Standard for Binary Floating-Point Arithmetic.
5047 *----------------------------------------------------------------------------*/
5048
5049 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5050 {
5051 int32 aExp, bExp, zExp;
5052 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5053 int32 expDiff;
5054 float128 z;
5055
5056 aSig1 = extractFloat128Frac1( a );
5057 aSig0 = extractFloat128Frac0( a );
5058 aExp = extractFloat128Exp( a );
5059 bSig1 = extractFloat128Frac1( b );
5060 bSig0 = extractFloat128Frac0( b );
5061 bExp = extractFloat128Exp( b );
5062 expDiff = aExp - bExp;
5063 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5064 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5065 if ( 0 < expDiff ) goto aExpBigger;
5066 if ( expDiff < 0 ) goto bExpBigger;
5067 if ( aExp == 0x7FFF ) {
5068 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5069 return propagateFloat128NaN( a, b STATUS_VAR );
5070 }
5071 float_raise( float_flag_invalid STATUS_VAR);
5072 z.low = float128_default_nan_low;
5073 z.high = float128_default_nan_high;
5074 return z;
5075 }
5076 if ( aExp == 0 ) {
5077 aExp = 1;
5078 bExp = 1;
5079 }
5080 if ( bSig0 < aSig0 ) goto aBigger;
5081 if ( aSig0 < bSig0 ) goto bBigger;
5082 if ( bSig1 < aSig1 ) goto aBigger;
5083 if ( aSig1 < bSig1 ) goto bBigger;
5084 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5085 bExpBigger:
5086 if ( bExp == 0x7FFF ) {
5087 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5088 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5089 }
5090 if ( aExp == 0 ) {
5091 ++expDiff;
5092 }
5093 else {
5094 aSig0 |= LIT64( 0x4000000000000000 );
5095 }
5096 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5097 bSig0 |= LIT64( 0x4000000000000000 );
5098 bBigger:
5099 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5100 zExp = bExp;
5101 zSign ^= 1;
5102 goto normalizeRoundAndPack;
5103 aExpBigger:
5104 if ( aExp == 0x7FFF ) {
5105 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5106 return a;
5107 }
5108 if ( bExp == 0 ) {
5109 --expDiff;
5110 }
5111 else {
5112 bSig0 |= LIT64( 0x4000000000000000 );
5113 }
5114 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5115 aSig0 |= LIT64( 0x4000000000000000 );
5116 aBigger:
5117 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5118 zExp = aExp;
5119 normalizeRoundAndPack:
5120 --zExp;
5121 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5122
5123 }
5124
5125 /*----------------------------------------------------------------------------
5126 | Returns the result of adding the quadruple-precision floating-point values
5127 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5128 | for Binary Floating-Point Arithmetic.
5129 *----------------------------------------------------------------------------*/
5130
5131 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5132 {
5133 flag aSign, bSign;
5134
5135 aSign = extractFloat128Sign( a );
5136 bSign = extractFloat128Sign( b );
5137 if ( aSign == bSign ) {
5138 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5139 }
5140 else {
5141 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5142 }
5143
5144 }
5145
5146 /*----------------------------------------------------------------------------
5147 | Returns the result of subtracting the quadruple-precision floating-point
5148 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5149 | Standard for Binary Floating-Point Arithmetic.
5150 *----------------------------------------------------------------------------*/
5151
5152 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5153 {
5154 flag aSign, bSign;
5155
5156 aSign = extractFloat128Sign( a );
5157 bSign = extractFloat128Sign( b );
5158 if ( aSign == bSign ) {
5159 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5160 }
5161 else {
5162 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5163 }
5164
5165 }
5166
5167 /*----------------------------------------------------------------------------
5168 | Returns the result of multiplying the quadruple-precision floating-point
5169 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5170 | Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5172
5173 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5174 {
5175 flag aSign, bSign, zSign;
5176 int32 aExp, bExp, zExp;
5177 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5178 float128 z;
5179
5180 aSig1 = extractFloat128Frac1( a );
5181 aSig0 = extractFloat128Frac0( a );
5182 aExp = extractFloat128Exp( a );
5183 aSign = extractFloat128Sign( a );
5184 bSig1 = extractFloat128Frac1( b );
5185 bSig0 = extractFloat128Frac0( b );
5186 bExp = extractFloat128Exp( b );
5187 bSign = extractFloat128Sign( b );
5188 zSign = aSign ^ bSign;
5189 if ( aExp == 0x7FFF ) {
5190 if ( ( aSig0 | aSig1 )
5191 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5192 return propagateFloat128NaN( a, b STATUS_VAR );
5193 }
5194 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5195 return packFloat128( zSign, 0x7FFF, 0, 0 );
5196 }
5197 if ( bExp == 0x7FFF ) {
5198 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5199 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5200 invalid:
5201 float_raise( float_flag_invalid STATUS_VAR);
5202 z.low = float128_default_nan_low;
5203 z.high = float128_default_nan_high;
5204 return z;
5205 }
5206 return packFloat128( zSign, 0x7FFF, 0, 0 );
5207 }
5208 if ( aExp == 0 ) {
5209 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5210 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5211 }
5212 if ( bExp == 0 ) {
5213 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5214 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5215 }
5216 zExp = aExp + bExp - 0x4000;
5217 aSig0 |= LIT64( 0x0001000000000000 );
5218 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5219 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5220 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5221 zSig2 |= ( zSig3 != 0 );
5222 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5223 shift128ExtraRightJamming(
5224 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5225 ++zExp;
5226 }
5227 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5228
5229 }
5230
5231 /*----------------------------------------------------------------------------
5232 | Returns the result of dividing the quadruple-precision floating-point value
5233 | `a' by the corresponding value `b'. The operation is performed according to
5234 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5235 *----------------------------------------------------------------------------*/
5236
5237 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5238 {
5239 flag aSign, bSign, zSign;
5240 int32 aExp, bExp, zExp;
5241 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5242 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5243 float128 z;
5244
5245 aSig1 = extractFloat128Frac1( a );
5246 aSig0 = extractFloat128Frac0( a );
5247 aExp = extractFloat128Exp( a );
5248 aSign = extractFloat128Sign( a );
5249 bSig1 = extractFloat128Frac1( b );
5250 bSig0 = extractFloat128Frac0( b );
5251 bExp = extractFloat128Exp( b );
5252 bSign = extractFloat128Sign( b );
5253 zSign = aSign ^ bSign;
5254 if ( aExp == 0x7FFF ) {
5255 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5256 if ( bExp == 0x7FFF ) {
5257 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5258 goto invalid;
5259 }
5260 return packFloat128( zSign, 0x7FFF, 0, 0 );
5261 }
5262 if ( bExp == 0x7FFF ) {
5263 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5264 return packFloat128( zSign, 0, 0, 0 );
5265 }
5266 if ( bExp == 0 ) {
5267 if ( ( bSig0 | bSig1 ) == 0 ) {
5268 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5269 invalid:
5270 float_raise( float_flag_invalid STATUS_VAR);
5271 z.low = float128_default_nan_low;
5272 z.high = float128_default_nan_high;
5273 return z;
5274 }
5275 float_raise( float_flag_divbyzero STATUS_VAR);
5276 return packFloat128( zSign, 0x7FFF, 0, 0 );
5277 }
5278 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5279 }
5280 if ( aExp == 0 ) {
5281 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5282 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5283 }
5284 zExp = aExp - bExp + 0x3FFD;
5285 shortShift128Left(
5286 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5287 shortShift128Left(
5288 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5289 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5290 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5291 ++zExp;
5292 }
5293 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5294 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5295 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5296 while ( (sbits64) rem0 < 0 ) {
5297 --zSig0;
5298 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5299 }
5300 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5301 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5302 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5303 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5304 while ( (sbits64) rem1 < 0 ) {
5305 --zSig1;
5306 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5307 }
5308 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5309 }
5310 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5311 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5312
5313 }
5314
5315 /*----------------------------------------------------------------------------
5316 | Returns the remainder of the quadruple-precision floating-point value `a'
5317 | with respect to the corresponding value `b'. The operation is performed
5318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5319 *----------------------------------------------------------------------------*/
5320
5321 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5322 {
5323 flag aSign, zSign;
5324 int32 aExp, bExp, expDiff;
5325 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5326 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5327 sbits64 sigMean0;
5328 float128 z;
5329
5330 aSig1 = extractFloat128Frac1( a );
5331 aSig0 = extractFloat128Frac0( a );
5332 aExp = extractFloat128Exp( a );
5333 aSign = extractFloat128Sign( a );
5334 bSig1 = extractFloat128Frac1( b );
5335 bSig0 = extractFloat128Frac0( b );
5336 bExp = extractFloat128Exp( b );
5337 if ( aExp == 0x7FFF ) {
5338 if ( ( aSig0 | aSig1 )
5339 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5340 return propagateFloat128NaN( a, b STATUS_VAR );
5341 }
5342 goto invalid;
5343 }
5344 if ( bExp == 0x7FFF ) {
5345 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5346 return a;
5347 }
5348 if ( bExp == 0 ) {
5349 if ( ( bSig0 | bSig1 ) == 0 ) {
5350 invalid:
5351 float_raise( float_flag_invalid STATUS_VAR);
5352 z.low = float128_default_nan_low;
5353 z.high = float128_default_nan_high;
5354 return z;
5355 }
5356 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5357 }
5358 if ( aExp == 0 ) {
5359 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5360 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5361 }
5362 expDiff = aExp - bExp;
5363 if ( expDiff < -1 ) return a;
5364 shortShift128Left(
5365 aSig0 | LIT64( 0x0001000000000000 ),
5366 aSig1,
5367 15 - ( expDiff < 0 ),
5368 &aSig0,
5369 &aSig1
5370 );
5371 shortShift128Left(
5372 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5373 q = le128( bSig0, bSig1, aSig0, aSig1 );
5374 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5375 expDiff -= 64;
5376 while ( 0 < expDiff ) {
5377 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5378 q = ( 4 < q ) ? q - 4 : 0;
5379 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5380 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5381 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5382 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5383 expDiff -= 61;
5384 }
5385 if ( -64 < expDiff ) {
5386 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5387 q = ( 4 < q ) ? q - 4 : 0;
5388 q >>= - expDiff;
5389 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5390 expDiff += 52;
5391 if ( expDiff < 0 ) {
5392 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5393 }
5394 else {
5395 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5396 }
5397 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5398 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5399 }
5400 else {
5401 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5402 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5403 }
5404 do {
5405 alternateASig0 = aSig0;
5406 alternateASig1 = aSig1;
5407 ++q;
5408 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5409 } while ( 0 <= (sbits64) aSig0 );
5410 add128(
5411 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5412 if ( ( sigMean0 < 0 )
5413 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5414 aSig0 = alternateASig0;
5415 aSig1 = alternateASig1;
5416 }
5417 zSign = ( (sbits64) aSig0 < 0 );
5418 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5419 return
5420 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5421
5422 }
5423
5424 /*----------------------------------------------------------------------------
5425 | Returns the square root of the quadruple-precision floating-point value `a'.
5426 | The operation is performed according to the IEC/IEEE Standard for Binary
5427 | Floating-Point Arithmetic.
5428 *----------------------------------------------------------------------------*/
5429
5430 float128 float128_sqrt( float128 a STATUS_PARAM )
5431 {
5432 flag aSign;
5433 int32 aExp, zExp;
5434 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5435 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5436 float128 z;
5437
5438 aSig1 = extractFloat128Frac1( a );
5439 aSig0 = extractFloat128Frac0( a );
5440 aExp = extractFloat128Exp( a );
5441 aSign = extractFloat128Sign( a );
5442 if ( aExp == 0x7FFF ) {
5443 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5444 if ( ! aSign ) return a;
5445 goto invalid;
5446 }
5447 if ( aSign ) {
5448 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5449 invalid:
5450 float_raise( float_flag_invalid STATUS_VAR);
5451 z.low = float128_default_nan_low;
5452 z.high = float128_default_nan_high;
5453 return z;
5454 }
5455 if ( aExp == 0 ) {
5456 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5457 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5458 }
5459 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5460 aSig0 |= LIT64( 0x0001000000000000 );
5461 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5462 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5463 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5464 doubleZSig0 = zSig0<<1;
5465 mul64To128( zSig0, zSig0, &term0, &term1 );
5466 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5467 while ( (sbits64) rem0 < 0 ) {
5468 --zSig0;
5469 doubleZSig0 -= 2;
5470 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5471 }
5472 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5473 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5474 if ( zSig1 == 0 ) zSig1 = 1;
5475 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5476 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5477 mul64To128( zSig1, zSig1, &term2, &term3 );
5478 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5479 while ( (sbits64) rem1 < 0 ) {
5480 --zSig1;
5481 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5482 term3 |= 1;
5483 term2 |= doubleZSig0;
5484 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5485 }
5486 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5487 }
5488 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5489 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5490
5491 }
5492
5493 /*----------------------------------------------------------------------------
5494 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5495 | the corresponding value `b', and 0 otherwise. The comparison is performed
5496 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5497 *----------------------------------------------------------------------------*/
5498
5499 int float128_eq( float128 a, float128 b STATUS_PARAM )
5500 {
5501
5502 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5503 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5504 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5505 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5506 ) {
5507 if ( float128_is_signaling_nan( a )
5508 || float128_is_signaling_nan( b ) ) {
5509 float_raise( float_flag_invalid STATUS_VAR);
5510 }
5511 return 0;
5512 }
5513 return
5514 ( a.low == b.low )
5515 && ( ( a.high == b.high )
5516 || ( ( a.low == 0 )
5517 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5518 );
5519
5520 }
5521
5522 /*----------------------------------------------------------------------------
5523 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5524 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5525 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5526 | Arithmetic.
5527 *----------------------------------------------------------------------------*/
5528
5529 int float128_le( float128 a, float128 b STATUS_PARAM )
5530 {
5531 flag aSign, bSign;
5532
5533 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5534 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5535 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5536 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5537 ) {
5538 float_raise( float_flag_invalid STATUS_VAR);
5539 return 0;
5540 }
5541 aSign = extractFloat128Sign( a );
5542 bSign = extractFloat128Sign( b );
5543 if ( aSign != bSign ) {
5544 return
5545 aSign
5546 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5547 == 0 );
5548 }
5549 return
5550 aSign ? le128( b.high, b.low, a.high, a.low )
5551 : le128( a.high, a.low, b.high, b.low );
5552
5553 }
5554
5555 /*----------------------------------------------------------------------------
5556 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5557 | the corresponding value `b', and 0 otherwise. The comparison is performed
5558 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5559 *----------------------------------------------------------------------------*/
5560
5561 int float128_lt( float128 a, float128 b STATUS_PARAM )
5562 {
5563 flag aSign, bSign;
5564
5565 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5566 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5567 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5568 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5569 ) {
5570 float_raise( float_flag_invalid STATUS_VAR);
5571 return 0;
5572 }
5573 aSign = extractFloat128Sign( a );
5574 bSign = extractFloat128Sign( b );
5575 if ( aSign != bSign ) {
5576 return
5577 aSign
5578 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5579 != 0 );
5580 }
5581 return
5582 aSign ? lt128( b.high, b.low, a.high, a.low )
5583 : lt128( a.high, a.low, b.high, b.low );
5584
5585 }
5586
5587 /*----------------------------------------------------------------------------
5588 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5589 | the corresponding value `b', and 0 otherwise. The invalid exception is
5590 | raised if either operand is a NaN. Otherwise, the comparison is performed
5591 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5592 *----------------------------------------------------------------------------*/
5593
5594 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5595 {
5596
5597 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5598 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5599 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5600 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5601 ) {
5602 float_raise( float_flag_invalid STATUS_VAR);
5603 return 0;
5604 }
5605 return
5606 ( a.low == b.low )
5607 && ( ( a.high == b.high )
5608 || ( ( a.low == 0 )
5609 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5610 );
5611
5612 }
5613
5614 /*----------------------------------------------------------------------------
5615 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5616 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5617 | cause an exception. Otherwise, the comparison is performed according to the
5618 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5619 *----------------------------------------------------------------------------*/
5620
5621 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5622 {
5623 flag aSign, bSign;
5624
5625 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5626 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5627 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5628 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5629 ) {
5630 if ( float128_is_signaling_nan( a )
5631 || float128_is_signaling_nan( b ) ) {
5632 float_raise( float_flag_invalid STATUS_VAR);
5633 }
5634 return 0;
5635 }
5636 aSign = extractFloat128Sign( a );
5637 bSign = extractFloat128Sign( b );
5638 if ( aSign != bSign ) {
5639 return
5640 aSign
5641 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5642 == 0 );
5643 }
5644 return
5645 aSign ? le128( b.high, b.low, a.high, a.low )
5646 : le128( a.high, a.low, b.high, b.low );
5647
5648 }
5649
5650 /*----------------------------------------------------------------------------
5651 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5652 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5653 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5654 | Standard for Binary Floating-Point Arithmetic.
5655 *----------------------------------------------------------------------------*/
5656
5657 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5658 {
5659 flag aSign, bSign;
5660
5661 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5662 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5663 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5664 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5665 ) {
5666 if ( float128_is_signaling_nan( a )
5667 || float128_is_signaling_nan( b ) ) {
5668 float_raise( float_flag_invalid STATUS_VAR);
5669 }
5670 return 0;
5671 }
5672 aSign = extractFloat128Sign( a );
5673 bSign = extractFloat128Sign( b );
5674 if ( aSign != bSign ) {
5675 return
5676 aSign
5677 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5678 != 0 );
5679 }
5680 return
5681 aSign ? lt128( b.high, b.low, a.high, a.low )
5682 : lt128( a.high, a.low, b.high, b.low );
5683
5684 }
5685
5686 #endif
5687
5688 /* misc functions */
5689 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5690 {
5691 return int64_to_float32(a STATUS_VAR);
5692 }
5693
5694 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5695 {
5696 return int64_to_float64(a STATUS_VAR);
5697 }
5698
5699 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5700 {
5701 int64_t v;
5702 unsigned int res;
5703
5704 v = float32_to_int64(a STATUS_VAR);
5705 if (v < 0) {
5706 res = 0;
5707 float_raise( float_flag_invalid STATUS_VAR);
5708 } else if (v > 0xffffffff) {
5709 res = 0xffffffff;
5710 float_raise( float_flag_invalid STATUS_VAR);
5711 } else {
5712 res = v;
5713 }
5714 return res;
5715 }
5716
5717 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5718 {
5719 int64_t v;
5720 unsigned int res;
5721
5722 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5723 if (v < 0) {
5724 res = 0;
5725 float_raise( float_flag_invalid STATUS_VAR);
5726 } else if (v > 0xffffffff) {
5727 res = 0xffffffff;
5728 float_raise( float_flag_invalid STATUS_VAR);
5729 } else {
5730 res = v;
5731 }
5732 return res;
5733 }
5734
5735 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5736 {
5737 int64_t v;
5738 unsigned int res;
5739
5740 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5741 if (v < 0) {
5742 res = 0;
5743 float_raise( float_flag_invalid STATUS_VAR);
5744 } else if (v > 0xffff) {
5745 res = 0xffff;
5746 float_raise( float_flag_invalid STATUS_VAR);
5747 } else {
5748 res = v;
5749 }
5750 return res;
5751 }
5752
5753 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5754 {
5755 int64_t v;
5756 unsigned int res;
5757
5758 v = float64_to_int64(a STATUS_VAR);
5759 if (v < 0) {
5760 res = 0;
5761 float_raise( float_flag_invalid STATUS_VAR);
5762 } else if (v > 0xffffffff) {
5763 res = 0xffffffff;
5764 float_raise( float_flag_invalid STATUS_VAR);
5765 } else {
5766 res = v;
5767 }
5768 return res;
5769 }
5770
5771 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5772 {
5773 int64_t v;
5774 unsigned int res;
5775
5776 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5777 if (v < 0) {
5778 res = 0;
5779 float_raise( float_flag_invalid STATUS_VAR);
5780 } else if (v > 0xffffffff) {
5781 res = 0xffffffff;
5782 float_raise( float_flag_invalid STATUS_VAR);
5783 } else {
5784 res = v;
5785 }
5786 return res;
5787 }
5788
5789 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5790 {
5791 int64_t v;
5792 unsigned int res;
5793
5794 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5795 if (v < 0) {
5796 res = 0;
5797 float_raise( float_flag_invalid STATUS_VAR);
5798 } else if (v > 0xffff) {
5799 res = 0xffff;
5800 float_raise( float_flag_invalid STATUS_VAR);
5801 } else {
5802 res = v;
5803 }
5804 return res;
5805 }
5806
5807 /* FIXME: This looks broken. */
5808 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5809 {
5810 int64_t v;
5811
5812 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5813 v += float64_val(a);
5814 v = float64_to_int64(make_float64(v) STATUS_VAR);
5815
5816 return v - INT64_MIN;
5817 }
5818
5819 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5820 {
5821 int64_t v;
5822
5823 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5824 v += float64_val(a);
5825 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5826
5827 return v - INT64_MIN;
5828 }
5829
5830 #define COMPARE(s, nan_exp) \
5831 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5832 int is_quiet STATUS_PARAM ) \
5833 { \
5834 flag aSign, bSign; \
5835 bits ## s av, bv; \
5836 \
5837 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5838 extractFloat ## s ## Frac( a ) ) || \
5839 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5840 extractFloat ## s ## Frac( b ) )) { \
5841 if (!is_quiet || \
5842 float ## s ## _is_signaling_nan( a ) || \
5843 float ## s ## _is_signaling_nan( b ) ) { \
5844 float_raise( float_flag_invalid STATUS_VAR); \
5845 } \
5846 return float_relation_unordered; \
5847 } \
5848 aSign = extractFloat ## s ## Sign( a ); \
5849 bSign = extractFloat ## s ## Sign( b ); \
5850 av = float ## s ## _val(a); \
5851 bv = float ## s ## _val(b); \
5852 if ( aSign != bSign ) { \
5853 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5854 /* zero case */ \
5855 return float_relation_equal; \
5856 } else { \
5857 return 1 - (2 * aSign); \
5858 } \
5859 } else { \
5860 if (av == bv) { \
5861 return float_relation_equal; \
5862 } else { \
5863 return 1 - 2 * (aSign ^ ( av < bv )); \
5864 } \
5865 } \
5866 } \
5867 \
5868 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5869 { \
5870 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5871 } \
5872 \
5873 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5874 { \
5875 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5876 }
5877
5878 COMPARE(32, 0xff)
5879 COMPARE(64, 0x7ff)
5880
5881 INLINE int float128_compare_internal( float128 a, float128 b,
5882 int is_quiet STATUS_PARAM )
5883 {
5884 flag aSign, bSign;
5885
5886 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5887 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5888 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5889 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5890 if (!is_quiet ||
5891 float128_is_signaling_nan( a ) ||
5892 float128_is_signaling_nan( b ) ) {
5893 float_raise( float_flag_invalid STATUS_VAR);
5894 }
5895 return float_relation_unordered;
5896 }
5897 aSign = extractFloat128Sign( a );
5898 bSign = extractFloat128Sign( b );
5899 if ( aSign != bSign ) {
5900 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5901 /* zero case */
5902 return float_relation_equal;
5903 } else {
5904 return 1 - (2 * aSign);
5905 }
5906 } else {
5907 if (a.low == b.low && a.high == b.high) {
5908 return float_relation_equal;
5909 } else {
5910 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5911 }
5912 }
5913 }
5914
5915 int float128_compare( float128 a, float128 b STATUS_PARAM )
5916 {
5917 return float128_compare_internal(a, b, 0 STATUS_VAR);
5918 }
5919
5920 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5921 {
5922 return float128_compare_internal(a, b, 1 STATUS_VAR);
5923 }
5924
5925 /* Multiply A by 2 raised to the power N. */
5926 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5927 {
5928 flag aSign;
5929 int16 aExp;
5930 bits32 aSig;
5931
5932 aSig = extractFloat32Frac( a );
5933 aExp = extractFloat32Exp( a );
5934 aSign = extractFloat32Sign( a );
5935
5936 if ( aExp == 0xFF ) {
5937 return a;
5938 }
5939 if ( aExp != 0 )
5940 aSig |= 0x00800000;
5941 else if ( aSig == 0 )
5942 return a;
5943
5944 aExp += n - 1;
5945 aSig <<= 7;
5946 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5947 }
5948
5949 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5950 {
5951 flag aSign;
5952 int16 aExp;
5953 bits64 aSig;
5954
5955 aSig = extractFloat64Frac( a );
5956 aExp = extractFloat64Exp( a );
5957 aSign = extractFloat64Sign( a );
5958
5959 if ( aExp == 0x7FF ) {
5960 return a;
5961 }
5962 if ( aExp != 0 )
5963 aSig |= LIT64( 0x0010000000000000 );
5964 else if ( aSig == 0 )
5965 return a;
5966
5967 aExp += n - 1;
5968 aSig <<= 10;
5969 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5970 }
5971
5972 #ifdef FLOATX80
5973 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5974 {
5975 flag aSign;
5976 int16 aExp;
5977 bits64 aSig;
5978
5979 aSig = extractFloatx80Frac( a );
5980 aExp = extractFloatx80Exp( a );
5981 aSign = extractFloatx80Sign( a );
5982
5983 if ( aExp == 0x7FF ) {
5984 return a;
5985 }
5986 if (aExp == 0 && aSig == 0)
5987 return a;
5988
5989 aExp += n;
5990 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5991 aSign, aExp, aSig, 0 STATUS_VAR );
5992 }
5993 #endif
5994
5995 #ifdef FLOAT128
5996 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5997 {
5998 flag aSign;
5999 int32 aExp;
6000 bits64 aSig0, aSig1;
6001
6002 aSig1 = extractFloat128Frac1( a );
6003 aSig0 = extractFloat128Frac0( a );
6004 aExp = extractFloat128Exp( a );
6005 aSign = extractFloat128Sign( a );
6006 if ( aExp == 0x7FFF ) {
6007 return a;
6008 }
6009 if ( aExp != 0 )
6010 aSig0 |= LIT64( 0x0001000000000000 );
6011 else if ( aSig0 == 0 && aSig1 == 0 )
6012 return a;
6013
6014 aExp += n - 1;
6015 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6016 STATUS_VAR );
6017
6018 }
6019 #endif