]> git.proxmox.com Git - mirror_qemu.git/blob - fpu/softfloat.c
d4ca7cf7f122db608684fe88b02f4836b7fae813
[mirror_qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
6
7 /*============================================================================
8
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
11
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
21
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
35
36 =============================================================================*/
37
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
40 */
41 #include "config.h"
42
43 #include "fpu/softfloat.h"
44
45 /*----------------------------------------------------------------------------
46 | Primitive arithmetic functions, including multi-word arithmetic, and
47 | division and square root approximations. (Can be specialized to target if
48 | desired.)
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-macros.h"
51
52 /*----------------------------------------------------------------------------
53 | Functions and definitions to determine: (1) whether tininess for underflow
54 | is detected before or after rounding by default, (2) what (if anything)
55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57 | are propagated from function inputs to output. These details are target-
58 | specific.
59 *----------------------------------------------------------------------------*/
60 #include "softfloat-specialize.h"
61
62 /*----------------------------------------------------------------------------
63 | Returns the fraction bits of the half-precision floating-point value `a'.
64 *----------------------------------------------------------------------------*/
65
66 INLINE uint32_t extractFloat16Frac(float16 a)
67 {
68 return float16_val(a) & 0x3ff;
69 }
70
71 /*----------------------------------------------------------------------------
72 | Returns the exponent bits of the half-precision floating-point value `a'.
73 *----------------------------------------------------------------------------*/
74
75 INLINE int_fast16_t extractFloat16Exp(float16 a)
76 {
77 return (float16_val(a) >> 10) & 0x1f;
78 }
79
80 /*----------------------------------------------------------------------------
81 | Returns the sign bit of the single-precision floating-point value `a'.
82 *----------------------------------------------------------------------------*/
83
84 INLINE flag extractFloat16Sign(float16 a)
85 {
86 return float16_val(a)>>15;
87 }
88
89 /*----------------------------------------------------------------------------
90 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
91 | and 7, and returns the properly rounded 32-bit integer corresponding to the
92 | input. If `zSign' is 1, the input is negated before being converted to an
93 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
94 | is simply rounded to an integer, with the inexact exception raised if the
95 | input cannot be represented exactly as an integer. However, if the fixed-
96 | point input is too large, the invalid exception is raised and the largest
97 | positive or negative integer is returned.
98 *----------------------------------------------------------------------------*/
99
100 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
101 {
102 int8 roundingMode;
103 flag roundNearestEven;
104 int8 roundIncrement, roundBits;
105 int32_t z;
106
107 roundingMode = STATUS(float_rounding_mode);
108 roundNearestEven = ( roundingMode == float_round_nearest_even );
109 roundIncrement = 0x40;
110 if ( ! roundNearestEven ) {
111 if ( roundingMode == float_round_to_zero ) {
112 roundIncrement = 0;
113 }
114 else {
115 roundIncrement = 0x7F;
116 if ( zSign ) {
117 if ( roundingMode == float_round_up ) roundIncrement = 0;
118 }
119 else {
120 if ( roundingMode == float_round_down ) roundIncrement = 0;
121 }
122 }
123 }
124 roundBits = absZ & 0x7F;
125 absZ = ( absZ + roundIncrement )>>7;
126 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
127 z = absZ;
128 if ( zSign ) z = - z;
129 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
130 float_raise( float_flag_invalid STATUS_VAR);
131 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
132 }
133 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
134 return z;
135
136 }
137
138 /*----------------------------------------------------------------------------
139 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
140 | `absZ1', with binary point between bits 63 and 64 (between the input words),
141 | and returns the properly rounded 64-bit integer corresponding to the input.
142 | If `zSign' is 1, the input is negated before being converted to an integer.
143 | Ordinarily, the fixed-point input is simply rounded to an integer, with
144 | the inexact exception raised if the input cannot be represented exactly as
145 | an integer. However, if the fixed-point input is too large, the invalid
146 | exception is raised and the largest positive or negative integer is
147 | returned.
148 *----------------------------------------------------------------------------*/
149
150 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
151 {
152 int8 roundingMode;
153 flag roundNearestEven, increment;
154 int64_t z;
155
156 roundingMode = STATUS(float_rounding_mode);
157 roundNearestEven = ( roundingMode == float_round_nearest_even );
158 increment = ( (int64_t) absZ1 < 0 );
159 if ( ! roundNearestEven ) {
160 if ( roundingMode == float_round_to_zero ) {
161 increment = 0;
162 }
163 else {
164 if ( zSign ) {
165 increment = ( roundingMode == float_round_down ) && absZ1;
166 }
167 else {
168 increment = ( roundingMode == float_round_up ) && absZ1;
169 }
170 }
171 }
172 if ( increment ) {
173 ++absZ0;
174 if ( absZ0 == 0 ) goto overflow;
175 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
176 }
177 z = absZ0;
178 if ( zSign ) z = - z;
179 if ( z && ( ( z < 0 ) ^ zSign ) ) {
180 overflow:
181 float_raise( float_flag_invalid STATUS_VAR);
182 return
183 zSign ? (int64_t) LIT64( 0x8000000000000000 )
184 : LIT64( 0x7FFFFFFFFFFFFFFF );
185 }
186 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
187 return z;
188
189 }
190
191 /*----------------------------------------------------------------------------
192 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
193 | `absZ1', with binary point between bits 63 and 64 (between the input words),
194 | and returns the properly rounded 64-bit unsigned integer corresponding to the
195 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
196 | with the inexact exception raised if the input cannot be represented exactly
197 | as an integer. However, if the fixed-point input is too large, the invalid
198 | exception is raised and the largest unsigned integer is returned.
199 *----------------------------------------------------------------------------*/
200
201 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
202 uint64_t absZ1 STATUS_PARAM)
203 {
204 int8 roundingMode;
205 flag roundNearestEven, increment;
206
207 roundingMode = STATUS(float_rounding_mode);
208 roundNearestEven = (roundingMode == float_round_nearest_even);
209 increment = ((int64_t)absZ1 < 0);
210 if (!roundNearestEven) {
211 if (roundingMode == float_round_to_zero) {
212 increment = 0;
213 } else if (absZ1) {
214 if (zSign) {
215 increment = (roundingMode == float_round_down) && absZ1;
216 } else {
217 increment = (roundingMode == float_round_up) && absZ1;
218 }
219 }
220 }
221 if (increment) {
222 ++absZ0;
223 if (absZ0 == 0) {
224 float_raise(float_flag_invalid STATUS_VAR);
225 return LIT64(0xFFFFFFFFFFFFFFFF);
226 }
227 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
228 }
229
230 if (zSign && absZ0) {
231 float_raise(float_flag_invalid STATUS_VAR);
232 return 0;
233 }
234
235 if (absZ1) {
236 STATUS(float_exception_flags) |= float_flag_inexact;
237 }
238 return absZ0;
239 }
240
241 /*----------------------------------------------------------------------------
242 | Returns the fraction bits of the single-precision floating-point value `a'.
243 *----------------------------------------------------------------------------*/
244
245 INLINE uint32_t extractFloat32Frac( float32 a )
246 {
247
248 return float32_val(a) & 0x007FFFFF;
249
250 }
251
252 /*----------------------------------------------------------------------------
253 | Returns the exponent bits of the single-precision floating-point value `a'.
254 *----------------------------------------------------------------------------*/
255
256 INLINE int_fast16_t extractFloat32Exp(float32 a)
257 {
258
259 return ( float32_val(a)>>23 ) & 0xFF;
260
261 }
262
263 /*----------------------------------------------------------------------------
264 | Returns the sign bit of the single-precision floating-point value `a'.
265 *----------------------------------------------------------------------------*/
266
267 INLINE flag extractFloat32Sign( float32 a )
268 {
269
270 return float32_val(a)>>31;
271
272 }
273
274 /*----------------------------------------------------------------------------
275 | If `a' is denormal and we are in flush-to-zero mode then set the
276 | input-denormal exception and return zero. Otherwise just return the value.
277 *----------------------------------------------------------------------------*/
278 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
279 {
280 if (STATUS(flush_inputs_to_zero)) {
281 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
282 float_raise(float_flag_input_denormal STATUS_VAR);
283 return make_float32(float32_val(a) & 0x80000000);
284 }
285 }
286 return a;
287 }
288
289 /*----------------------------------------------------------------------------
290 | Normalizes the subnormal single-precision floating-point value represented
291 | by the denormalized significand `aSig'. The normalized exponent and
292 | significand are stored at the locations pointed to by `zExpPtr' and
293 | `zSigPtr', respectively.
294 *----------------------------------------------------------------------------*/
295
296 static void
297 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
298 {
299 int8 shiftCount;
300
301 shiftCount = countLeadingZeros32( aSig ) - 8;
302 *zSigPtr = aSig<<shiftCount;
303 *zExpPtr = 1 - shiftCount;
304
305 }
306
307 /*----------------------------------------------------------------------------
308 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
309 | single-precision floating-point value, returning the result. After being
310 | shifted into the proper positions, the three fields are simply added
311 | together to form the result. This means that any integer portion of `zSig'
312 | will be added into the exponent. Since a properly normalized significand
313 | will have an integer portion equal to 1, the `zExp' input should be 1 less
314 | than the desired result exponent whenever `zSig' is a complete, normalized
315 | significand.
316 *----------------------------------------------------------------------------*/
317
318 INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
319 {
320
321 return make_float32(
322 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
323
324 }
325
326 /*----------------------------------------------------------------------------
327 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
328 | and significand `zSig', and returns the proper single-precision floating-
329 | point value corresponding to the abstract input. Ordinarily, the abstract
330 | value is simply rounded and packed into the single-precision format, with
331 | the inexact exception raised if the abstract input cannot be represented
332 | exactly. However, if the abstract value is too large, the overflow and
333 | inexact exceptions are raised and an infinity or maximal finite value is
334 | returned. If the abstract value is too small, the input value is rounded to
335 | a subnormal number, and the underflow and inexact exceptions are raised if
336 | the abstract input cannot be represented exactly as a subnormal single-
337 | precision floating-point number.
338 | The input significand `zSig' has its binary point between bits 30
339 | and 29, which is 7 bits to the left of the usual location. This shifted
340 | significand must be normalized or smaller. If `zSig' is not normalized,
341 | `zExp' must be 0; in that case, the result returned is a subnormal number,
342 | and it must not require rounding. In the usual case that `zSig' is
343 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
344 | The handling of underflow and overflow follows the IEC/IEEE Standard for
345 | Binary Floating-Point Arithmetic.
346 *----------------------------------------------------------------------------*/
347
348 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
349 {
350 int8 roundingMode;
351 flag roundNearestEven;
352 int8 roundIncrement, roundBits;
353 flag isTiny;
354
355 roundingMode = STATUS(float_rounding_mode);
356 roundNearestEven = ( roundingMode == float_round_nearest_even );
357 roundIncrement = 0x40;
358 if ( ! roundNearestEven ) {
359 if ( roundingMode == float_round_to_zero ) {
360 roundIncrement = 0;
361 }
362 else {
363 roundIncrement = 0x7F;
364 if ( zSign ) {
365 if ( roundingMode == float_round_up ) roundIncrement = 0;
366 }
367 else {
368 if ( roundingMode == float_round_down ) roundIncrement = 0;
369 }
370 }
371 }
372 roundBits = zSig & 0x7F;
373 if ( 0xFD <= (uint16_t) zExp ) {
374 if ( ( 0xFD < zExp )
375 || ( ( zExp == 0xFD )
376 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
377 ) {
378 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
379 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
380 }
381 if ( zExp < 0 ) {
382 if (STATUS(flush_to_zero)) {
383 float_raise(float_flag_output_denormal STATUS_VAR);
384 return packFloat32(zSign, 0, 0);
385 }
386 isTiny =
387 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
388 || ( zExp < -1 )
389 || ( zSig + roundIncrement < 0x80000000 );
390 shift32RightJamming( zSig, - zExp, &zSig );
391 zExp = 0;
392 roundBits = zSig & 0x7F;
393 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
394 }
395 }
396 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
397 zSig = ( zSig + roundIncrement )>>7;
398 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
399 if ( zSig == 0 ) zExp = 0;
400 return packFloat32( zSign, zExp, zSig );
401
402 }
403
404 /*----------------------------------------------------------------------------
405 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
406 | and significand `zSig', and returns the proper single-precision floating-
407 | point value corresponding to the abstract input. This routine is just like
408 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
409 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
410 | floating-point exponent.
411 *----------------------------------------------------------------------------*/
412
413 static float32
414 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
415 {
416 int8 shiftCount;
417
418 shiftCount = countLeadingZeros32( zSig ) - 1;
419 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
420
421 }
422
423 /*----------------------------------------------------------------------------
424 | Returns the fraction bits of the double-precision floating-point value `a'.
425 *----------------------------------------------------------------------------*/
426
427 INLINE uint64_t extractFloat64Frac( float64 a )
428 {
429
430 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
431
432 }
433
434 /*----------------------------------------------------------------------------
435 | Returns the exponent bits of the double-precision floating-point value `a'.
436 *----------------------------------------------------------------------------*/
437
438 INLINE int_fast16_t extractFloat64Exp(float64 a)
439 {
440
441 return ( float64_val(a)>>52 ) & 0x7FF;
442
443 }
444
445 /*----------------------------------------------------------------------------
446 | Returns the sign bit of the double-precision floating-point value `a'.
447 *----------------------------------------------------------------------------*/
448
449 INLINE flag extractFloat64Sign( float64 a )
450 {
451
452 return float64_val(a)>>63;
453
454 }
455
456 /*----------------------------------------------------------------------------
457 | If `a' is denormal and we are in flush-to-zero mode then set the
458 | input-denormal exception and return zero. Otherwise just return the value.
459 *----------------------------------------------------------------------------*/
460 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
461 {
462 if (STATUS(flush_inputs_to_zero)) {
463 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
464 float_raise(float_flag_input_denormal STATUS_VAR);
465 return make_float64(float64_val(a) & (1ULL << 63));
466 }
467 }
468 return a;
469 }
470
471 /*----------------------------------------------------------------------------
472 | Normalizes the subnormal double-precision floating-point value represented
473 | by the denormalized significand `aSig'. The normalized exponent and
474 | significand are stored at the locations pointed to by `zExpPtr' and
475 | `zSigPtr', respectively.
476 *----------------------------------------------------------------------------*/
477
478 static void
479 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
480 {
481 int8 shiftCount;
482
483 shiftCount = countLeadingZeros64( aSig ) - 11;
484 *zSigPtr = aSig<<shiftCount;
485 *zExpPtr = 1 - shiftCount;
486
487 }
488
489 /*----------------------------------------------------------------------------
490 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
491 | double-precision floating-point value, returning the result. After being
492 | shifted into the proper positions, the three fields are simply added
493 | together to form the result. This means that any integer portion of `zSig'
494 | will be added into the exponent. Since a properly normalized significand
495 | will have an integer portion equal to 1, the `zExp' input should be 1 less
496 | than the desired result exponent whenever `zSig' is a complete, normalized
497 | significand.
498 *----------------------------------------------------------------------------*/
499
500 INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
501 {
502
503 return make_float64(
504 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
505
506 }
507
508 /*----------------------------------------------------------------------------
509 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
510 | and significand `zSig', and returns the proper double-precision floating-
511 | point value corresponding to the abstract input. Ordinarily, the abstract
512 | value is simply rounded and packed into the double-precision format, with
513 | the inexact exception raised if the abstract input cannot be represented
514 | exactly. However, if the abstract value is too large, the overflow and
515 | inexact exceptions are raised and an infinity or maximal finite value is
516 | returned. If the abstract value is too small, the input value is rounded
517 | to a subnormal number, and the underflow and inexact exceptions are raised
518 | if the abstract input cannot be represented exactly as a subnormal double-
519 | precision floating-point number.
520 | The input significand `zSig' has its binary point between bits 62
521 | and 61, which is 10 bits to the left of the usual location. This shifted
522 | significand must be normalized or smaller. If `zSig' is not normalized,
523 | `zExp' must be 0; in that case, the result returned is a subnormal number,
524 | and it must not require rounding. In the usual case that `zSig' is
525 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
526 | The handling of underflow and overflow follows the IEC/IEEE Standard for
527 | Binary Floating-Point Arithmetic.
528 *----------------------------------------------------------------------------*/
529
530 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
531 {
532 int8 roundingMode;
533 flag roundNearestEven;
534 int_fast16_t roundIncrement, roundBits;
535 flag isTiny;
536
537 roundingMode = STATUS(float_rounding_mode);
538 roundNearestEven = ( roundingMode == float_round_nearest_even );
539 roundIncrement = 0x200;
540 if ( ! roundNearestEven ) {
541 if ( roundingMode == float_round_to_zero ) {
542 roundIncrement = 0;
543 }
544 else {
545 roundIncrement = 0x3FF;
546 if ( zSign ) {
547 if ( roundingMode == float_round_up ) roundIncrement = 0;
548 }
549 else {
550 if ( roundingMode == float_round_down ) roundIncrement = 0;
551 }
552 }
553 }
554 roundBits = zSig & 0x3FF;
555 if ( 0x7FD <= (uint16_t) zExp ) {
556 if ( ( 0x7FD < zExp )
557 || ( ( zExp == 0x7FD )
558 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
559 ) {
560 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
561 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
562 }
563 if ( zExp < 0 ) {
564 if (STATUS(flush_to_zero)) {
565 float_raise(float_flag_output_denormal STATUS_VAR);
566 return packFloat64(zSign, 0, 0);
567 }
568 isTiny =
569 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
570 || ( zExp < -1 )
571 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
572 shift64RightJamming( zSig, - zExp, &zSig );
573 zExp = 0;
574 roundBits = zSig & 0x3FF;
575 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
576 }
577 }
578 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
579 zSig = ( zSig + roundIncrement )>>10;
580 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
581 if ( zSig == 0 ) zExp = 0;
582 return packFloat64( zSign, zExp, zSig );
583
584 }
585
586 /*----------------------------------------------------------------------------
587 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
588 | and significand `zSig', and returns the proper double-precision floating-
589 | point value corresponding to the abstract input. This routine is just like
590 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
591 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
592 | floating-point exponent.
593 *----------------------------------------------------------------------------*/
594
595 static float64
596 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
597 {
598 int8 shiftCount;
599
600 shiftCount = countLeadingZeros64( zSig ) - 1;
601 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
602
603 }
604
605 /*----------------------------------------------------------------------------
606 | Returns the fraction bits of the extended double-precision floating-point
607 | value `a'.
608 *----------------------------------------------------------------------------*/
609
610 INLINE uint64_t extractFloatx80Frac( floatx80 a )
611 {
612
613 return a.low;
614
615 }
616
617 /*----------------------------------------------------------------------------
618 | Returns the exponent bits of the extended double-precision floating-point
619 | value `a'.
620 *----------------------------------------------------------------------------*/
621
622 INLINE int32 extractFloatx80Exp( floatx80 a )
623 {
624
625 return a.high & 0x7FFF;
626
627 }
628
629 /*----------------------------------------------------------------------------
630 | Returns the sign bit of the extended double-precision floating-point value
631 | `a'.
632 *----------------------------------------------------------------------------*/
633
634 INLINE flag extractFloatx80Sign( floatx80 a )
635 {
636
637 return a.high>>15;
638
639 }
640
641 /*----------------------------------------------------------------------------
642 | Normalizes the subnormal extended double-precision floating-point value
643 | represented by the denormalized significand `aSig'. The normalized exponent
644 | and significand are stored at the locations pointed to by `zExpPtr' and
645 | `zSigPtr', respectively.
646 *----------------------------------------------------------------------------*/
647
648 static void
649 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
650 {
651 int8 shiftCount;
652
653 shiftCount = countLeadingZeros64( aSig );
654 *zSigPtr = aSig<<shiftCount;
655 *zExpPtr = 1 - shiftCount;
656
657 }
658
659 /*----------------------------------------------------------------------------
660 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
661 | extended double-precision floating-point value, returning the result.
662 *----------------------------------------------------------------------------*/
663
664 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
665 {
666 floatx80 z;
667
668 z.low = zSig;
669 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
670 return z;
671
672 }
673
674 /*----------------------------------------------------------------------------
675 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
676 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
677 | and returns the proper extended double-precision floating-point value
678 | corresponding to the abstract input. Ordinarily, the abstract value is
679 | rounded and packed into the extended double-precision format, with the
680 | inexact exception raised if the abstract input cannot be represented
681 | exactly. However, if the abstract value is too large, the overflow and
682 | inexact exceptions are raised and an infinity or maximal finite value is
683 | returned. If the abstract value is too small, the input value is rounded to
684 | a subnormal number, and the underflow and inexact exceptions are raised if
685 | the abstract input cannot be represented exactly as a subnormal extended
686 | double-precision floating-point number.
687 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
688 | number of bits as single or double precision, respectively. Otherwise, the
689 | result is rounded to the full precision of the extended double-precision
690 | format.
691 | The input significand must be normalized or smaller. If the input
692 | significand is not normalized, `zExp' must be 0; in that case, the result
693 | returned is a subnormal number, and it must not require rounding. The
694 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
695 | Floating-Point Arithmetic.
696 *----------------------------------------------------------------------------*/
697
698 static floatx80
699 roundAndPackFloatx80(
700 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
701 STATUS_PARAM)
702 {
703 int8 roundingMode;
704 flag roundNearestEven, increment, isTiny;
705 int64 roundIncrement, roundMask, roundBits;
706
707 roundingMode = STATUS(float_rounding_mode);
708 roundNearestEven = ( roundingMode == float_round_nearest_even );
709 if ( roundingPrecision == 80 ) goto precision80;
710 if ( roundingPrecision == 64 ) {
711 roundIncrement = LIT64( 0x0000000000000400 );
712 roundMask = LIT64( 0x00000000000007FF );
713 }
714 else if ( roundingPrecision == 32 ) {
715 roundIncrement = LIT64( 0x0000008000000000 );
716 roundMask = LIT64( 0x000000FFFFFFFFFF );
717 }
718 else {
719 goto precision80;
720 }
721 zSig0 |= ( zSig1 != 0 );
722 if ( ! roundNearestEven ) {
723 if ( roundingMode == float_round_to_zero ) {
724 roundIncrement = 0;
725 }
726 else {
727 roundIncrement = roundMask;
728 if ( zSign ) {
729 if ( roundingMode == float_round_up ) roundIncrement = 0;
730 }
731 else {
732 if ( roundingMode == float_round_down ) roundIncrement = 0;
733 }
734 }
735 }
736 roundBits = zSig0 & roundMask;
737 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
738 if ( ( 0x7FFE < zExp )
739 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
740 ) {
741 goto overflow;
742 }
743 if ( zExp <= 0 ) {
744 if (STATUS(flush_to_zero)) {
745 float_raise(float_flag_output_denormal STATUS_VAR);
746 return packFloatx80(zSign, 0, 0);
747 }
748 isTiny =
749 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
750 || ( zExp < 0 )
751 || ( zSig0 <= zSig0 + roundIncrement );
752 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
753 zExp = 0;
754 roundBits = zSig0 & roundMask;
755 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
756 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
757 zSig0 += roundIncrement;
758 if ( (int64_t) zSig0 < 0 ) zExp = 1;
759 roundIncrement = roundMask + 1;
760 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
761 roundMask |= roundIncrement;
762 }
763 zSig0 &= ~ roundMask;
764 return packFloatx80( zSign, zExp, zSig0 );
765 }
766 }
767 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
768 zSig0 += roundIncrement;
769 if ( zSig0 < roundIncrement ) {
770 ++zExp;
771 zSig0 = LIT64( 0x8000000000000000 );
772 }
773 roundIncrement = roundMask + 1;
774 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
775 roundMask |= roundIncrement;
776 }
777 zSig0 &= ~ roundMask;
778 if ( zSig0 == 0 ) zExp = 0;
779 return packFloatx80( zSign, zExp, zSig0 );
780 precision80:
781 increment = ( (int64_t) zSig1 < 0 );
782 if ( ! roundNearestEven ) {
783 if ( roundingMode == float_round_to_zero ) {
784 increment = 0;
785 }
786 else {
787 if ( zSign ) {
788 increment = ( roundingMode == float_round_down ) && zSig1;
789 }
790 else {
791 increment = ( roundingMode == float_round_up ) && zSig1;
792 }
793 }
794 }
795 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
796 if ( ( 0x7FFE < zExp )
797 || ( ( zExp == 0x7FFE )
798 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
799 && increment
800 )
801 ) {
802 roundMask = 0;
803 overflow:
804 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
805 if ( ( roundingMode == float_round_to_zero )
806 || ( zSign && ( roundingMode == float_round_up ) )
807 || ( ! zSign && ( roundingMode == float_round_down ) )
808 ) {
809 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
810 }
811 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
812 }
813 if ( zExp <= 0 ) {
814 isTiny =
815 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
816 || ( zExp < 0 )
817 || ! increment
818 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
819 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
820 zExp = 0;
821 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
822 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
823 if ( roundNearestEven ) {
824 increment = ( (int64_t) zSig1 < 0 );
825 }
826 else {
827 if ( zSign ) {
828 increment = ( roundingMode == float_round_down ) && zSig1;
829 }
830 else {
831 increment = ( roundingMode == float_round_up ) && zSig1;
832 }
833 }
834 if ( increment ) {
835 ++zSig0;
836 zSig0 &=
837 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
838 if ( (int64_t) zSig0 < 0 ) zExp = 1;
839 }
840 return packFloatx80( zSign, zExp, zSig0 );
841 }
842 }
843 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
844 if ( increment ) {
845 ++zSig0;
846 if ( zSig0 == 0 ) {
847 ++zExp;
848 zSig0 = LIT64( 0x8000000000000000 );
849 }
850 else {
851 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
852 }
853 }
854 else {
855 if ( zSig0 == 0 ) zExp = 0;
856 }
857 return packFloatx80( zSign, zExp, zSig0 );
858
859 }
860
861 /*----------------------------------------------------------------------------
862 | Takes an abstract floating-point value having sign `zSign', exponent
863 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
864 | and returns the proper extended double-precision floating-point value
865 | corresponding to the abstract input. This routine is just like
866 | `roundAndPackFloatx80' except that the input significand does not have to be
867 | normalized.
868 *----------------------------------------------------------------------------*/
869
870 static floatx80
871 normalizeRoundAndPackFloatx80(
872 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
873 STATUS_PARAM)
874 {
875 int8 shiftCount;
876
877 if ( zSig0 == 0 ) {
878 zSig0 = zSig1;
879 zSig1 = 0;
880 zExp -= 64;
881 }
882 shiftCount = countLeadingZeros64( zSig0 );
883 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
884 zExp -= shiftCount;
885 return
886 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
887
888 }
889
890 /*----------------------------------------------------------------------------
891 | Returns the least-significant 64 fraction bits of the quadruple-precision
892 | floating-point value `a'.
893 *----------------------------------------------------------------------------*/
894
895 INLINE uint64_t extractFloat128Frac1( float128 a )
896 {
897
898 return a.low;
899
900 }
901
902 /*----------------------------------------------------------------------------
903 | Returns the most-significant 48 fraction bits of the quadruple-precision
904 | floating-point value `a'.
905 *----------------------------------------------------------------------------*/
906
907 INLINE uint64_t extractFloat128Frac0( float128 a )
908 {
909
910 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
911
912 }
913
914 /*----------------------------------------------------------------------------
915 | Returns the exponent bits of the quadruple-precision floating-point value
916 | `a'.
917 *----------------------------------------------------------------------------*/
918
919 INLINE int32 extractFloat128Exp( float128 a )
920 {
921
922 return ( a.high>>48 ) & 0x7FFF;
923
924 }
925
926 /*----------------------------------------------------------------------------
927 | Returns the sign bit of the quadruple-precision floating-point value `a'.
928 *----------------------------------------------------------------------------*/
929
930 INLINE flag extractFloat128Sign( float128 a )
931 {
932
933 return a.high>>63;
934
935 }
936
937 /*----------------------------------------------------------------------------
938 | Normalizes the subnormal quadruple-precision floating-point value
939 | represented by the denormalized significand formed by the concatenation of
940 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
941 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
942 | significand are stored at the location pointed to by `zSig0Ptr', and the
943 | least significant 64 bits of the normalized significand are stored at the
944 | location pointed to by `zSig1Ptr'.
945 *----------------------------------------------------------------------------*/
946
947 static void
948 normalizeFloat128Subnormal(
949 uint64_t aSig0,
950 uint64_t aSig1,
951 int32 *zExpPtr,
952 uint64_t *zSig0Ptr,
953 uint64_t *zSig1Ptr
954 )
955 {
956 int8 shiftCount;
957
958 if ( aSig0 == 0 ) {
959 shiftCount = countLeadingZeros64( aSig1 ) - 15;
960 if ( shiftCount < 0 ) {
961 *zSig0Ptr = aSig1>>( - shiftCount );
962 *zSig1Ptr = aSig1<<( shiftCount & 63 );
963 }
964 else {
965 *zSig0Ptr = aSig1<<shiftCount;
966 *zSig1Ptr = 0;
967 }
968 *zExpPtr = - shiftCount - 63;
969 }
970 else {
971 shiftCount = countLeadingZeros64( aSig0 ) - 15;
972 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
973 *zExpPtr = 1 - shiftCount;
974 }
975
976 }
977
978 /*----------------------------------------------------------------------------
979 | Packs the sign `zSign', the exponent `zExp', and the significand formed
980 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
981 | floating-point value, returning the result. After being shifted into the
982 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 64 bits of the result.  This
984 | means that any integer portion of `zSig0' will be added into the exponent.
985 | Since a properly normalized significand will have an integer portion equal
986 | to 1, the `zExp' input should be 1 less than the desired result exponent
987 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
988 | significand.
989 *----------------------------------------------------------------------------*/
990
991 INLINE float128
992 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
993 {
994 float128 z;
995
996 z.low = zSig1;
997 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
998 return z;
999
1000 }
1001
1002 /*----------------------------------------------------------------------------
1003 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1004 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1005 | and `zSig2', and returns the proper quadruple-precision floating-point value
1006 | corresponding to the abstract input. Ordinarily, the abstract value is
1007 | simply rounded and packed into the quadruple-precision format, with the
1008 | inexact exception raised if the abstract input cannot be represented
1009 | exactly. However, if the abstract value is too large, the overflow and
1010 | inexact exceptions are raised and an infinity or maximal finite value is
1011 | returned. If the abstract value is too small, the input value is rounded to
1012 | a subnormal number, and the underflow and inexact exceptions are raised if
1013 | the abstract input cannot be represented exactly as a subnormal quadruple-
1014 | precision floating-point number.
1015 | The input significand must be normalized or smaller. If the input
1016 | significand is not normalized, `zExp' must be 0; in that case, the result
1017 | returned is a subnormal number, and it must not require rounding. In the
1018 | usual case that the input significand is normalized, `zExp' must be 1 less
1019 | than the ``true'' floating-point exponent. The handling of underflow and
1020 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1021 *----------------------------------------------------------------------------*/
1022
1023 static float128
1024 roundAndPackFloat128(
1025 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1026 {
1027 int8 roundingMode;
1028 flag roundNearestEven, increment, isTiny;
1029
1030 roundingMode = STATUS(float_rounding_mode);
1031 roundNearestEven = ( roundingMode == float_round_nearest_even );
1032 increment = ( (int64_t) zSig2 < 0 );
1033 if ( ! roundNearestEven ) {
1034 if ( roundingMode == float_round_to_zero ) {
1035 increment = 0;
1036 }
1037 else {
1038 if ( zSign ) {
1039 increment = ( roundingMode == float_round_down ) && zSig2;
1040 }
1041 else {
1042 increment = ( roundingMode == float_round_up ) && zSig2;
1043 }
1044 }
1045 }
1046 if ( 0x7FFD <= (uint32_t) zExp ) {
1047 if ( ( 0x7FFD < zExp )
1048 || ( ( zExp == 0x7FFD )
1049 && eq128(
1050 LIT64( 0x0001FFFFFFFFFFFF ),
1051 LIT64( 0xFFFFFFFFFFFFFFFF ),
1052 zSig0,
1053 zSig1
1054 )
1055 && increment
1056 )
1057 ) {
1058 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1059 if ( ( roundingMode == float_round_to_zero )
1060 || ( zSign && ( roundingMode == float_round_up ) )
1061 || ( ! zSign && ( roundingMode == float_round_down ) )
1062 ) {
1063 return
1064 packFloat128(
1065 zSign,
1066 0x7FFE,
1067 LIT64( 0x0000FFFFFFFFFFFF ),
1068 LIT64( 0xFFFFFFFFFFFFFFFF )
1069 );
1070 }
1071 return packFloat128( zSign, 0x7FFF, 0, 0 );
1072 }
1073 if ( zExp < 0 ) {
1074 if (STATUS(flush_to_zero)) {
1075 float_raise(float_flag_output_denormal STATUS_VAR);
1076 return packFloat128(zSign, 0, 0, 0);
1077 }
1078 isTiny =
1079 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1080 || ( zExp < -1 )
1081 || ! increment
1082 || lt128(
1083 zSig0,
1084 zSig1,
1085 LIT64( 0x0001FFFFFFFFFFFF ),
1086 LIT64( 0xFFFFFFFFFFFFFFFF )
1087 );
1088 shift128ExtraRightJamming(
1089 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1090 zExp = 0;
1091 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1092 if ( roundNearestEven ) {
1093 increment = ( (int64_t) zSig2 < 0 );
1094 }
1095 else {
1096 if ( zSign ) {
1097 increment = ( roundingMode == float_round_down ) && zSig2;
1098 }
1099 else {
1100 increment = ( roundingMode == float_round_up ) && zSig2;
1101 }
1102 }
1103 }
1104 }
1105 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1106 if ( increment ) {
1107 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1108 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1109 }
1110 else {
1111 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1112 }
1113 return packFloat128( zSign, zExp, zSig0, zSig1 );
1114
1115 }
1116
1117 /*----------------------------------------------------------------------------
1118 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1119 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1120 | returns the proper quadruple-precision floating-point value corresponding
1121 | to the abstract input. This routine is just like `roundAndPackFloat128'
1122 | except that the input significand has fewer bits and does not have to be
1123 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1124 | point exponent.
1125 *----------------------------------------------------------------------------*/
1126
1127 static float128
1128 normalizeRoundAndPackFloat128(
1129 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1130 {
1131 int8 shiftCount;
1132 uint64_t zSig2;
1133
1134 if ( zSig0 == 0 ) {
1135 zSig0 = zSig1;
1136 zSig1 = 0;
1137 zExp -= 64;
1138 }
1139 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1140 if ( 0 <= shiftCount ) {
1141 zSig2 = 0;
1142 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1143 }
1144 else {
1145 shift128ExtraRightJamming(
1146 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1147 }
1148 zExp -= shiftCount;
1149 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1150
1151 }
1152
1153 /*----------------------------------------------------------------------------
1154 | Returns the result of converting the 32-bit two's complement integer `a'
1155 | to the single-precision floating-point format. The conversion is performed
1156 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1157 *----------------------------------------------------------------------------*/
1158
1159 float32 int32_to_float32(int32_t a STATUS_PARAM)
1160 {
1161 flag zSign;
1162
1163 if ( a == 0 ) return float32_zero;
1164 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1165 zSign = ( a < 0 );
1166 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1167
1168 }
1169
1170 /*----------------------------------------------------------------------------
1171 | Returns the result of converting the 32-bit two's complement integer `a'
1172 | to the double-precision floating-point format. The conversion is performed
1173 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1174 *----------------------------------------------------------------------------*/
1175
1176 float64 int32_to_float64(int32_t a STATUS_PARAM)
1177 {
1178 flag zSign;
1179 uint32 absA;
1180 int8 shiftCount;
1181 uint64_t zSig;
1182
1183 if ( a == 0 ) return float64_zero;
1184 zSign = ( a < 0 );
1185 absA = zSign ? - a : a;
1186 shiftCount = countLeadingZeros32( absA ) + 21;
1187 zSig = absA;
1188 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1189
1190 }
1191
1192 /*----------------------------------------------------------------------------
1193 | Returns the result of converting the 32-bit two's complement integer `a'
1194 | to the extended double-precision floating-point format. The conversion
1195 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1196 | Arithmetic.
1197 *----------------------------------------------------------------------------*/
1198
1199 floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1200 {
1201 flag zSign;
1202 uint32 absA;
1203 int8 shiftCount;
1204 uint64_t zSig;
1205
1206 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1207 zSign = ( a < 0 );
1208 absA = zSign ? - a : a;
1209 shiftCount = countLeadingZeros32( absA ) + 32;
1210 zSig = absA;
1211 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1212
1213 }
1214
1215 /*----------------------------------------------------------------------------
1216 | Returns the result of converting the 32-bit two's complement integer `a' to
1217 | the quadruple-precision floating-point format. The conversion is performed
1218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 *----------------------------------------------------------------------------*/
1220
1221 float128 int32_to_float128(int32_t a STATUS_PARAM)
1222 {
1223 flag zSign;
1224 uint32 absA;
1225 int8 shiftCount;
1226 uint64_t zSig0;
1227
1228 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1229 zSign = ( a < 0 );
1230 absA = zSign ? - a : a;
1231 shiftCount = countLeadingZeros32( absA ) + 17;
1232 zSig0 = absA;
1233 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1234
1235 }
1236
1237 /*----------------------------------------------------------------------------
1238 | Returns the result of converting the 64-bit two's complement integer `a'
1239 | to the single-precision floating-point format. The conversion is performed
1240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1241 *----------------------------------------------------------------------------*/
1242
1243 float32 int64_to_float32(int64_t a STATUS_PARAM)
1244 {
1245 flag zSign;
1246 uint64 absA;
1247 int8 shiftCount;
1248
1249 if ( a == 0 ) return float32_zero;
1250 zSign = ( a < 0 );
1251 absA = zSign ? - a : a;
1252 shiftCount = countLeadingZeros64( absA ) - 40;
1253 if ( 0 <= shiftCount ) {
1254 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1255 }
1256 else {
1257 shiftCount += 7;
1258 if ( shiftCount < 0 ) {
1259 shift64RightJamming( absA, - shiftCount, &absA );
1260 }
1261 else {
1262 absA <<= shiftCount;
1263 }
1264 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1265 }
1266
1267 }
1268
1269 float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1270 {
1271 int8 shiftCount;
1272
1273 if ( a == 0 ) return float32_zero;
1274 shiftCount = countLeadingZeros64( a ) - 40;
1275 if ( 0 <= shiftCount ) {
1276 return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1277 }
1278 else {
1279 shiftCount += 7;
1280 if ( shiftCount < 0 ) {
1281 shift64RightJamming( a, - shiftCount, &a );
1282 }
1283 else {
1284 a <<= shiftCount;
1285 }
1286 return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1287 }
1288 }
1289
1290 /*----------------------------------------------------------------------------
1291 | Returns the result of converting the 64-bit two's complement integer `a'
1292 | to the double-precision floating-point format. The conversion is performed
1293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1294 *----------------------------------------------------------------------------*/
1295
1296 float64 int64_to_float64(int64_t a STATUS_PARAM)
1297 {
1298 flag zSign;
1299
1300 if ( a == 0 ) return float64_zero;
1301 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1302 return packFloat64( 1, 0x43E, 0 );
1303 }
1304 zSign = ( a < 0 );
1305 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1306
1307 }
1308
1309 float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1310 {
1311 int exp = 0x43C;
1312
1313 if (a == 0) {
1314 return float64_zero;
1315 }
1316 if ((int64_t)a < 0) {
1317 shift64RightJamming(a, 1, &a);
1318 exp += 1;
1319 }
1320 return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1321 }
1322
1323 /*----------------------------------------------------------------------------
1324 | Returns the result of converting the 64-bit two's complement integer `a'
1325 | to the extended double-precision floating-point format. The conversion
1326 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1327 | Arithmetic.
1328 *----------------------------------------------------------------------------*/
1329
1330 floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1331 {
1332 flag zSign;
1333 uint64 absA;
1334 int8 shiftCount;
1335
1336 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1337 zSign = ( a < 0 );
1338 absA = zSign ? - a : a;
1339 shiftCount = countLeadingZeros64( absA );
1340 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1341
1342 }
1343
1344 /*----------------------------------------------------------------------------
1345 | Returns the result of converting the 64-bit two's complement integer `a' to
1346 | the quadruple-precision floating-point format. The conversion is performed
1347 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1348 *----------------------------------------------------------------------------*/
1349
1350 float128 int64_to_float128(int64_t a STATUS_PARAM)
1351 {
1352 flag zSign;
1353 uint64 absA;
1354 int8 shiftCount;
1355 int32 zExp;
1356 uint64_t zSig0, zSig1;
1357
1358 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1359 zSign = ( a < 0 );
1360 absA = zSign ? - a : a;
1361 shiftCount = countLeadingZeros64( absA ) + 49;
1362 zExp = 0x406E - shiftCount;
1363 if ( 64 <= shiftCount ) {
1364 zSig1 = 0;
1365 zSig0 = absA;
1366 shiftCount -= 64;
1367 }
1368 else {
1369 zSig1 = absA;
1370 zSig0 = 0;
1371 }
1372 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1373 return packFloat128( zSign, zExp, zSig0, zSig1 );
1374
1375 }
1376
1377 float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1378 {
1379 if (a == 0) {
1380 return float128_zero;
1381 }
1382 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1383 }
1384
1385 /*----------------------------------------------------------------------------
1386 | Returns the result of converting the single-precision floating-point value
1387 | `a' to the 32-bit two's complement integer format. The conversion is
1388 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1389 | Arithmetic---which means in particular that the conversion is rounded
1390 | according to the current rounding mode. If `a' is a NaN, the largest
1391 | positive integer is returned. Otherwise, if the conversion overflows, the
1392 | largest integer with the same sign as `a' is returned.
1393 *----------------------------------------------------------------------------*/
1394
1395 int32 float32_to_int32( float32 a STATUS_PARAM )
1396 {
1397 flag aSign;
1398 int_fast16_t aExp, shiftCount;
1399 uint32_t aSig;
1400 uint64_t aSig64;
1401
1402 a = float32_squash_input_denormal(a STATUS_VAR);
1403 aSig = extractFloat32Frac( a );
1404 aExp = extractFloat32Exp( a );
1405 aSign = extractFloat32Sign( a );
1406 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1407 if ( aExp ) aSig |= 0x00800000;
1408 shiftCount = 0xAF - aExp;
1409 aSig64 = aSig;
1410 aSig64 <<= 32;
1411 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1412 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1413
1414 }
1415
1416 /*----------------------------------------------------------------------------
1417 | Returns the result of converting the single-precision floating-point value
1418 | `a' to the 32-bit two's complement integer format. The conversion is
1419 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1420 | Arithmetic, except that the conversion is always rounded toward zero.
1421 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1422 | the conversion overflows, the largest integer with the same sign as `a' is
1423 | returned.
1424 *----------------------------------------------------------------------------*/
1425
1426 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1427 {
1428 flag aSign;
1429 int_fast16_t aExp, shiftCount;
1430 uint32_t aSig;
1431 int32_t z;
1432 a = float32_squash_input_denormal(a STATUS_VAR);
1433
1434 aSig = extractFloat32Frac( a );
1435 aExp = extractFloat32Exp( a );
1436 aSign = extractFloat32Sign( a );
1437 shiftCount = aExp - 0x9E;
1438 if ( 0 <= shiftCount ) {
1439 if ( float32_val(a) != 0xCF000000 ) {
1440 float_raise( float_flag_invalid STATUS_VAR);
1441 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1442 }
1443 return (int32_t) 0x80000000;
1444 }
1445 else if ( aExp <= 0x7E ) {
1446 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1447 return 0;
1448 }
1449 aSig = ( aSig | 0x00800000 )<<8;
1450 z = aSig>>( - shiftCount );
1451 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1452 STATUS(float_exception_flags) |= float_flag_inexact;
1453 }
1454 if ( aSign ) z = - z;
1455 return z;
1456
1457 }
1458
1459 /*----------------------------------------------------------------------------
1460 | Returns the result of converting the single-precision floating-point value
1461 | `a' to the 16-bit two's complement integer format. The conversion is
1462 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1463 | Arithmetic, except that the conversion is always rounded toward zero.
1464 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1465 | the conversion overflows, the largest integer with the same sign as `a' is
1466 | returned.
1467 *----------------------------------------------------------------------------*/
1468
1469 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1470 {
1471 flag aSign;
1472 int_fast16_t aExp, shiftCount;
1473 uint32_t aSig;
1474 int32 z;
1475
1476 aSig = extractFloat32Frac( a );
1477 aExp = extractFloat32Exp( a );
1478 aSign = extractFloat32Sign( a );
1479 shiftCount = aExp - 0x8E;
1480 if ( 0 <= shiftCount ) {
1481 if ( float32_val(a) != 0xC7000000 ) {
1482 float_raise( float_flag_invalid STATUS_VAR);
1483 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1484 return 0x7FFF;
1485 }
1486 }
1487 return (int32_t) 0xffff8000;
1488 }
1489 else if ( aExp <= 0x7E ) {
1490 if ( aExp | aSig ) {
1491 STATUS(float_exception_flags) |= float_flag_inexact;
1492 }
1493 return 0;
1494 }
1495 shiftCount -= 0x10;
1496 aSig = ( aSig | 0x00800000 )<<8;
1497 z = aSig>>( - shiftCount );
1498 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1499 STATUS(float_exception_flags) |= float_flag_inexact;
1500 }
1501 if ( aSign ) {
1502 z = - z;
1503 }
1504 return z;
1505
1506 }
1507
1508 /*----------------------------------------------------------------------------
1509 | Returns the result of converting the single-precision floating-point value
1510 | `a' to the 64-bit two's complement integer format. The conversion is
1511 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1512 | Arithmetic---which means in particular that the conversion is rounded
1513 | according to the current rounding mode. If `a' is a NaN, the largest
1514 | positive integer is returned. Otherwise, if the conversion overflows, the
1515 | largest integer with the same sign as `a' is returned.
1516 *----------------------------------------------------------------------------*/
1517
1518 int64 float32_to_int64( float32 a STATUS_PARAM )
1519 {
1520 flag aSign;
1521 int_fast16_t aExp, shiftCount;
1522 uint32_t aSig;
1523 uint64_t aSig64, aSigExtra;
1524 a = float32_squash_input_denormal(a STATUS_VAR);
1525
1526 aSig = extractFloat32Frac( a );
1527 aExp = extractFloat32Exp( a );
1528 aSign = extractFloat32Sign( a );
1529 shiftCount = 0xBE - aExp;
1530 if ( shiftCount < 0 ) {
1531 float_raise( float_flag_invalid STATUS_VAR);
1532 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1533 return LIT64( 0x7FFFFFFFFFFFFFFF );
1534 }
1535 return (int64_t) LIT64( 0x8000000000000000 );
1536 }
1537 if ( aExp ) aSig |= 0x00800000;
1538 aSig64 = aSig;
1539 aSig64 <<= 40;
1540 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1541 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1542
1543 }
1544
1545 /*----------------------------------------------------------------------------
1546 | Returns the result of converting the single-precision floating-point value
1547 | `a' to the 64-bit unsigned integer format. The conversion is
1548 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1549 | Arithmetic---which means in particular that the conversion is rounded
1550 | according to the current rounding mode. If `a' is a NaN, the largest
1551 | unsigned integer is returned. Otherwise, if the conversion overflows, the
| largest unsigned integer is returned.  If `a' is negative, the result
1553 | is rounded and zero is returned; values that do not round to zero will
1554 | raise the inexact exception flag.
1555 *----------------------------------------------------------------------------*/
1556
1557 uint64 float32_to_uint64(float32 a STATUS_PARAM)
1558 {
1559 flag aSign;
1560 int_fast16_t aExp, shiftCount;
1561 uint32_t aSig;
1562 uint64_t aSig64, aSigExtra;
1563 a = float32_squash_input_denormal(a STATUS_VAR);
1564
1565 aSig = extractFloat32Frac(a);
1566 aExp = extractFloat32Exp(a);
1567 aSign = extractFloat32Sign(a);
1568 if ((aSign) && (aExp > 126)) {
1569 float_raise(float_flag_invalid STATUS_VAR);
1570 if (float32_is_any_nan(a)) {
1571 return LIT64(0xFFFFFFFFFFFFFFFF);
1572 } else {
1573 return 0;
1574 }
1575 }
1576 shiftCount = 0xBE - aExp;
1577 if (aExp) {
1578 aSig |= 0x00800000;
1579 }
1580 if (shiftCount < 0) {
1581 float_raise(float_flag_invalid STATUS_VAR);
1582 return LIT64(0xFFFFFFFFFFFFFFFF);
1583 }
1584
1585 aSig64 = aSig;
1586 aSig64 <<= 40;
1587 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1588 return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1589 }
1590
1591 /*----------------------------------------------------------------------------
1592 | Returns the result of converting the single-precision floating-point value
1593 | `a' to the 64-bit two's complement integer format. The conversion is
1594 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1595 | Arithmetic, except that the conversion is always rounded toward zero. If
1596 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1597 | conversion overflows, the largest integer with the same sign as `a' is
1598 | returned.
1599 *----------------------------------------------------------------------------*/
1600
1601 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1602 {
1603 flag aSign;
1604 int_fast16_t aExp, shiftCount;
1605 uint32_t aSig;
1606 uint64_t aSig64;
1607 int64 z;
1608 a = float32_squash_input_denormal(a STATUS_VAR);
1609
1610 aSig = extractFloat32Frac( a );
1611 aExp = extractFloat32Exp( a );
1612 aSign = extractFloat32Sign( a );
1613 shiftCount = aExp - 0xBE;
1614 if ( 0 <= shiftCount ) {
1615 if ( float32_val(a) != 0xDF000000 ) {
1616 float_raise( float_flag_invalid STATUS_VAR);
1617 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1618 return LIT64( 0x7FFFFFFFFFFFFFFF );
1619 }
1620 }
1621 return (int64_t) LIT64( 0x8000000000000000 );
1622 }
1623 else if ( aExp <= 0x7E ) {
1624 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1625 return 0;
1626 }
1627 aSig64 = aSig | 0x00800000;
1628 aSig64 <<= 40;
1629 z = aSig64>>( - shiftCount );
1630 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1631 STATUS(float_exception_flags) |= float_flag_inexact;
1632 }
1633 if ( aSign ) z = - z;
1634 return z;
1635
1636 }
1637
1638 /*----------------------------------------------------------------------------
1639 | Returns the result of converting the single-precision floating-point value
1640 | `a' to the double-precision floating-point format. The conversion is
1641 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1642 | Arithmetic.
1643 *----------------------------------------------------------------------------*/
1644
1645 float64 float32_to_float64( float32 a STATUS_PARAM )
1646 {
1647 flag aSign;
1648 int_fast16_t aExp;
1649 uint32_t aSig;
1650 a = float32_squash_input_denormal(a STATUS_VAR);
1651
1652 aSig = extractFloat32Frac( a );
1653 aExp = extractFloat32Exp( a );
1654 aSign = extractFloat32Sign( a );
1655 if ( aExp == 0xFF ) {
1656 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1657 return packFloat64( aSign, 0x7FF, 0 );
1658 }
1659 if ( aExp == 0 ) {
1660 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1661 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1662 --aExp;
1663 }
1664 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1665
1666 }
1667
1668 /*----------------------------------------------------------------------------
1669 | Returns the result of converting the single-precision floating-point value
1670 | `a' to the extended double-precision floating-point format. The conversion
1671 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1672 | Arithmetic.
1673 *----------------------------------------------------------------------------*/
1674
1675 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1676 {
1677 flag aSign;
1678 int_fast16_t aExp;
1679 uint32_t aSig;
1680
1681 a = float32_squash_input_denormal(a STATUS_VAR);
1682 aSig = extractFloat32Frac( a );
1683 aExp = extractFloat32Exp( a );
1684 aSign = extractFloat32Sign( a );
1685 if ( aExp == 0xFF ) {
1686 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1687 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1688 }
1689 if ( aExp == 0 ) {
1690 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1691 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1692 }
1693 aSig |= 0x00800000;
1694 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1695
1696 }
1697
1698 /*----------------------------------------------------------------------------
1699 | Returns the result of converting the single-precision floating-point value
1700 | `a' to the double-precision floating-point format. The conversion is
1701 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1702 | Arithmetic.
1703 *----------------------------------------------------------------------------*/
1704
1705 float128 float32_to_float128( float32 a STATUS_PARAM )
1706 {
1707 flag aSign;
1708 int_fast16_t aExp;
1709 uint32_t aSig;
1710
1711 a = float32_squash_input_denormal(a STATUS_VAR);
1712 aSig = extractFloat32Frac( a );
1713 aExp = extractFloat32Exp( a );
1714 aSign = extractFloat32Sign( a );
1715 if ( aExp == 0xFF ) {
1716 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1717 return packFloat128( aSign, 0x7FFF, 0, 0 );
1718 }
1719 if ( aExp == 0 ) {
1720 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1721 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1722 --aExp;
1723 }
1724 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1725
1726 }
1727
1728 /*----------------------------------------------------------------------------
1729 | Rounds the single-precision floating-point value `a' to an integer, and
1730 | returns the result as a single-precision floating-point value. The
1731 | operation is performed according to the IEC/IEEE Standard for Binary
1732 | Floating-Point Arithmetic.
1733 *----------------------------------------------------------------------------*/
1734
1735 float32 float32_round_to_int( float32 a STATUS_PARAM)
1736 {
1737 flag aSign;
1738 int_fast16_t aExp;
1739 uint32_t lastBitMask, roundBitsMask;
1740 int8 roundingMode;
1741 uint32_t z;
1742 a = float32_squash_input_denormal(a STATUS_VAR);
1743
1744 aExp = extractFloat32Exp( a );
1745 if ( 0x96 <= aExp ) {
1746 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1747 return propagateFloat32NaN( a, a STATUS_VAR );
1748 }
1749 return a;
1750 }
1751 if ( aExp <= 0x7E ) {
1752 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1753 STATUS(float_exception_flags) |= float_flag_inexact;
1754 aSign = extractFloat32Sign( a );
1755 switch ( STATUS(float_rounding_mode) ) {
1756 case float_round_nearest_even:
1757 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1758 return packFloat32( aSign, 0x7F, 0 );
1759 }
1760 break;
1761 case float_round_down:
1762 return make_float32(aSign ? 0xBF800000 : 0);
1763 case float_round_up:
1764 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1765 }
1766 return packFloat32( aSign, 0, 0 );
1767 }
1768 lastBitMask = 1;
1769 lastBitMask <<= 0x96 - aExp;
1770 roundBitsMask = lastBitMask - 1;
1771 z = float32_val(a);
1772 roundingMode = STATUS(float_rounding_mode);
1773 if ( roundingMode == float_round_nearest_even ) {
1774 z += lastBitMask>>1;
1775 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1776 }
1777 else if ( roundingMode != float_round_to_zero ) {
1778 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1779 z += roundBitsMask;
1780 }
1781 }
1782 z &= ~ roundBitsMask;
1783 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1784 return make_float32(z);
1785
1786 }
1787
1788 /*----------------------------------------------------------------------------
1789 | Returns the result of adding the absolute values of the single-precision
1790 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1791 | before being returned. `zSign' is ignored if the result is a NaN.
1792 | The addition is performed according to the IEC/IEEE Standard for Binary
1793 | Floating-Point Arithmetic.
1794 *----------------------------------------------------------------------------*/
1795
1796 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1797 {
1798 int_fast16_t aExp, bExp, zExp;
1799 uint32_t aSig, bSig, zSig;
1800 int_fast16_t expDiff;
1801
1802 aSig = extractFloat32Frac( a );
1803 aExp = extractFloat32Exp( a );
1804 bSig = extractFloat32Frac( b );
1805 bExp = extractFloat32Exp( b );
1806 expDiff = aExp - bExp;
1807 aSig <<= 6;
1808 bSig <<= 6;
1809 if ( 0 < expDiff ) {
1810 if ( aExp == 0xFF ) {
1811 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1812 return a;
1813 }
1814 if ( bExp == 0 ) {
1815 --expDiff;
1816 }
1817 else {
1818 bSig |= 0x20000000;
1819 }
1820 shift32RightJamming( bSig, expDiff, &bSig );
1821 zExp = aExp;
1822 }
1823 else if ( expDiff < 0 ) {
1824 if ( bExp == 0xFF ) {
1825 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1826 return packFloat32( zSign, 0xFF, 0 );
1827 }
1828 if ( aExp == 0 ) {
1829 ++expDiff;
1830 }
1831 else {
1832 aSig |= 0x20000000;
1833 }
1834 shift32RightJamming( aSig, - expDiff, &aSig );
1835 zExp = bExp;
1836 }
1837 else {
1838 if ( aExp == 0xFF ) {
1839 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1840 return a;
1841 }
1842 if ( aExp == 0 ) {
1843 if (STATUS(flush_to_zero)) {
1844 if (aSig | bSig) {
1845 float_raise(float_flag_output_denormal STATUS_VAR);
1846 }
1847 return packFloat32(zSign, 0, 0);
1848 }
1849 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1850 }
1851 zSig = 0x40000000 + aSig + bSig;
1852 zExp = aExp;
1853 goto roundAndPack;
1854 }
1855 aSig |= 0x20000000;
1856 zSig = ( aSig + bSig )<<1;
1857 --zExp;
1858 if ( (int32_t) zSig < 0 ) {
1859 zSig = aSig + bSig;
1860 ++zExp;
1861 }
1862 roundAndPack:
1863 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1864
1865 }
1866
1867 /*----------------------------------------------------------------------------
1868 | Returns the result of subtracting the absolute values of the single-
1869 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1870 | difference is negated before being returned. `zSign' is ignored if the
1871 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1872 | Standard for Binary Floating-Point Arithmetic.
1873 *----------------------------------------------------------------------------*/
1874
1875 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1876 {
1877 int_fast16_t aExp, bExp, zExp;
1878 uint32_t aSig, bSig, zSig;
1879 int_fast16_t expDiff;
1880
1881 aSig = extractFloat32Frac( a );
1882 aExp = extractFloat32Exp( a );
1883 bSig = extractFloat32Frac( b );
1884 bExp = extractFloat32Exp( b );
1885 expDiff = aExp - bExp;
1886 aSig <<= 7;
1887 bSig <<= 7;
1888 if ( 0 < expDiff ) goto aExpBigger;
1889 if ( expDiff < 0 ) goto bExpBigger;
1890 if ( aExp == 0xFF ) {
1891 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1892 float_raise( float_flag_invalid STATUS_VAR);
1893 return float32_default_nan;
1894 }
1895 if ( aExp == 0 ) {
1896 aExp = 1;
1897 bExp = 1;
1898 }
1899 if ( bSig < aSig ) goto aBigger;
1900 if ( aSig < bSig ) goto bBigger;
1901 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1902 bExpBigger:
1903 if ( bExp == 0xFF ) {
1904 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1905 return packFloat32( zSign ^ 1, 0xFF, 0 );
1906 }
1907 if ( aExp == 0 ) {
1908 ++expDiff;
1909 }
1910 else {
1911 aSig |= 0x40000000;
1912 }
1913 shift32RightJamming( aSig, - expDiff, &aSig );
1914 bSig |= 0x40000000;
1915 bBigger:
1916 zSig = bSig - aSig;
1917 zExp = bExp;
1918 zSign ^= 1;
1919 goto normalizeRoundAndPack;
1920 aExpBigger:
1921 if ( aExp == 0xFF ) {
1922 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1923 return a;
1924 }
1925 if ( bExp == 0 ) {
1926 --expDiff;
1927 }
1928 else {
1929 bSig |= 0x40000000;
1930 }
1931 shift32RightJamming( bSig, expDiff, &bSig );
1932 aSig |= 0x40000000;
1933 aBigger:
1934 zSig = aSig - bSig;
1935 zExp = aExp;
1936 normalizeRoundAndPack:
1937 --zExp;
1938 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1939
1940 }
1941
1942 /*----------------------------------------------------------------------------
1943 | Returns the result of adding the single-precision floating-point values `a'
1944 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1945 | Binary Floating-Point Arithmetic.
1946 *----------------------------------------------------------------------------*/
1947
1948 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1949 {
1950 flag aSign, bSign;
1951 a = float32_squash_input_denormal(a STATUS_VAR);
1952 b = float32_squash_input_denormal(b STATUS_VAR);
1953
1954 aSign = extractFloat32Sign( a );
1955 bSign = extractFloat32Sign( b );
1956 if ( aSign == bSign ) {
1957 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1958 }
1959 else {
1960 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1961 }
1962
1963 }
1964
1965 /*----------------------------------------------------------------------------
1966 | Returns the result of subtracting the single-precision floating-point values
1967 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1968 | for Binary Floating-Point Arithmetic.
1969 *----------------------------------------------------------------------------*/
1970
1971 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1972 {
1973 flag aSign, bSign;
1974 a = float32_squash_input_denormal(a STATUS_VAR);
1975 b = float32_squash_input_denormal(b STATUS_VAR);
1976
1977 aSign = extractFloat32Sign( a );
1978 bSign = extractFloat32Sign( b );
1979 if ( aSign == bSign ) {
1980 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1981 }
1982 else {
1983 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1984 }
1985
1986 }
1987
1988 /*----------------------------------------------------------------------------
1989 | Returns the result of multiplying the single-precision floating-point values
1990 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1991 | for Binary Floating-Point Arithmetic.
1992 *----------------------------------------------------------------------------*/
1993
1994 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1995 {
1996 flag aSign, bSign, zSign;
1997 int_fast16_t aExp, bExp, zExp;
1998 uint32_t aSig, bSig;
1999 uint64_t zSig64;
2000 uint32_t zSig;
2001
2002 a = float32_squash_input_denormal(a STATUS_VAR);
2003 b = float32_squash_input_denormal(b STATUS_VAR);
2004
2005 aSig = extractFloat32Frac( a );
2006 aExp = extractFloat32Exp( a );
2007 aSign = extractFloat32Sign( a );
2008 bSig = extractFloat32Frac( b );
2009 bExp = extractFloat32Exp( b );
2010 bSign = extractFloat32Sign( b );
2011 zSign = aSign ^ bSign;
2012 if ( aExp == 0xFF ) {
2013 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2014 return propagateFloat32NaN( a, b STATUS_VAR );
2015 }
2016 if ( ( bExp | bSig ) == 0 ) {
2017 float_raise( float_flag_invalid STATUS_VAR);
2018 return float32_default_nan;
2019 }
2020 return packFloat32( zSign, 0xFF, 0 );
2021 }
2022 if ( bExp == 0xFF ) {
2023 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2024 if ( ( aExp | aSig ) == 0 ) {
2025 float_raise( float_flag_invalid STATUS_VAR);
2026 return float32_default_nan;
2027 }
2028 return packFloat32( zSign, 0xFF, 0 );
2029 }
2030 if ( aExp == 0 ) {
2031 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2032 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2033 }
2034 if ( bExp == 0 ) {
2035 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2036 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2037 }
2038 zExp = aExp + bExp - 0x7F;
2039 aSig = ( aSig | 0x00800000 )<<7;
2040 bSig = ( bSig | 0x00800000 )<<8;
2041 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2042 zSig = zSig64;
2043 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2044 zSig <<= 1;
2045 --zExp;
2046 }
2047 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2048
2049 }
2050
2051 /*----------------------------------------------------------------------------
2052 | Returns the result of dividing the single-precision floating-point value `a'
2053 | by the corresponding value `b'. The operation is performed according to the
2054 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2055 *----------------------------------------------------------------------------*/
2056
2057 float32 float32_div( float32 a, float32 b STATUS_PARAM )
2058 {
2059 flag aSign, bSign, zSign;
2060 int_fast16_t aExp, bExp, zExp;
2061 uint32_t aSig, bSig, zSig;
2062 a = float32_squash_input_denormal(a STATUS_VAR);
2063 b = float32_squash_input_denormal(b STATUS_VAR);
2064
2065 aSig = extractFloat32Frac( a );
2066 aExp = extractFloat32Exp( a );
2067 aSign = extractFloat32Sign( a );
2068 bSig = extractFloat32Frac( b );
2069 bExp = extractFloat32Exp( b );
2070 bSign = extractFloat32Sign( b );
2071 zSign = aSign ^ bSign;
2072 if ( aExp == 0xFF ) {
2073 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2074 if ( bExp == 0xFF ) {
2075 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2076 float_raise( float_flag_invalid STATUS_VAR);
2077 return float32_default_nan;
2078 }
2079 return packFloat32( zSign, 0xFF, 0 );
2080 }
2081 if ( bExp == 0xFF ) {
2082 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2083 return packFloat32( zSign, 0, 0 );
2084 }
2085 if ( bExp == 0 ) {
2086 if ( bSig == 0 ) {
2087 if ( ( aExp | aSig ) == 0 ) {
2088 float_raise( float_flag_invalid STATUS_VAR);
2089 return float32_default_nan;
2090 }
2091 float_raise( float_flag_divbyzero STATUS_VAR);
2092 return packFloat32( zSign, 0xFF, 0 );
2093 }
2094 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2095 }
2096 if ( aExp == 0 ) {
2097 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2098 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2099 }
2100 zExp = aExp - bExp + 0x7D;
2101 aSig = ( aSig | 0x00800000 )<<7;
2102 bSig = ( bSig | 0x00800000 )<<8;
2103 if ( bSig <= ( aSig + aSig ) ) {
2104 aSig >>= 1;
2105 ++zExp;
2106 }
2107 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2108 if ( ( zSig & 0x3F ) == 0 ) {
2109 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2110 }
2111 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2112
2113 }
2114
2115 /*----------------------------------------------------------------------------
2116 | Returns the remainder of the single-precision floating-point value `a'
2117 | with respect to the corresponding value `b'. The operation is performed
2118 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2119 *----------------------------------------------------------------------------*/
2120
2121 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2122 {
2123 flag aSign, zSign;
2124 int_fast16_t aExp, bExp, expDiff;
2125 uint32_t aSig, bSig;
2126 uint32_t q;
2127 uint64_t aSig64, bSig64, q64;
2128 uint32_t alternateASig;
2129 int32_t sigMean;
2130 a = float32_squash_input_denormal(a STATUS_VAR);
2131 b = float32_squash_input_denormal(b STATUS_VAR);
2132
2133 aSig = extractFloat32Frac( a );
2134 aExp = extractFloat32Exp( a );
2135 aSign = extractFloat32Sign( a );
2136 bSig = extractFloat32Frac( b );
2137 bExp = extractFloat32Exp( b );
2138 if ( aExp == 0xFF ) {
2139 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2140 return propagateFloat32NaN( a, b STATUS_VAR );
2141 }
2142 float_raise( float_flag_invalid STATUS_VAR);
2143 return float32_default_nan;
2144 }
2145 if ( bExp == 0xFF ) {
2146 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2147 return a;
2148 }
2149 if ( bExp == 0 ) {
2150 if ( bSig == 0 ) {
2151 float_raise( float_flag_invalid STATUS_VAR);
2152 return float32_default_nan;
2153 }
2154 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2155 }
2156 if ( aExp == 0 ) {
2157 if ( aSig == 0 ) return a;
2158 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2159 }
2160 expDiff = aExp - bExp;
2161 aSig |= 0x00800000;
2162 bSig |= 0x00800000;
2163 if ( expDiff < 32 ) {
2164 aSig <<= 8;
2165 bSig <<= 8;
2166 if ( expDiff < 0 ) {
2167 if ( expDiff < -1 ) return a;
2168 aSig >>= 1;
2169 }
2170 q = ( bSig <= aSig );
2171 if ( q ) aSig -= bSig;
2172 if ( 0 < expDiff ) {
2173 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2174 q >>= 32 - expDiff;
2175 bSig >>= 2;
2176 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2177 }
2178 else {
2179 aSig >>= 2;
2180 bSig >>= 2;
2181 }
2182 }
2183 else {
2184 if ( bSig <= aSig ) aSig -= bSig;
2185 aSig64 = ( (uint64_t) aSig )<<40;
2186 bSig64 = ( (uint64_t) bSig )<<40;
2187 expDiff -= 64;
2188 while ( 0 < expDiff ) {
2189 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2190 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2191 aSig64 = - ( ( bSig * q64 )<<38 );
2192 expDiff -= 62;
2193 }
2194 expDiff += 64;
2195 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2196 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2197 q = q64>>( 64 - expDiff );
2198 bSig <<= 6;
2199 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2200 }
2201 do {
2202 alternateASig = aSig;
2203 ++q;
2204 aSig -= bSig;
2205 } while ( 0 <= (int32_t) aSig );
2206 sigMean = aSig + alternateASig;
2207 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2208 aSig = alternateASig;
2209 }
2210 zSign = ( (int32_t) aSig < 0 );
2211 if ( zSign ) aSig = - aSig;
2212 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2213
2214 }
2215
2216 /*----------------------------------------------------------------------------
2217 | Returns the result of multiplying the single-precision floating-point values
2218 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2219 | multiplication. The operation is performed according to the IEC/IEEE
2220 | Standard for Binary Floating-Point Arithmetic 754-2008.
2221 | The flags argument allows the caller to select negation of the
2222 | addend, the intermediate product, or the final result. (The difference
2223 | between this and having the caller do a separate negation is that negating
2224 | externally will flip the sign bit on NaNs.)
2225 *----------------------------------------------------------------------------*/
2226
2227 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2228 {
2229 flag aSign, bSign, cSign, zSign;
2230 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2231 uint32_t aSig, bSig, cSig;
2232 flag pInf, pZero, pSign;
2233 uint64_t pSig64, cSig64, zSig64;
2234 uint32_t pSig;
2235 int shiftcount;
2236 flag signflip, infzero;
2237
2238 a = float32_squash_input_denormal(a STATUS_VAR);
2239 b = float32_squash_input_denormal(b STATUS_VAR);
2240 c = float32_squash_input_denormal(c STATUS_VAR);
2241 aSig = extractFloat32Frac(a);
2242 aExp = extractFloat32Exp(a);
2243 aSign = extractFloat32Sign(a);
2244 bSig = extractFloat32Frac(b);
2245 bExp = extractFloat32Exp(b);
2246 bSign = extractFloat32Sign(b);
2247 cSig = extractFloat32Frac(c);
2248 cExp = extractFloat32Exp(c);
2249 cSign = extractFloat32Sign(c);
2250
2251 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2252 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2253
2254 /* It is implementation-defined whether the cases of (0,inf,qnan)
2255 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2256 * they return if they do), so we have to hand this information
2257 * off to the target-specific pick-a-NaN routine.
2258 */
2259 if (((aExp == 0xff) && aSig) ||
2260 ((bExp == 0xff) && bSig) ||
2261 ((cExp == 0xff) && cSig)) {
2262 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2263 }
2264
2265 if (infzero) {
2266 float_raise(float_flag_invalid STATUS_VAR);
2267 return float32_default_nan;
2268 }
2269
2270 if (flags & float_muladd_negate_c) {
2271 cSign ^= 1;
2272 }
2273
2274 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2275
2276 /* Work out the sign and type of the product */
2277 pSign = aSign ^ bSign;
2278 if (flags & float_muladd_negate_product) {
2279 pSign ^= 1;
2280 }
2281 pInf = (aExp == 0xff) || (bExp == 0xff);
2282 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2283
2284 if (cExp == 0xff) {
2285 if (pInf && (pSign ^ cSign)) {
2286 /* addition of opposite-signed infinities => InvalidOperation */
2287 float_raise(float_flag_invalid STATUS_VAR);
2288 return float32_default_nan;
2289 }
2290 /* Otherwise generate an infinity of the same sign */
2291 return packFloat32(cSign ^ signflip, 0xff, 0);
2292 }
2293
2294 if (pInf) {
2295 return packFloat32(pSign ^ signflip, 0xff, 0);
2296 }
2297
2298 if (pZero) {
2299 if (cExp == 0) {
2300 if (cSig == 0) {
2301 /* Adding two exact zeroes */
2302 if (pSign == cSign) {
2303 zSign = pSign;
2304 } else if (STATUS(float_rounding_mode) == float_round_down) {
2305 zSign = 1;
2306 } else {
2307 zSign = 0;
2308 }
2309 return packFloat32(zSign ^ signflip, 0, 0);
2310 }
2311 /* Exact zero plus a denorm */
2312 if (STATUS(flush_to_zero)) {
2313 float_raise(float_flag_output_denormal STATUS_VAR);
2314 return packFloat32(cSign ^ signflip, 0, 0);
2315 }
2316 }
2317 /* Zero plus something non-zero : just return the something */
2318 return packFloat32(cSign ^ signflip, cExp, cSig);
2319 }
2320
2321 if (aExp == 0) {
2322 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2323 }
2324 if (bExp == 0) {
2325 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2326 }
2327
2328 /* Calculate the actual result a * b + c */
2329
2330 /* Multiply first; this is easy. */
2331 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2332 * because we want the true exponent, not the "one-less-than"
2333 * flavour that roundAndPackFloat32() takes.
2334 */
2335 pExp = aExp + bExp - 0x7e;
2336 aSig = (aSig | 0x00800000) << 7;
2337 bSig = (bSig | 0x00800000) << 8;
2338 pSig64 = (uint64_t)aSig * bSig;
2339 if ((int64_t)(pSig64 << 1) >= 0) {
2340 pSig64 <<= 1;
2341 pExp--;
2342 }
2343
2344 zSign = pSign ^ signflip;
2345
2346 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2347 * position 62.
2348 */
2349 if (cExp == 0) {
2350 if (!cSig) {
2351 /* Throw out the special case of c being an exact zero now */
2352 shift64RightJamming(pSig64, 32, &pSig64);
2353 pSig = pSig64;
2354 return roundAndPackFloat32(zSign, pExp - 1,
2355 pSig STATUS_VAR);
2356 }
2357 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2358 }
2359
2360 cSig64 = (uint64_t)cSig << (62 - 23);
2361 cSig64 |= LIT64(0x4000000000000000);
2362 expDiff = pExp - cExp;
2363
2364 if (pSign == cSign) {
2365 /* Addition */
2366 if (expDiff > 0) {
2367 /* scale c to match p */
2368 shift64RightJamming(cSig64, expDiff, &cSig64);
2369 zExp = pExp;
2370 } else if (expDiff < 0) {
2371 /* scale p to match c */
2372 shift64RightJamming(pSig64, -expDiff, &pSig64);
2373 zExp = cExp;
2374 } else {
2375 /* no scaling needed */
2376 zExp = cExp;
2377 }
2378 /* Add significands and make sure explicit bit ends up in posn 62 */
2379 zSig64 = pSig64 + cSig64;
2380 if ((int64_t)zSig64 < 0) {
2381 shift64RightJamming(zSig64, 1, &zSig64);
2382 } else {
2383 zExp--;
2384 }
2385 } else {
2386 /* Subtraction */
2387 if (expDiff > 0) {
2388 shift64RightJamming(cSig64, expDiff, &cSig64);
2389 zSig64 = pSig64 - cSig64;
2390 zExp = pExp;
2391 } else if (expDiff < 0) {
2392 shift64RightJamming(pSig64, -expDiff, &pSig64);
2393 zSig64 = cSig64 - pSig64;
2394 zExp = cExp;
2395 zSign ^= 1;
2396 } else {
2397 zExp = pExp;
2398 if (cSig64 < pSig64) {
2399 zSig64 = pSig64 - cSig64;
2400 } else if (pSig64 < cSig64) {
2401 zSig64 = cSig64 - pSig64;
2402 zSign ^= 1;
2403 } else {
2404 /* Exact zero */
2405 zSign = signflip;
2406 if (STATUS(float_rounding_mode) == float_round_down) {
2407 zSign ^= 1;
2408 }
2409 return packFloat32(zSign, 0, 0);
2410 }
2411 }
2412 --zExp;
2413 /* Normalize to put the explicit bit back into bit 62. */
2414 shiftcount = countLeadingZeros64(zSig64) - 1;
2415 zSig64 <<= shiftcount;
2416 zExp -= shiftcount;
2417 }
2418 shift64RightJamming(zSig64, 32, &zSig64);
2419 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2420 }
2421
2422
2423 /*----------------------------------------------------------------------------
2424 | Returns the square root of the single-precision floating-point value `a'.
2425 | The operation is performed according to the IEC/IEEE Standard for Binary
2426 | Floating-Point Arithmetic.
2427 *----------------------------------------------------------------------------*/
2428
2429 float32 float32_sqrt( float32 a STATUS_PARAM )
2430 {
2431 flag aSign;
2432 int_fast16_t aExp, zExp;
2433 uint32_t aSig, zSig;
2434 uint64_t rem, term;
2435 a = float32_squash_input_denormal(a STATUS_VAR);
2436
2437 aSig = extractFloat32Frac( a );
2438 aExp = extractFloat32Exp( a );
2439 aSign = extractFloat32Sign( a );
2440 if ( aExp == 0xFF ) {
2441 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2442 if ( ! aSign ) return a;
2443 float_raise( float_flag_invalid STATUS_VAR);
2444 return float32_default_nan;
2445 }
2446 if ( aSign ) {
2447 if ( ( aExp | aSig ) == 0 ) return a;
2448 float_raise( float_flag_invalid STATUS_VAR);
2449 return float32_default_nan;
2450 }
2451 if ( aExp == 0 ) {
2452 if ( aSig == 0 ) return float32_zero;
2453 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2454 }
2455 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2456 aSig = ( aSig | 0x00800000 )<<8;
2457 zSig = estimateSqrt32( aExp, aSig ) + 2;
2458 if ( ( zSig & 0x7F ) <= 5 ) {
2459 if ( zSig < 2 ) {
2460 zSig = 0x7FFFFFFF;
2461 goto roundAndPack;
2462 }
2463 aSig >>= aExp & 1;
2464 term = ( (uint64_t) zSig ) * zSig;
2465 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2466 while ( (int64_t) rem < 0 ) {
2467 --zSig;
2468 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2469 }
2470 zSig |= ( rem != 0 );
2471 }
2472 shift32RightJamming( zSig, 1, &zSig );
2473 roundAndPack:
2474 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2475
2476 }
2477
2478 /*----------------------------------------------------------------------------
2479 | Returns the binary exponential of the single-precision floating-point value
2480 | `a'. The operation is performed according to the IEC/IEEE Standard for
2481 | Binary Floating-Point Arithmetic.
2482 |
2483 | Uses the following identities:
2484 |
2485 | 1. -------------------------------------------------------------------------
2486 | x x*ln(2)
2487 | 2 = e
2488 |
2489 | 2. -------------------------------------------------------------------------
2490 | 2 3 4 5 n
2491 | x x x x x x x
2492 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2493 | 1! 2! 3! 4! 5! n!
2494 *----------------------------------------------------------------------------*/
2495
2496 static const float64 float32_exp2_coefficients[15] =
2497 {
2498 const_float64( 0x3ff0000000000000ll ), /* 1 */
2499 const_float64( 0x3fe0000000000000ll ), /* 2 */
2500 const_float64( 0x3fc5555555555555ll ), /* 3 */
2501 const_float64( 0x3fa5555555555555ll ), /* 4 */
2502 const_float64( 0x3f81111111111111ll ), /* 5 */
2503 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2504 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2505 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2506 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2507 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2508 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2509 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2510 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2511 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2512 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2513 };
2514
2515 float32 float32_exp2( float32 a STATUS_PARAM )
2516 {
2517 flag aSign;
2518 int_fast16_t aExp;
2519 uint32_t aSig;
2520 float64 r, x, xn;
2521 int i;
2522 a = float32_squash_input_denormal(a STATUS_VAR);
2523
2524 aSig = extractFloat32Frac( a );
2525 aExp = extractFloat32Exp( a );
2526 aSign = extractFloat32Sign( a );
2527
2528 if ( aExp == 0xFF) {
2529 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2530 return (aSign) ? float32_zero : a;
2531 }
2532 if (aExp == 0) {
2533 if (aSig == 0) return float32_one;
2534 }
2535
2536 float_raise( float_flag_inexact STATUS_VAR);
2537
2538 /* ******************************* */
2539 /* using float64 for approximation */
2540 /* ******************************* */
2541 x = float32_to_float64(a STATUS_VAR);
2542 x = float64_mul(x, float64_ln2 STATUS_VAR);
2543
2544 xn = x;
2545 r = float64_one;
2546 for (i = 0 ; i < 15 ; i++) {
2547 float64 f;
2548
2549 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2550 r = float64_add(r, f STATUS_VAR);
2551
2552 xn = float64_mul(xn, x STATUS_VAR);
2553 }
2554
2555 return float64_to_float32(r, status);
2556 }
2557
2558 /*----------------------------------------------------------------------------
2559 | Returns the binary log of the single-precision floating-point value `a'.
2560 | The operation is performed according to the IEC/IEEE Standard for Binary
2561 | Floating-Point Arithmetic.
2562 *----------------------------------------------------------------------------*/
2563 float32 float32_log2( float32 a STATUS_PARAM )
2564 {
2565 flag aSign, zSign;
2566 int_fast16_t aExp;
2567 uint32_t aSig, zSig, i;
2568
2569 a = float32_squash_input_denormal(a STATUS_VAR);
2570 aSig = extractFloat32Frac( a );
2571 aExp = extractFloat32Exp( a );
2572 aSign = extractFloat32Sign( a );
2573
2574 if ( aExp == 0 ) {
2575 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2576 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2577 }
2578 if ( aSign ) {
2579 float_raise( float_flag_invalid STATUS_VAR);
2580 return float32_default_nan;
2581 }
2582 if ( aExp == 0xFF ) {
2583 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2584 return a;
2585 }
2586
2587 aExp -= 0x7F;
2588 aSig |= 0x00800000;
2589 zSign = aExp < 0;
2590 zSig = aExp << 23;
2591
2592 for (i = 1 << 22; i > 0; i >>= 1) {
2593 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2594 if ( aSig & 0x01000000 ) {
2595 aSig >>= 1;
2596 zSig |= i;
2597 }
2598 }
2599
2600 if ( zSign )
2601 zSig = -zSig;
2602
2603 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2604 }
2605
2606 /*----------------------------------------------------------------------------
2607 | Returns 1 if the single-precision floating-point value `a' is equal to
2608 | the corresponding value `b', and 0 otherwise. The invalid exception is
2609 | raised if either operand is a NaN. Otherwise, the comparison is performed
2610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2611 *----------------------------------------------------------------------------*/
2612
2613 int float32_eq( float32 a, float32 b STATUS_PARAM )
2614 {
2615 uint32_t av, bv;
2616 a = float32_squash_input_denormal(a STATUS_VAR);
2617 b = float32_squash_input_denormal(b STATUS_VAR);
2618
2619 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2620 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2621 ) {
2622 float_raise( float_flag_invalid STATUS_VAR);
2623 return 0;
2624 }
2625 av = float32_val(a);
2626 bv = float32_val(b);
2627 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2628 }
2629
2630 /*----------------------------------------------------------------------------
2631 | Returns 1 if the single-precision floating-point value `a' is less than
2632 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2633 | exception is raised if either operand is a NaN. The comparison is performed
2634 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2635 *----------------------------------------------------------------------------*/
2636
2637 int float32_le( float32 a, float32 b STATUS_PARAM )
2638 {
2639 flag aSign, bSign;
2640 uint32_t av, bv;
2641 a = float32_squash_input_denormal(a STATUS_VAR);
2642 b = float32_squash_input_denormal(b STATUS_VAR);
2643
2644 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2645 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2646 ) {
2647 float_raise( float_flag_invalid STATUS_VAR);
2648 return 0;
2649 }
2650 aSign = extractFloat32Sign( a );
2651 bSign = extractFloat32Sign( b );
2652 av = float32_val(a);
2653 bv = float32_val(b);
2654 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2655 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2656
2657 }
2658
2659 /*----------------------------------------------------------------------------
2660 | Returns 1 if the single-precision floating-point value `a' is less than
2661 | the corresponding value `b', and 0 otherwise. The invalid exception is
2662 | raised if either operand is a NaN. The comparison is performed according
2663 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2664 *----------------------------------------------------------------------------*/
2665
2666 int float32_lt( float32 a, float32 b STATUS_PARAM )
2667 {
2668 flag aSign, bSign;
2669 uint32_t av, bv;
2670 a = float32_squash_input_denormal(a STATUS_VAR);
2671 b = float32_squash_input_denormal(b STATUS_VAR);
2672
2673 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2674 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2675 ) {
2676 float_raise( float_flag_invalid STATUS_VAR);
2677 return 0;
2678 }
2679 aSign = extractFloat32Sign( a );
2680 bSign = extractFloat32Sign( b );
2681 av = float32_val(a);
2682 bv = float32_val(b);
2683 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2684 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2685
2686 }
2687
2688 /*----------------------------------------------------------------------------
2689 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2690 | be compared, and 0 otherwise. The invalid exception is raised if either
2691 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2692 | Standard for Binary Floating-Point Arithmetic.
2693 *----------------------------------------------------------------------------*/
2694
2695 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2696 {
2697 a = float32_squash_input_denormal(a STATUS_VAR);
2698 b = float32_squash_input_denormal(b STATUS_VAR);
2699
2700 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2701 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2702 ) {
2703 float_raise( float_flag_invalid STATUS_VAR);
2704 return 1;
2705 }
2706 return 0;
2707 }
2708
2709 /*----------------------------------------------------------------------------
2710 | Returns 1 if the single-precision floating-point value `a' is equal to
2711 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2712 | exception. The comparison is performed according to the IEC/IEEE Standard
2713 | for Binary Floating-Point Arithmetic.
2714 *----------------------------------------------------------------------------*/
2715
2716 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2717 {
2718 a = float32_squash_input_denormal(a STATUS_VAR);
2719 b = float32_squash_input_denormal(b STATUS_VAR);
2720
2721 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2722 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2723 ) {
2724 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2725 float_raise( float_flag_invalid STATUS_VAR);
2726 }
2727 return 0;
2728 }
2729 return ( float32_val(a) == float32_val(b) ) ||
2730 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2731 }
2732
2733 /*----------------------------------------------------------------------------
2734 | Returns 1 if the single-precision floating-point value `a' is less than or
2735 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2736 | cause an exception. Otherwise, the comparison is performed according to the
2737 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2738 *----------------------------------------------------------------------------*/
2739
2740 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2741 {
2742 flag aSign, bSign;
2743 uint32_t av, bv;
2744 a = float32_squash_input_denormal(a STATUS_VAR);
2745 b = float32_squash_input_denormal(b STATUS_VAR);
2746
2747 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2748 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2749 ) {
2750 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2751 float_raise( float_flag_invalid STATUS_VAR);
2752 }
2753 return 0;
2754 }
2755 aSign = extractFloat32Sign( a );
2756 bSign = extractFloat32Sign( b );
2757 av = float32_val(a);
2758 bv = float32_val(b);
2759 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2760 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2761
2762 }
2763
2764 /*----------------------------------------------------------------------------
2765 | Returns 1 if the single-precision floating-point value `a' is less than
2766 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2767 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2768 | Standard for Binary Floating-Point Arithmetic.
2769 *----------------------------------------------------------------------------*/
2770
2771 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2772 {
2773 flag aSign, bSign;
2774 uint32_t av, bv;
2775 a = float32_squash_input_denormal(a STATUS_VAR);
2776 b = float32_squash_input_denormal(b STATUS_VAR);
2777
2778 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2779 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2780 ) {
2781 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2782 float_raise( float_flag_invalid STATUS_VAR);
2783 }
2784 return 0;
2785 }
2786 aSign = extractFloat32Sign( a );
2787 bSign = extractFloat32Sign( b );
2788 av = float32_val(a);
2789 bv = float32_val(b);
2790 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2791 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2792
2793 }
2794
2795 /*----------------------------------------------------------------------------
2796 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2797 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2798 | comparison is performed according to the IEC/IEEE Standard for Binary
2799 | Floating-Point Arithmetic.
2800 *----------------------------------------------------------------------------*/
2801
2802 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2803 {
2804 a = float32_squash_input_denormal(a STATUS_VAR);
2805 b = float32_squash_input_denormal(b STATUS_VAR);
2806
2807 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2808 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2809 ) {
2810 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2811 float_raise( float_flag_invalid STATUS_VAR);
2812 }
2813 return 1;
2814 }
2815 return 0;
2816 }
2817
2818 /*----------------------------------------------------------------------------
2819 | Returns the result of converting the double-precision floating-point value
2820 | `a' to the 32-bit two's complement integer format. The conversion is
2821 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2822 | Arithmetic---which means in particular that the conversion is rounded
2823 | according to the current rounding mode. If `a' is a NaN, the largest
2824 | positive integer is returned. Otherwise, if the conversion overflows, the
2825 | largest integer with the same sign as `a' is returned.
2826 *----------------------------------------------------------------------------*/
2827
2828 int32 float64_to_int32( float64 a STATUS_PARAM )
2829 {
2830 flag aSign;
2831 int_fast16_t aExp, shiftCount;
2832 uint64_t aSig;
2833 a = float64_squash_input_denormal(a STATUS_VAR);
2834
2835 aSig = extractFloat64Frac( a );
2836 aExp = extractFloat64Exp( a );
2837 aSign = extractFloat64Sign( a );
2838 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2839 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2840 shiftCount = 0x42C - aExp;
2841 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2842 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2843
2844 }
2845
2846 /*----------------------------------------------------------------------------
2847 | Returns the result of converting the double-precision floating-point value
2848 | `a' to the 32-bit two's complement integer format. The conversion is
2849 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2850 | Arithmetic, except that the conversion is always rounded toward zero.
2851 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2852 | the conversion overflows, the largest integer with the same sign as `a' is
2853 | returned.
2854 *----------------------------------------------------------------------------*/
2855
2856 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2857 {
2858 flag aSign;
2859 int_fast16_t aExp, shiftCount;
2860 uint64_t aSig, savedASig;
2861 int32_t z;
2862 a = float64_squash_input_denormal(a STATUS_VAR);
2863
2864 aSig = extractFloat64Frac( a );
2865 aExp = extractFloat64Exp( a );
2866 aSign = extractFloat64Sign( a );
2867 if ( 0x41E < aExp ) {
2868 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2869 goto invalid;
2870 }
2871 else if ( aExp < 0x3FF ) {
2872 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2873 return 0;
2874 }
2875 aSig |= LIT64( 0x0010000000000000 );
2876 shiftCount = 0x433 - aExp;
2877 savedASig = aSig;
2878 aSig >>= shiftCount;
2879 z = aSig;
2880 if ( aSign ) z = - z;
2881 if ( ( z < 0 ) ^ aSign ) {
2882 invalid:
2883 float_raise( float_flag_invalid STATUS_VAR);
2884 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2885 }
2886 if ( ( aSig<<shiftCount ) != savedASig ) {
2887 STATUS(float_exception_flags) |= float_flag_inexact;
2888 }
2889 return z;
2890
2891 }
2892
2893 /*----------------------------------------------------------------------------
2894 | Returns the result of converting the double-precision floating-point value
2895 | `a' to the 16-bit two's complement integer format. The conversion is
2896 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2897 | Arithmetic, except that the conversion is always rounded toward zero.
2898 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2899 | the conversion overflows, the largest integer with the same sign as `a' is
2900 | returned.
2901 *----------------------------------------------------------------------------*/
2902
2903 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2904 {
2905 flag aSign;
2906 int_fast16_t aExp, shiftCount;
2907 uint64_t aSig, savedASig;
2908 int32 z;
2909
2910 aSig = extractFloat64Frac( a );
2911 aExp = extractFloat64Exp( a );
2912 aSign = extractFloat64Sign( a );
2913 if ( 0x40E < aExp ) {
2914 if ( ( aExp == 0x7FF ) && aSig ) {
2915 aSign = 0;
2916 }
2917 goto invalid;
2918 }
2919 else if ( aExp < 0x3FF ) {
2920 if ( aExp || aSig ) {
2921 STATUS(float_exception_flags) |= float_flag_inexact;
2922 }
2923 return 0;
2924 }
2925 aSig |= LIT64( 0x0010000000000000 );
2926 shiftCount = 0x433 - aExp;
2927 savedASig = aSig;
2928 aSig >>= shiftCount;
2929 z = aSig;
2930 if ( aSign ) {
2931 z = - z;
2932 }
2933 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2934 invalid:
2935 float_raise( float_flag_invalid STATUS_VAR);
2936 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2937 }
2938 if ( ( aSig<<shiftCount ) != savedASig ) {
2939 STATUS(float_exception_flags) |= float_flag_inexact;
2940 }
2941 return z;
2942 }
2943
2944 /*----------------------------------------------------------------------------
2945 | Returns the result of converting the double-precision floating-point value
2946 | `a' to the 64-bit two's complement integer format. The conversion is
2947 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2948 | Arithmetic---which means in particular that the conversion is rounded
2949 | according to the current rounding mode. If `a' is a NaN, the largest
2950 | positive integer is returned. Otherwise, if the conversion overflows, the
2951 | largest integer with the same sign as `a' is returned.
2952 *----------------------------------------------------------------------------*/
2953
2954 int64 float64_to_int64( float64 a STATUS_PARAM )
2955 {
2956 flag aSign;
2957 int_fast16_t aExp, shiftCount;
2958 uint64_t aSig, aSigExtra;
2959 a = float64_squash_input_denormal(a STATUS_VAR);
2960
2961 aSig = extractFloat64Frac( a );
2962 aExp = extractFloat64Exp( a );
2963 aSign = extractFloat64Sign( a );
2964 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2965 shiftCount = 0x433 - aExp;
2966 if ( shiftCount <= 0 ) {
2967 if ( 0x43E < aExp ) {
2968 float_raise( float_flag_invalid STATUS_VAR);
2969 if ( ! aSign
2970 || ( ( aExp == 0x7FF )
2971 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2972 ) {
2973 return LIT64( 0x7FFFFFFFFFFFFFFF );
2974 }
2975 return (int64_t) LIT64( 0x8000000000000000 );
2976 }
2977 aSigExtra = 0;
2978 aSig <<= - shiftCount;
2979 }
2980 else {
2981 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2982 }
2983 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2984
2985 }
2986
2987 /*----------------------------------------------------------------------------
2988 | Returns the result of converting the double-precision floating-point value
2989 | `a' to the 64-bit two's complement integer format. The conversion is
2990 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2991 | Arithmetic, except that the conversion is always rounded toward zero.
2992 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2993 | the conversion overflows, the largest integer with the same sign as `a' is
2994 | returned.
2995 *----------------------------------------------------------------------------*/
2996
2997 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2998 {
2999 flag aSign;
3000 int_fast16_t aExp, shiftCount;
3001 uint64_t aSig;
3002 int64 z;
3003 a = float64_squash_input_denormal(a STATUS_VAR);
3004
3005 aSig = extractFloat64Frac( a );
3006 aExp = extractFloat64Exp( a );
3007 aSign = extractFloat64Sign( a );
3008 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3009 shiftCount = aExp - 0x433;
3010 if ( 0 <= shiftCount ) {
3011 if ( 0x43E <= aExp ) {
3012 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3013 float_raise( float_flag_invalid STATUS_VAR);
3014 if ( ! aSign
3015 || ( ( aExp == 0x7FF )
3016 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3017 ) {
3018 return LIT64( 0x7FFFFFFFFFFFFFFF );
3019 }
3020 }
3021 return (int64_t) LIT64( 0x8000000000000000 );
3022 }
3023 z = aSig<<shiftCount;
3024 }
3025 else {
3026 if ( aExp < 0x3FE ) {
3027 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3028 return 0;
3029 }
3030 z = aSig>>( - shiftCount );
3031 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3032 STATUS(float_exception_flags) |= float_flag_inexact;
3033 }
3034 }
3035 if ( aSign ) z = - z;
3036 return z;
3037
3038 }
3039
3040 /*----------------------------------------------------------------------------
3041 | Returns the result of converting the double-precision floating-point value
3042 | `a' to the single-precision floating-point format. The conversion is
3043 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3044 | Arithmetic.
3045 *----------------------------------------------------------------------------*/
3046
3047 float32 float64_to_float32( float64 a STATUS_PARAM )
3048 {
3049 flag aSign;
3050 int_fast16_t aExp;
3051 uint64_t aSig;
3052 uint32_t zSig;
3053 a = float64_squash_input_denormal(a STATUS_VAR);
3054
3055 aSig = extractFloat64Frac( a );
3056 aExp = extractFloat64Exp( a );
3057 aSign = extractFloat64Sign( a );
3058 if ( aExp == 0x7FF ) {
3059 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3060 return packFloat32( aSign, 0xFF, 0 );
3061 }
3062 shift64RightJamming( aSig, 22, &aSig );
3063 zSig = aSig;
3064 if ( aExp || zSig ) {
3065 zSig |= 0x40000000;
3066 aExp -= 0x381;
3067 }
3068 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3069
3070 }
3071
3072
3073 /*----------------------------------------------------------------------------
3074 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3075 | half-precision floating-point value, returning the result. After being
3076 | shifted into the proper positions, the three fields are simply added
3077 | together to form the result. This means that any integer portion of `zSig'
3078 | will be added into the exponent. Since a properly normalized significand
3079 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3080 | than the desired result exponent whenever `zSig' is a complete, normalized
3081 | significand.
3082 *----------------------------------------------------------------------------*/
3083 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3084 {
3085 return make_float16(
3086 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3087 }
3088
3089 /*----------------------------------------------------------------------------
3090 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3091 | and significand `zSig', and returns the proper half-precision floating-
3092 | point value corresponding to the abstract input. Ordinarily, the abstract
3093 | value is simply rounded and packed into the half-precision format, with
3094 | the inexact exception raised if the abstract input cannot be represented
3095 | exactly. However, if the abstract value is too large, the overflow and
3096 | inexact exceptions are raised and an infinity or maximal finite value is
3097 | returned. If the abstract value is too small, the input value is rounded to
3098 | a subnormal number, and the underflow and inexact exceptions are raised if
3099 | the abstract input cannot be represented exactly as a subnormal half-
3100 | precision floating-point number.
3101 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3102 | ARM-style "alternative representation", which omits the NaN and Inf
3103 | encodings in order to raise the maximum representable exponent by one.
3104 | The input significand `zSig' has its binary point between bits 22
3105 | and 23, which is 13 bits to the left of the usual location. This shifted
3106 | significand must be normalized or smaller. If `zSig' is not normalized,
3107 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3108 | and it must not require rounding. In the usual case that `zSig' is
3109 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3110 | Note the slightly odd position of the binary point in zSig compared with the
3111 | other roundAndPackFloat functions. This should probably be fixed if we
3112 | need to implement more float16 routines than just conversion.
3113 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3114 | Binary Floating-Point Arithmetic.
3115 *----------------------------------------------------------------------------*/
3116
3117 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3118 uint32_t zSig, flag ieee STATUS_PARAM)
3119 {
3120 int maxexp = ieee ? 29 : 30;
3121 uint32_t mask;
3122 uint32_t increment;
3123 int8 roundingMode;
3124 bool rounding_bumps_exp;
3125 bool is_tiny = false;
3126
3127 /* Calculate the mask of bits of the mantissa which are not
3128 * representable in half-precision and will be lost.
3129 */
3130 if (zExp < 1) {
3131 /* Will be denormal in halfprec */
3132 mask = 0x00ffffff;
3133 if (zExp >= -11) {
3134 mask >>= 11 + zExp;
3135 }
3136 } else {
3137 /* Normal number in halfprec */
3138 mask = 0x00001fff;
3139 }
3140
3141 roundingMode = STATUS(float_rounding_mode);
3142 switch (roundingMode) {
3143 case float_round_nearest_even:
3144 increment = (mask + 1) >> 1;
3145 if ((zSig & mask) == increment) {
3146 increment = zSig & (increment << 1);
3147 }
3148 break;
3149 case float_round_up:
3150 increment = zSign ? 0 : mask;
3151 break;
3152 case float_round_down:
3153 increment = zSign ? mask : 0;
3154 break;
3155 default: /* round_to_zero */
3156 increment = 0;
3157 break;
3158 }
3159
3160 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3161
3162 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3163 if (ieee) {
3164 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3165 return packFloat16(zSign, 0x1f, 0);
3166 } else {
3167 float_raise(float_flag_invalid STATUS_VAR);
3168 return packFloat16(zSign, 0x1f, 0x3ff);
3169 }
3170 }
3171
3172 if (zExp < 0) {
3173 /* Note that flush-to-zero does not affect half-precision results */
3174 is_tiny =
3175 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3176 || (zExp < -1)
3177 || (!rounding_bumps_exp);
3178 }
3179 if (zSig & mask) {
3180 float_raise(float_flag_inexact STATUS_VAR);
3181 if (is_tiny) {
3182 float_raise(float_flag_underflow STATUS_VAR);
3183 }
3184 }
3185
3186 zSig += increment;
3187 if (rounding_bumps_exp) {
3188 zSig >>= 1;
3189 zExp++;
3190 }
3191
3192 if (zExp < -10) {
3193 return packFloat16(zSign, 0, 0);
3194 }
3195 if (zExp < 0) {
3196 zSig >>= -zExp;
3197 zExp = 0;
3198 }
3199 return packFloat16(zSign, zExp, zSig >> 13);
3200 }
3201
/* Normalizes the subnormal half-precision significand `aSig'.  The shifted
 * significand is stored through `zSigPtr' and the matching exponent through
 * `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* Assuming countLeadingZeros32 returns the number of leading zero bits,
     * this is the left shift that moves the top set bit of aSig to bit 10.
     */
    int8_t lshift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
3209
3210 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3211 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3212
3213 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3214 {
3215 flag aSign;
3216 int_fast16_t aExp;
3217 uint32_t aSig;
3218
3219 aSign = extractFloat16Sign(a);
3220 aExp = extractFloat16Exp(a);
3221 aSig = extractFloat16Frac(a);
3222
3223 if (aExp == 0x1f && ieee) {
3224 if (aSig) {
3225 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3226 }
3227 return packFloat32(aSign, 0xff, 0);
3228 }
3229 if (aExp == 0) {
3230 if (aSig == 0) {
3231 return packFloat32(aSign, 0, 0);
3232 }
3233
3234 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3235 aExp--;
3236 }
3237 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3238 }
3239
3240 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3241 {
3242 flag aSign;
3243 int_fast16_t aExp;
3244 uint32_t aSig;
3245
3246 a = float32_squash_input_denormal(a STATUS_VAR);
3247
3248 aSig = extractFloat32Frac( a );
3249 aExp = extractFloat32Exp( a );
3250 aSign = extractFloat32Sign( a );
3251 if ( aExp == 0xFF ) {
3252 if (aSig) {
3253 /* Input is a NaN */
3254 if (!ieee) {
3255 float_raise(float_flag_invalid STATUS_VAR);
3256 return packFloat16(aSign, 0, 0);
3257 }
3258 return commonNaNToFloat16(
3259 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3260 }
3261 /* Infinity */
3262 if (!ieee) {
3263 float_raise(float_flag_invalid STATUS_VAR);
3264 return packFloat16(aSign, 0x1f, 0x3ff);
3265 }
3266 return packFloat16(aSign, 0x1f, 0);
3267 }
3268 if (aExp == 0 && aSig == 0) {
3269 return packFloat16(aSign, 0, 0);
3270 }
3271 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3272 * even if the input is denormal; however this is harmless because
3273 * the largest possible single-precision denormal is still smaller
3274 * than the smallest representable half-precision denormal, and so we
3275 * will end up ignoring aSig and returning via the "always return zero"
3276 * codepath.
3277 */
3278 aSig |= 0x00800000;
3279 aExp -= 0x71;
3280
3281 return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
3282 }
3283
3284 float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3285 {
3286 flag aSign;
3287 int_fast16_t aExp;
3288 uint32_t aSig;
3289
3290 aSign = extractFloat16Sign(a);
3291 aExp = extractFloat16Exp(a);
3292 aSig = extractFloat16Frac(a);
3293
3294 if (aExp == 0x1f && ieee) {
3295 if (aSig) {
3296 return commonNaNToFloat64(
3297 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3298 }
3299 return packFloat64(aSign, 0x7ff, 0);
3300 }
3301 if (aExp == 0) {
3302 if (aSig == 0) {
3303 return packFloat64(aSign, 0, 0);
3304 }
3305
3306 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3307 aExp--;
3308 }
3309 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3310 }
3311
3312 float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
3313 {
3314 flag aSign;
3315 int_fast16_t aExp;
3316 uint64_t aSig;
3317 uint32_t zSig;
3318
3319 a = float64_squash_input_denormal(a STATUS_VAR);
3320
3321 aSig = extractFloat64Frac(a);
3322 aExp = extractFloat64Exp(a);
3323 aSign = extractFloat64Sign(a);
3324 if (aExp == 0x7FF) {
3325 if (aSig) {
3326 /* Input is a NaN */
3327 if (!ieee) {
3328 float_raise(float_flag_invalid STATUS_VAR);
3329 return packFloat16(aSign, 0, 0);
3330 }
3331 return commonNaNToFloat16(
3332 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3333 }
3334 /* Infinity */
3335 if (!ieee) {
3336 float_raise(float_flag_invalid STATUS_VAR);
3337 return packFloat16(aSign, 0x1f, 0x3ff);
3338 }
3339 return packFloat16(aSign, 0x1f, 0);
3340 }
3341 shift64RightJamming(aSig, 29, &aSig);
3342 zSig = aSig;
3343 if (aExp == 0 && zSig == 0) {
3344 return packFloat16(aSign, 0, 0);
3345 }
3346 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3347 * even if the input is denormal; however this is harmless because
3348 * the largest possible single-precision denormal is still smaller
3349 * than the smallest representable half-precision denormal, and so we
3350 * will end up ignoring aSig and returning via the "always return zero"
3351 * codepath.
3352 */
3353 zSig |= 0x00800000;
3354 aExp -= 0x3F1;
3355
3356 return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3357 }
3358
3359 /*----------------------------------------------------------------------------
3360 | Returns the result of converting the double-precision floating-point value
3361 | `a' to the extended double-precision floating-point format. The conversion
3362 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3363 | Arithmetic.
3364 *----------------------------------------------------------------------------*/
3365
3366 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3367 {
3368 flag aSign;
3369 int_fast16_t aExp;
3370 uint64_t aSig;
3371
3372 a = float64_squash_input_denormal(a STATUS_VAR);
3373 aSig = extractFloat64Frac( a );
3374 aExp = extractFloat64Exp( a );
3375 aSign = extractFloat64Sign( a );
3376 if ( aExp == 0x7FF ) {
3377 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3378 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3379 }
3380 if ( aExp == 0 ) {
3381 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3382 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3383 }
3384 return
3385 packFloatx80(
3386 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3387
3388 }
3389
3390 /*----------------------------------------------------------------------------
3391 | Returns the result of converting the double-precision floating-point value
3392 | `a' to the quadruple-precision floating-point format. The conversion is
3393 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3394 | Arithmetic.
3395 *----------------------------------------------------------------------------*/
3396
3397 float128 float64_to_float128( float64 a STATUS_PARAM )
3398 {
3399 flag aSign;
3400 int_fast16_t aExp;
3401 uint64_t aSig, zSig0, zSig1;
3402
3403 a = float64_squash_input_denormal(a STATUS_VAR);
3404 aSig = extractFloat64Frac( a );
3405 aExp = extractFloat64Exp( a );
3406 aSign = extractFloat64Sign( a );
3407 if ( aExp == 0x7FF ) {
3408 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3409 return packFloat128( aSign, 0x7FFF, 0, 0 );
3410 }
3411 if ( aExp == 0 ) {
3412 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3413 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3414 --aExp;
3415 }
3416 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3417 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3418
3419 }
3420
3421 /*----------------------------------------------------------------------------
3422 | Rounds the double-precision floating-point value `a' to an integer, and
3423 | returns the result as a double-precision floating-point value. The
3424 | operation is performed according to the IEC/IEEE Standard for Binary
3425 | Floating-Point Arithmetic.
3426 *----------------------------------------------------------------------------*/
3427
3428 float64 float64_round_to_int( float64 a STATUS_PARAM )
3429 {
3430 flag aSign;
3431 int_fast16_t aExp;
3432 uint64_t lastBitMask, roundBitsMask;
3433 int8 roundingMode;
3434 uint64_t z;
3435 a = float64_squash_input_denormal(a STATUS_VAR);
3436
3437 aExp = extractFloat64Exp( a );
3438 if ( 0x433 <= aExp ) {
3439 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3440 return propagateFloat64NaN( a, a STATUS_VAR );
3441 }
3442 return a;
3443 }
3444 if ( aExp < 0x3FF ) {
3445 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3446 STATUS(float_exception_flags) |= float_flag_inexact;
3447 aSign = extractFloat64Sign( a );
3448 switch ( STATUS(float_rounding_mode) ) {
3449 case float_round_nearest_even:
3450 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3451 return packFloat64( aSign, 0x3FF, 0 );
3452 }
3453 break;
3454 case float_round_down:
3455 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3456 case float_round_up:
3457 return make_float64(
3458 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3459 }
3460 return packFloat64( aSign, 0, 0 );
3461 }
3462 lastBitMask = 1;
3463 lastBitMask <<= 0x433 - aExp;
3464 roundBitsMask = lastBitMask - 1;
3465 z = float64_val(a);
3466 roundingMode = STATUS(float_rounding_mode);
3467 if ( roundingMode == float_round_nearest_even ) {
3468 z += lastBitMask>>1;
3469 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3470 }
3471 else if ( roundingMode != float_round_to_zero ) {
3472 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3473 z += roundBitsMask;
3474 }
3475 }
3476 z &= ~ roundBitsMask;
3477 if ( z != float64_val(a) )
3478 STATUS(float_exception_flags) |= float_flag_inexact;
3479 return make_float64(z);
3480
3481 }
3482
3483 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3484 {
3485 int oldmode;
3486 float64 res;
3487 oldmode = STATUS(float_rounding_mode);
3488 STATUS(float_rounding_mode) = float_round_to_zero;
3489 res = float64_round_to_int(a STATUS_VAR);
3490 STATUS(float_rounding_mode) = oldmode;
3491 return res;
3492 }
3493
3494 /*----------------------------------------------------------------------------
3495 | Returns the result of adding the absolute values of the double-precision
3496 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3497 | before being returned. `zSign' is ignored if the result is a NaN.
3498 | The addition is performed according to the IEC/IEEE Standard for Binary
3499 | Floating-Point Arithmetic.
3500 *----------------------------------------------------------------------------*/
3501
3502 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3503 {
3504 int_fast16_t aExp, bExp, zExp;
3505 uint64_t aSig, bSig, zSig;
3506 int_fast16_t expDiff;
3507
3508 aSig = extractFloat64Frac( a );
3509 aExp = extractFloat64Exp( a );
3510 bSig = extractFloat64Frac( b );
3511 bExp = extractFloat64Exp( b );
3512 expDiff = aExp - bExp;
3513 aSig <<= 9;
3514 bSig <<= 9;
3515 if ( 0 < expDiff ) {
3516 if ( aExp == 0x7FF ) {
3517 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3518 return a;
3519 }
3520 if ( bExp == 0 ) {
3521 --expDiff;
3522 }
3523 else {
3524 bSig |= LIT64( 0x2000000000000000 );
3525 }
3526 shift64RightJamming( bSig, expDiff, &bSig );
3527 zExp = aExp;
3528 }
3529 else if ( expDiff < 0 ) {
3530 if ( bExp == 0x7FF ) {
3531 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3532 return packFloat64( zSign, 0x7FF, 0 );
3533 }
3534 if ( aExp == 0 ) {
3535 ++expDiff;
3536 }
3537 else {
3538 aSig |= LIT64( 0x2000000000000000 );
3539 }
3540 shift64RightJamming( aSig, - expDiff, &aSig );
3541 zExp = bExp;
3542 }
3543 else {
3544 if ( aExp == 0x7FF ) {
3545 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3546 return a;
3547 }
3548 if ( aExp == 0 ) {
3549 if (STATUS(flush_to_zero)) {
3550 if (aSig | bSig) {
3551 float_raise(float_flag_output_denormal STATUS_VAR);
3552 }
3553 return packFloat64(zSign, 0, 0);
3554 }
3555 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3556 }
3557 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3558 zExp = aExp;
3559 goto roundAndPack;
3560 }
3561 aSig |= LIT64( 0x2000000000000000 );
3562 zSig = ( aSig + bSig )<<1;
3563 --zExp;
3564 if ( (int64_t) zSig < 0 ) {
3565 zSig = aSig + bSig;
3566 ++zExp;
3567 }
3568 roundAndPack:
3569 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3570
3571 }
3572
3573 /*----------------------------------------------------------------------------
3574 | Returns the result of subtracting the absolute values of the double-
3575 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3576 | difference is negated before being returned. `zSign' is ignored if the
3577 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3578 | Standard for Binary Floating-Point Arithmetic.
3579 *----------------------------------------------------------------------------*/
3580
3581 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3582 {
3583 int_fast16_t aExp, bExp, zExp;
3584 uint64_t aSig, bSig, zSig;
3585 int_fast16_t expDiff;
3586
3587 aSig = extractFloat64Frac( a );
3588 aExp = extractFloat64Exp( a );
3589 bSig = extractFloat64Frac( b );
3590 bExp = extractFloat64Exp( b );
3591 expDiff = aExp - bExp;
3592 aSig <<= 10;
3593 bSig <<= 10;
3594 if ( 0 < expDiff ) goto aExpBigger;
3595 if ( expDiff < 0 ) goto bExpBigger;
3596 if ( aExp == 0x7FF ) {
3597 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3598 float_raise( float_flag_invalid STATUS_VAR);
3599 return float64_default_nan;
3600 }
3601 if ( aExp == 0 ) {
3602 aExp = 1;
3603 bExp = 1;
3604 }
3605 if ( bSig < aSig ) goto aBigger;
3606 if ( aSig < bSig ) goto bBigger;
3607 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3608 bExpBigger:
3609 if ( bExp == 0x7FF ) {
3610 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3611 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3612 }
3613 if ( aExp == 0 ) {
3614 ++expDiff;
3615 }
3616 else {
3617 aSig |= LIT64( 0x4000000000000000 );
3618 }
3619 shift64RightJamming( aSig, - expDiff, &aSig );
3620 bSig |= LIT64( 0x4000000000000000 );
3621 bBigger:
3622 zSig = bSig - aSig;
3623 zExp = bExp;
3624 zSign ^= 1;
3625 goto normalizeRoundAndPack;
3626 aExpBigger:
3627 if ( aExp == 0x7FF ) {
3628 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3629 return a;
3630 }
3631 if ( bExp == 0 ) {
3632 --expDiff;
3633 }
3634 else {
3635 bSig |= LIT64( 0x4000000000000000 );
3636 }
3637 shift64RightJamming( bSig, expDiff, &bSig );
3638 aSig |= LIT64( 0x4000000000000000 );
3639 aBigger:
3640 zSig = aSig - bSig;
3641 zExp = aExp;
3642 normalizeRoundAndPack:
3643 --zExp;
3644 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3645
3646 }
3647
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of adding the double-precision floating-point values `a'
3650 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3651 | Binary Floating-Point Arithmetic.
3652 *----------------------------------------------------------------------------*/
3653
3654 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3655 {
3656 flag aSign, bSign;
3657 a = float64_squash_input_denormal(a STATUS_VAR);
3658 b = float64_squash_input_denormal(b STATUS_VAR);
3659
3660 aSign = extractFloat64Sign( a );
3661 bSign = extractFloat64Sign( b );
3662 if ( aSign == bSign ) {
3663 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3664 }
3665 else {
3666 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3667 }
3668
3669 }
3670
3671 /*----------------------------------------------------------------------------
3672 | Returns the result of subtracting the double-precision floating-point values
3673 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3674 | for Binary Floating-Point Arithmetic.
3675 *----------------------------------------------------------------------------*/
3676
3677 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3678 {
3679 flag aSign, bSign;
3680 a = float64_squash_input_denormal(a STATUS_VAR);
3681 b = float64_squash_input_denormal(b STATUS_VAR);
3682
3683 aSign = extractFloat64Sign( a );
3684 bSign = extractFloat64Sign( b );
3685 if ( aSign == bSign ) {
3686 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3687 }
3688 else {
3689 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3690 }
3691
3692 }
3693
3694 /*----------------------------------------------------------------------------
3695 | Returns the result of multiplying the double-precision floating-point values
3696 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3697 | for Binary Floating-Point Arithmetic.
3698 *----------------------------------------------------------------------------*/
3699
3700 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3701 {
3702 flag aSign, bSign, zSign;
3703 int_fast16_t aExp, bExp, zExp;
3704 uint64_t aSig, bSig, zSig0, zSig1;
3705
3706 a = float64_squash_input_denormal(a STATUS_VAR);
3707 b = float64_squash_input_denormal(b STATUS_VAR);
3708
3709 aSig = extractFloat64Frac( a );
3710 aExp = extractFloat64Exp( a );
3711 aSign = extractFloat64Sign( a );
3712 bSig = extractFloat64Frac( b );
3713 bExp = extractFloat64Exp( b );
3714 bSign = extractFloat64Sign( b );
3715 zSign = aSign ^ bSign;
3716 if ( aExp == 0x7FF ) {
3717 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3718 return propagateFloat64NaN( a, b STATUS_VAR );
3719 }
3720 if ( ( bExp | bSig ) == 0 ) {
3721 float_raise( float_flag_invalid STATUS_VAR);
3722 return float64_default_nan;
3723 }
3724 return packFloat64( zSign, 0x7FF, 0 );
3725 }
3726 if ( bExp == 0x7FF ) {
3727 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3728 if ( ( aExp | aSig ) == 0 ) {
3729 float_raise( float_flag_invalid STATUS_VAR);
3730 return float64_default_nan;
3731 }
3732 return packFloat64( zSign, 0x7FF, 0 );
3733 }
3734 if ( aExp == 0 ) {
3735 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3736 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3737 }
3738 if ( bExp == 0 ) {
3739 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3740 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3741 }
3742 zExp = aExp + bExp - 0x3FF;
3743 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3744 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3745 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3746 zSig0 |= ( zSig1 != 0 );
3747 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3748 zSig0 <<= 1;
3749 --zExp;
3750 }
3751 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3752
3753 }
3754
3755 /*----------------------------------------------------------------------------
3756 | Returns the result of dividing the double-precision floating-point value `a'
3757 | by the corresponding value `b'. The operation is performed according to
3758 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3759 *----------------------------------------------------------------------------*/
3760
3761 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3762 {
3763 flag aSign, bSign, zSign;
3764 int_fast16_t aExp, bExp, zExp;
3765 uint64_t aSig, bSig, zSig;
3766 uint64_t rem0, rem1;
3767 uint64_t term0, term1;
3768 a = float64_squash_input_denormal(a STATUS_VAR);
3769 b = float64_squash_input_denormal(b STATUS_VAR);
3770
3771 aSig = extractFloat64Frac( a );
3772 aExp = extractFloat64Exp( a );
3773 aSign = extractFloat64Sign( a );
3774 bSig = extractFloat64Frac( b );
3775 bExp = extractFloat64Exp( b );
3776 bSign = extractFloat64Sign( b );
3777 zSign = aSign ^ bSign;
3778 if ( aExp == 0x7FF ) {
3779 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3780 if ( bExp == 0x7FF ) {
3781 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3782 float_raise( float_flag_invalid STATUS_VAR);
3783 return float64_default_nan;
3784 }
3785 return packFloat64( zSign, 0x7FF, 0 );
3786 }
3787 if ( bExp == 0x7FF ) {
3788 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3789 return packFloat64( zSign, 0, 0 );
3790 }
3791 if ( bExp == 0 ) {
3792 if ( bSig == 0 ) {
3793 if ( ( aExp | aSig ) == 0 ) {
3794 float_raise( float_flag_invalid STATUS_VAR);
3795 return float64_default_nan;
3796 }
3797 float_raise( float_flag_divbyzero STATUS_VAR);
3798 return packFloat64( zSign, 0x7FF, 0 );
3799 }
3800 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3801 }
3802 if ( aExp == 0 ) {
3803 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3804 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3805 }
3806 zExp = aExp - bExp + 0x3FD;
3807 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3808 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3809 if ( bSig <= ( aSig + aSig ) ) {
3810 aSig >>= 1;
3811 ++zExp;
3812 }
3813 zSig = estimateDiv128To64( aSig, 0, bSig );
3814 if ( ( zSig & 0x1FF ) <= 2 ) {
3815 mul64To128( bSig, zSig, &term0, &term1 );
3816 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3817 while ( (int64_t) rem0 < 0 ) {
3818 --zSig;
3819 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3820 }
3821 zSig |= ( rem1 != 0 );
3822 }
3823 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3824
3825 }
3826
3827 /*----------------------------------------------------------------------------
3828 | Returns the remainder of the double-precision floating-point value `a'
3829 | with respect to the corresponding value `b'. The operation is performed
3830 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3831 *----------------------------------------------------------------------------*/
3832
3833 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3834 {
3835 flag aSign, zSign;
3836 int_fast16_t aExp, bExp, expDiff;
3837 uint64_t aSig, bSig;
3838 uint64_t q, alternateASig;
3839 int64_t sigMean;
3840
3841 a = float64_squash_input_denormal(a STATUS_VAR);
3842 b = float64_squash_input_denormal(b STATUS_VAR);
3843 aSig = extractFloat64Frac( a );
3844 aExp = extractFloat64Exp( a );
3845 aSign = extractFloat64Sign( a );
3846 bSig = extractFloat64Frac( b );
3847 bExp = extractFloat64Exp( b );
3848 if ( aExp == 0x7FF ) {
3849 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3850 return propagateFloat64NaN( a, b STATUS_VAR );
3851 }
3852 float_raise( float_flag_invalid STATUS_VAR);
3853 return float64_default_nan;
3854 }
3855 if ( bExp == 0x7FF ) {
3856 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3857 return a;
3858 }
3859 if ( bExp == 0 ) {
3860 if ( bSig == 0 ) {
3861 float_raise( float_flag_invalid STATUS_VAR);
3862 return float64_default_nan;
3863 }
3864 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3865 }
3866 if ( aExp == 0 ) {
3867 if ( aSig == 0 ) return a;
3868 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3869 }
3870 expDiff = aExp - bExp;
3871 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3872 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3873 if ( expDiff < 0 ) {
3874 if ( expDiff < -1 ) return a;
3875 aSig >>= 1;
3876 }
3877 q = ( bSig <= aSig );
3878 if ( q ) aSig -= bSig;
3879 expDiff -= 64;
3880 while ( 0 < expDiff ) {
3881 q = estimateDiv128To64( aSig, 0, bSig );
3882 q = ( 2 < q ) ? q - 2 : 0;
3883 aSig = - ( ( bSig>>2 ) * q );
3884 expDiff -= 62;
3885 }
3886 expDiff += 64;
3887 if ( 0 < expDiff ) {
3888 q = estimateDiv128To64( aSig, 0, bSig );
3889 q = ( 2 < q ) ? q - 2 : 0;
3890 q >>= 64 - expDiff;
3891 bSig >>= 2;
3892 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3893 }
3894 else {
3895 aSig >>= 2;
3896 bSig >>= 2;
3897 }
3898 do {
3899 alternateASig = aSig;
3900 ++q;
3901 aSig -= bSig;
3902 } while ( 0 <= (int64_t) aSig );
3903 sigMean = aSig + alternateASig;
3904 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3905 aSig = alternateASig;
3906 }
3907 zSign = ( (int64_t) aSig < 0 );
3908 if ( zSign ) aSig = - aSig;
3909 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3910
3911 }
3912
3913 /*----------------------------------------------------------------------------
3914 | Returns the result of multiplying the double-precision floating-point values
3915 | `a' and `b' then adding 'c', with no intermediate rounding step after the
3916 | multiplication. The operation is performed according to the IEC/IEEE
3917 | Standard for Binary Floating-Point Arithmetic 754-2008.
3918 | The flags argument allows the caller to select negation of the
3919 | addend, the intermediate product, or the final result. (The difference
3920 | between this and having the caller do a separate negation is that negating
3921 | externally will flip the sign bit on NaNs.)
3922 *----------------------------------------------------------------------------*/
3923
3924 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3925 {
3926 flag aSign, bSign, cSign, zSign;
3927 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3928 uint64_t aSig, bSig, cSig;
3929 flag pInf, pZero, pSign;
3930 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3931 int shiftcount;
3932 flag signflip, infzero;
3933
3934 a = float64_squash_input_denormal(a STATUS_VAR);
3935 b = float64_squash_input_denormal(b STATUS_VAR);
3936 c = float64_squash_input_denormal(c STATUS_VAR);
3937 aSig = extractFloat64Frac(a);
3938 aExp = extractFloat64Exp(a);
3939 aSign = extractFloat64Sign(a);
3940 bSig = extractFloat64Frac(b);
3941 bExp = extractFloat64Exp(b);
3942 bSign = extractFloat64Sign(b);
3943 cSig = extractFloat64Frac(c);
3944 cExp = extractFloat64Exp(c);
3945 cSign = extractFloat64Sign(c);
3946
3947 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3948 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3949
3950 /* It is implementation-defined whether the cases of (0,inf,qnan)
3951 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3952 * they return if they do), so we have to hand this information
3953 * off to the target-specific pick-a-NaN routine.
3954 */
3955 if (((aExp == 0x7ff) && aSig) ||
3956 ((bExp == 0x7ff) && bSig) ||
3957 ((cExp == 0x7ff) && cSig)) {
3958 return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3959 }
3960
3961 if (infzero) {
3962 float_raise(float_flag_invalid STATUS_VAR);
3963 return float64_default_nan;
3964 }
3965
3966 if (flags & float_muladd_negate_c) {
3967 cSign ^= 1;
3968 }
3969
3970 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3971
3972 /* Work out the sign and type of the product */
3973 pSign = aSign ^ bSign;
3974 if (flags & float_muladd_negate_product) {
3975 pSign ^= 1;
3976 }
3977 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3978 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3979
3980 if (cExp == 0x7ff) {
3981 if (pInf && (pSign ^ cSign)) {
3982 /* addition of opposite-signed infinities => InvalidOperation */
3983 float_raise(float_flag_invalid STATUS_VAR);
3984 return float64_default_nan;
3985 }
3986 /* Otherwise generate an infinity of the same sign */
3987 return packFloat64(cSign ^ signflip, 0x7ff, 0);
3988 }
3989
3990 if (pInf) {
3991 return packFloat64(pSign ^ signflip, 0x7ff, 0);
3992 }
3993
3994 if (pZero) {
3995 if (cExp == 0) {
3996 if (cSig == 0) {
3997 /* Adding two exact zeroes */
3998 if (pSign == cSign) {
3999 zSign = pSign;
4000 } else if (STATUS(float_rounding_mode) == float_round_down) {
4001 zSign = 1;
4002 } else {
4003 zSign = 0;
4004 }
4005 return packFloat64(zSign ^ signflip, 0, 0);
4006 }
4007 /* Exact zero plus a denorm */
4008 if (STATUS(flush_to_zero)) {
4009 float_raise(float_flag_output_denormal STATUS_VAR);
4010 return packFloat64(cSign ^ signflip, 0, 0);
4011 }
4012 }
4013 /* Zero plus something non-zero : just return the something */
4014 return packFloat64(cSign ^ signflip, cExp, cSig);
4015 }
4016
4017 if (aExp == 0) {
4018 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4019 }
4020 if (bExp == 0) {
4021 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4022 }
4023
4024 /* Calculate the actual result a * b + c */
4025
4026 /* Multiply first; this is easy. */
4027 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4028 * because we want the true exponent, not the "one-less-than"
4029 * flavour that roundAndPackFloat64() takes.
4030 */
4031 pExp = aExp + bExp - 0x3fe;
4032 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4033 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4034 mul64To128(aSig, bSig, &pSig0, &pSig1);
4035 if ((int64_t)(pSig0 << 1) >= 0) {
4036 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4037 pExp--;
4038 }
4039
4040 zSign = pSign ^ signflip;
4041
4042 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4043 * bit in position 126.
4044 */
4045 if (cExp == 0) {
4046 if (!cSig) {
4047 /* Throw out the special case of c being an exact zero now */
4048 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4049 return roundAndPackFloat64(zSign, pExp - 1,
4050 pSig1 STATUS_VAR);
4051 }
4052 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4053 }
4054
4055 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4056 * significand of the addend, with the explicit bit in position 126.
4057 */
4058 cSig0 = cSig << (126 - 64 - 52);
4059 cSig1 = 0;
4060 cSig0 |= LIT64(0x4000000000000000);
4061 expDiff = pExp - cExp;
4062
4063 if (pSign == cSign) {
4064 /* Addition */
4065 if (expDiff > 0) {
4066 /* scale c to match p */
4067 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4068 zExp = pExp;
4069 } else if (expDiff < 0) {
4070 /* scale p to match c */
4071 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4072 zExp = cExp;
4073 } else {
4074 /* no scaling needed */
4075 zExp = cExp;
4076 }
4077 /* Add significands and make sure explicit bit ends up in posn 126 */
4078 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4079 if ((int64_t)zSig0 < 0) {
4080 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4081 } else {
4082 zExp--;
4083 }
4084 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4085 return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4086 } else {
4087 /* Subtraction */
4088 if (expDiff > 0) {
4089 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4090 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4091 zExp = pExp;
4092 } else if (expDiff < 0) {
4093 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4094 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4095 zExp = cExp;
4096 zSign ^= 1;
4097 } else {
4098 zExp = pExp;
4099 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4100 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4101 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4102 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4103 zSign ^= 1;
4104 } else {
4105 /* Exact zero */
4106 zSign = signflip;
4107 if (STATUS(float_rounding_mode) == float_round_down) {
4108 zSign ^= 1;
4109 }
4110 return packFloat64(zSign, 0, 0);
4111 }
4112 }
4113 --zExp;
4114 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4115 * starting with the significand in a pair of uint64_t.
4116 */
4117 if (zSig0) {
4118 shiftcount = countLeadingZeros64(zSig0) - 1;
4119 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4120 if (zSig1) {
4121 zSig0 |= 1;
4122 }
4123 zExp -= shiftcount;
4124 } else {
4125 shiftcount = countLeadingZeros64(zSig1);
4126 if (shiftcount == 0) {
4127 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4128 zExp -= 63;
4129 } else {
4130 shiftcount--;
4131 zSig0 = zSig1 << shiftcount;
4132 zExp -= (shiftcount + 64);
4133 }
4134 }
4135 return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4136 }
4137 }
4138
4139 /*----------------------------------------------------------------------------
4140 | Returns the square root of the double-precision floating-point value `a'.
4141 | The operation is performed according to the IEC/IEEE Standard for Binary
4142 | Floating-Point Arithmetic.
4143 *----------------------------------------------------------------------------*/
4144
4145 float64 float64_sqrt( float64 a STATUS_PARAM )
4146 {
4147 flag aSign;
4148 int_fast16_t aExp, zExp;
4149 uint64_t aSig, zSig, doubleZSig;
4150 uint64_t rem0, rem1, term0, term1;
4151 a = float64_squash_input_denormal(a STATUS_VAR);
4152
4153 aSig = extractFloat64Frac( a );
4154 aExp = extractFloat64Exp( a );
4155 aSign = extractFloat64Sign( a );
4156 if ( aExp == 0x7FF ) {
4157 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4158 if ( ! aSign ) return a;
4159 float_raise( float_flag_invalid STATUS_VAR);
4160 return float64_default_nan;
4161 }
4162 if ( aSign ) {
4163 if ( ( aExp | aSig ) == 0 ) return a;
4164 float_raise( float_flag_invalid STATUS_VAR);
4165 return float64_default_nan;
4166 }
4167 if ( aExp == 0 ) {
4168 if ( aSig == 0 ) return float64_zero;
4169 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4170 }
4171 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4172 aSig |= LIT64( 0x0010000000000000 );
4173 zSig = estimateSqrt32( aExp, aSig>>21 );
4174 aSig <<= 9 - ( aExp & 1 );
4175 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4176 if ( ( zSig & 0x1FF ) <= 5 ) {
4177 doubleZSig = zSig<<1;
4178 mul64To128( zSig, zSig, &term0, &term1 );
4179 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4180 while ( (int64_t) rem0 < 0 ) {
4181 --zSig;
4182 doubleZSig -= 2;
4183 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4184 }
4185 zSig |= ( ( rem0 | rem1 ) != 0 );
4186 }
4187 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4188
4189 }
4190
4191 /*----------------------------------------------------------------------------
4192 | Returns the binary log of the double-precision floating-point value `a'.
4193 | The operation is performed according to the IEC/IEEE Standard for Binary
4194 | Floating-Point Arithmetic.
4195 *----------------------------------------------------------------------------*/
4196 float64 float64_log2( float64 a STATUS_PARAM )
4197 {
4198 flag aSign, zSign;
4199 int_fast16_t aExp;
4200 uint64_t aSig, aSig0, aSig1, zSig, i;
4201 a = float64_squash_input_denormal(a STATUS_VAR);
4202
4203 aSig = extractFloat64Frac( a );
4204 aExp = extractFloat64Exp( a );
4205 aSign = extractFloat64Sign( a );
4206
4207 if ( aExp == 0 ) {
4208 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4209 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4210 }
4211 if ( aSign ) {
4212 float_raise( float_flag_invalid STATUS_VAR);
4213 return float64_default_nan;
4214 }
4215 if ( aExp == 0x7FF ) {
4216 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4217 return a;
4218 }
4219
4220 aExp -= 0x3FF;
4221 aSig |= LIT64( 0x0010000000000000 );
4222 zSign = aExp < 0;
4223 zSig = (uint64_t)aExp << 52;
4224 for (i = 1LL << 51; i > 0; i >>= 1) {
4225 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4226 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4227 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4228 aSig >>= 1;
4229 zSig |= i;
4230 }
4231 }
4232
4233 if ( zSign )
4234 zSig = -zSig;
4235 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4236 }
4237
4238 /*----------------------------------------------------------------------------
4239 | Returns 1 if the double-precision floating-point value `a' is equal to the
4240 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4241 | if either operand is a NaN. Otherwise, the comparison is performed
4242 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4243 *----------------------------------------------------------------------------*/
4244
4245 int float64_eq( float64 a, float64 b STATUS_PARAM )
4246 {
4247 uint64_t av, bv;
4248 a = float64_squash_input_denormal(a STATUS_VAR);
4249 b = float64_squash_input_denormal(b STATUS_VAR);
4250
4251 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4252 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4253 ) {
4254 float_raise( float_flag_invalid STATUS_VAR);
4255 return 0;
4256 }
4257 av = float64_val(a);
4258 bv = float64_val(b);
4259 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4260
4261 }
4262
4263 /*----------------------------------------------------------------------------
4264 | Returns 1 if the double-precision floating-point value `a' is less than or
4265 | equal to the corresponding value `b', and 0 otherwise. The invalid
4266 | exception is raised if either operand is a NaN. The comparison is performed
4267 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4268 *----------------------------------------------------------------------------*/
4269
4270 int float64_le( float64 a, float64 b STATUS_PARAM )
4271 {
4272 flag aSign, bSign;
4273 uint64_t av, bv;
4274 a = float64_squash_input_denormal(a STATUS_VAR);
4275 b = float64_squash_input_denormal(b STATUS_VAR);
4276
4277 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4278 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4279 ) {
4280 float_raise( float_flag_invalid STATUS_VAR);
4281 return 0;
4282 }
4283 aSign = extractFloat64Sign( a );
4284 bSign = extractFloat64Sign( b );
4285 av = float64_val(a);
4286 bv = float64_val(b);
4287 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4288 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4289
4290 }
4291
4292 /*----------------------------------------------------------------------------
4293 | Returns 1 if the double-precision floating-point value `a' is less than
4294 | the corresponding value `b', and 0 otherwise. The invalid exception is
4295 | raised if either operand is a NaN. The comparison is performed according
4296 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4297 *----------------------------------------------------------------------------*/
4298
4299 int float64_lt( float64 a, float64 b STATUS_PARAM )
4300 {
4301 flag aSign, bSign;
4302 uint64_t av, bv;
4303
4304 a = float64_squash_input_denormal(a STATUS_VAR);
4305 b = float64_squash_input_denormal(b STATUS_VAR);
4306 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4307 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4308 ) {
4309 float_raise( float_flag_invalid STATUS_VAR);
4310 return 0;
4311 }
4312 aSign = extractFloat64Sign( a );
4313 bSign = extractFloat64Sign( b );
4314 av = float64_val(a);
4315 bv = float64_val(b);
4316 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4317 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4318
4319 }
4320
4321 /*----------------------------------------------------------------------------
4322 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4323 | be compared, and 0 otherwise. The invalid exception is raised if either
4324 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4325 | Standard for Binary Floating-Point Arithmetic.
4326 *----------------------------------------------------------------------------*/
4327
4328 int float64_unordered( float64 a, float64 b STATUS_PARAM )
4329 {
4330 a = float64_squash_input_denormal(a STATUS_VAR);
4331 b = float64_squash_input_denormal(b STATUS_VAR);
4332
4333 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4334 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4335 ) {
4336 float_raise( float_flag_invalid STATUS_VAR);
4337 return 1;
4338 }
4339 return 0;
4340 }
4341
4342 /*----------------------------------------------------------------------------
4343 | Returns 1 if the double-precision floating-point value `a' is equal to the
4344 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4345 | exception.The comparison is performed according to the IEC/IEEE Standard
4346 | for Binary Floating-Point Arithmetic.
4347 *----------------------------------------------------------------------------*/
4348
4349 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4350 {
4351 uint64_t av, bv;
4352 a = float64_squash_input_denormal(a STATUS_VAR);
4353 b = float64_squash_input_denormal(b STATUS_VAR);
4354
4355 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4356 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4357 ) {
4358 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4359 float_raise( float_flag_invalid STATUS_VAR);
4360 }
4361 return 0;
4362 }
4363 av = float64_val(a);
4364 bv = float64_val(b);
4365 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4366
4367 }
4368
4369 /*----------------------------------------------------------------------------
4370 | Returns 1 if the double-precision floating-point value `a' is less than or
4371 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4372 | cause an exception. Otherwise, the comparison is performed according to the
4373 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4374 *----------------------------------------------------------------------------*/
4375
4376 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4377 {
4378 flag aSign, bSign;
4379 uint64_t av, bv;
4380 a = float64_squash_input_denormal(a STATUS_VAR);
4381 b = float64_squash_input_denormal(b STATUS_VAR);
4382
4383 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4384 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4385 ) {
4386 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4387 float_raise( float_flag_invalid STATUS_VAR);
4388 }
4389 return 0;
4390 }
4391 aSign = extractFloat64Sign( a );
4392 bSign = extractFloat64Sign( b );
4393 av = float64_val(a);
4394 bv = float64_val(b);
4395 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4396 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4397
4398 }
4399
4400 /*----------------------------------------------------------------------------
4401 | Returns 1 if the double-precision floating-point value `a' is less than
4402 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4403 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4404 | Standard for Binary Floating-Point Arithmetic.
4405 *----------------------------------------------------------------------------*/
4406
4407 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4408 {
4409 flag aSign, bSign;
4410 uint64_t av, bv;
4411 a = float64_squash_input_denormal(a STATUS_VAR);
4412 b = float64_squash_input_denormal(b STATUS_VAR);
4413
4414 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4415 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4416 ) {
4417 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4418 float_raise( float_flag_invalid STATUS_VAR);
4419 }
4420 return 0;
4421 }
4422 aSign = extractFloat64Sign( a );
4423 bSign = extractFloat64Sign( b );
4424 av = float64_val(a);
4425 bv = float64_val(b);
4426 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4427 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4428
4429 }
4430
4431 /*----------------------------------------------------------------------------
4432 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4433 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4434 | comparison is performed according to the IEC/IEEE Standard for Binary
4435 | Floating-Point Arithmetic.
4436 *----------------------------------------------------------------------------*/
4437
4438 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4439 {
4440 a = float64_squash_input_denormal(a STATUS_VAR);
4441 b = float64_squash_input_denormal(b STATUS_VAR);
4442
4443 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4444 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4445 ) {
4446 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4447 float_raise( float_flag_invalid STATUS_VAR);
4448 }
4449 return 1;
4450 }
4451 return 0;
4452 }
4453
4454 /*----------------------------------------------------------------------------
4455 | Returns the result of converting the extended double-precision floating-
4456 | point value `a' to the 32-bit two's complement integer format. The
4457 | conversion is performed according to the IEC/IEEE Standard for Binary
4458 | Floating-Point Arithmetic---which means in particular that the conversion
4459 | is rounded according to the current rounding mode. If `a' is a NaN, the
4460 | largest positive integer is returned. Otherwise, if the conversion
4461 | overflows, the largest integer with the same sign as `a' is returned.
4462 *----------------------------------------------------------------------------*/
4463
4464 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4465 {
4466 flag aSign;
4467 int32 aExp, shiftCount;
4468 uint64_t aSig;
4469
4470 aSig = extractFloatx80Frac( a );
4471 aExp = extractFloatx80Exp( a );
4472 aSign = extractFloatx80Sign( a );
4473 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4474 shiftCount = 0x4037 - aExp;
4475 if ( shiftCount <= 0 ) shiftCount = 1;
4476 shift64RightJamming( aSig, shiftCount, &aSig );
4477 return roundAndPackInt32( aSign, aSig STATUS_VAR );
4478
4479 }
4480
4481 /*----------------------------------------------------------------------------
4482 | Returns the result of converting the extended double-precision floating-
4483 | point value `a' to the 32-bit two's complement integer format. The
4484 | conversion is performed according to the IEC/IEEE Standard for Binary
4485 | Floating-Point Arithmetic, except that the conversion is always rounded
4486 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4487 | Otherwise, if the conversion overflows, the largest integer with the same
4488 | sign as `a' is returned.
4489 *----------------------------------------------------------------------------*/
4490
4491 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4492 {
4493 flag aSign;
4494 int32 aExp, shiftCount;
4495 uint64_t aSig, savedASig;
4496 int32_t z;
4497
4498 aSig = extractFloatx80Frac( a );
4499 aExp = extractFloatx80Exp( a );
4500 aSign = extractFloatx80Sign( a );
4501 if ( 0x401E < aExp ) {
4502 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4503 goto invalid;
4504 }
4505 else if ( aExp < 0x3FFF ) {
4506 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4507 return 0;
4508 }
4509 shiftCount = 0x403E - aExp;
4510 savedASig = aSig;
4511 aSig >>= shiftCount;
4512 z = aSig;
4513 if ( aSign ) z = - z;
4514 if ( ( z < 0 ) ^ aSign ) {
4515 invalid:
4516 float_raise( float_flag_invalid STATUS_VAR);
4517 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4518 }
4519 if ( ( aSig<<shiftCount ) != savedASig ) {
4520 STATUS(float_exception_flags) |= float_flag_inexact;
4521 }
4522 return z;
4523
4524 }
4525
4526 /*----------------------------------------------------------------------------
4527 | Returns the result of converting the extended double-precision floating-
4528 | point value `a' to the 64-bit two's complement integer format. The
4529 | conversion is performed according to the IEC/IEEE Standard for Binary
4530 | Floating-Point Arithmetic---which means in particular that the conversion
4531 | is rounded according to the current rounding mode. If `a' is a NaN,
4532 | the largest positive integer is returned. Otherwise, if the conversion
4533 | overflows, the largest integer with the same sign as `a' is returned.
4534 *----------------------------------------------------------------------------*/
4535
4536 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4537 {
4538 flag aSign;
4539 int32 aExp, shiftCount;
4540 uint64_t aSig, aSigExtra;
4541
4542 aSig = extractFloatx80Frac( a );
4543 aExp = extractFloatx80Exp( a );
4544 aSign = extractFloatx80Sign( a );
4545 shiftCount = 0x403E - aExp;
4546 if ( shiftCount <= 0 ) {
4547 if ( shiftCount ) {
4548 float_raise( float_flag_invalid STATUS_VAR);
4549 if ( ! aSign
4550 || ( ( aExp == 0x7FFF )
4551 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4552 ) {
4553 return LIT64( 0x7FFFFFFFFFFFFFFF );
4554 }
4555 return (int64_t) LIT64( 0x8000000000000000 );
4556 }
4557 aSigExtra = 0;
4558 }
4559 else {
4560 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4561 }
4562 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4563
4564 }
4565
4566 /*----------------------------------------------------------------------------
4567 | Returns the result of converting the extended double-precision floating-
4568 | point value `a' to the 64-bit two's complement integer format. The
4569 | conversion is performed according to the IEC/IEEE Standard for Binary
4570 | Floating-Point Arithmetic, except that the conversion is always rounded
4571 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4572 | Otherwise, if the conversion overflows, the largest integer with the same
4573 | sign as `a' is returned.
4574 *----------------------------------------------------------------------------*/
4575
4576 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4577 {
4578 flag aSign;
4579 int32 aExp, shiftCount;
4580 uint64_t aSig;
4581 int64 z;
4582
4583 aSig = extractFloatx80Frac( a );
4584 aExp = extractFloatx80Exp( a );
4585 aSign = extractFloatx80Sign( a );
4586 shiftCount = aExp - 0x403E;
4587 if ( 0 <= shiftCount ) {
4588 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4589 if ( ( a.high != 0xC03E ) || aSig ) {
4590 float_raise( float_flag_invalid STATUS_VAR);
4591 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4592 return LIT64( 0x7FFFFFFFFFFFFFFF );
4593 }
4594 }
4595 return (int64_t) LIT64( 0x8000000000000000 );
4596 }
4597 else if ( aExp < 0x3FFF ) {
4598 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4599 return 0;
4600 }
4601 z = aSig>>( - shiftCount );
4602 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4603 STATUS(float_exception_flags) |= float_flag_inexact;
4604 }
4605 if ( aSign ) z = - z;
4606 return z;
4607
4608 }
4609
4610 /*----------------------------------------------------------------------------
4611 | Returns the result of converting the extended double-precision floating-
4612 | point value `a' to the single-precision floating-point format. The
4613 | conversion is performed according to the IEC/IEEE Standard for Binary
4614 | Floating-Point Arithmetic.
4615 *----------------------------------------------------------------------------*/
4616
4617 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4618 {
4619 flag aSign;
4620 int32 aExp;
4621 uint64_t aSig;
4622
4623 aSig = extractFloatx80Frac( a );
4624 aExp = extractFloatx80Exp( a );
4625 aSign = extractFloatx80Sign( a );
4626 if ( aExp == 0x7FFF ) {
4627 if ( (uint64_t) ( aSig<<1 ) ) {
4628 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4629 }
4630 return packFloat32( aSign, 0xFF, 0 );
4631 }
4632 shift64RightJamming( aSig, 33, &aSig );
4633 if ( aExp || aSig ) aExp -= 0x3F81;
4634 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4635
4636 }
4637
4638 /*----------------------------------------------------------------------------
4639 | Returns the result of converting the extended double-precision floating-
4640 | point value `a' to the double-precision floating-point format. The
4641 | conversion is performed according to the IEC/IEEE Standard for Binary
4642 | Floating-Point Arithmetic.
4643 *----------------------------------------------------------------------------*/
4644
4645 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4646 {
4647 flag aSign;
4648 int32 aExp;
4649 uint64_t aSig, zSig;
4650
4651 aSig = extractFloatx80Frac( a );
4652 aExp = extractFloatx80Exp( a );
4653 aSign = extractFloatx80Sign( a );
4654 if ( aExp == 0x7FFF ) {
4655 if ( (uint64_t) ( aSig<<1 ) ) {
4656 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4657 }
4658 return packFloat64( aSign, 0x7FF, 0 );
4659 }
4660 shift64RightJamming( aSig, 1, &zSig );
4661 if ( aExp || aSig ) aExp -= 0x3C01;
4662 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4663
4664 }
4665
4666 /*----------------------------------------------------------------------------
4667 | Returns the result of converting the extended double-precision floating-
4668 | point value `a' to the quadruple-precision floating-point format. The
4669 | conversion is performed according to the IEC/IEEE Standard for Binary
4670 | Floating-Point Arithmetic.
4671 *----------------------------------------------------------------------------*/
4672
4673 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4674 {
4675 flag aSign;
4676 int_fast16_t aExp;
4677 uint64_t aSig, zSig0, zSig1;
4678
4679 aSig = extractFloatx80Frac( a );
4680 aExp = extractFloatx80Exp( a );
4681 aSign = extractFloatx80Sign( a );
4682 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4683 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4684 }
4685 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4686 return packFloat128( aSign, aExp, zSig0, zSig1 );
4687
4688 }
4689
4690 /*----------------------------------------------------------------------------
4691 | Rounds the extended double-precision floating-point value `a' to an integer,
4692 | and returns the result as an extended quadruple-precision floating-point
4693 | value. The operation is performed according to the IEC/IEEE Standard for
4694 | Binary Floating-Point Arithmetic.
4695 *----------------------------------------------------------------------------*/
4696
4697 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4698 {
4699 flag aSign;
4700 int32 aExp;
4701 uint64_t lastBitMask, roundBitsMask;
4702 int8 roundingMode;
4703 floatx80 z;
4704
4705 aExp = extractFloatx80Exp( a );
4706 if ( 0x403E <= aExp ) {
4707 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4708 return propagateFloatx80NaN( a, a STATUS_VAR );
4709 }
4710 return a;
4711 }
4712 if ( aExp < 0x3FFF ) {
4713 if ( ( aExp == 0 )
4714 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4715 return a;
4716 }
4717 STATUS(float_exception_flags) |= float_flag_inexact;
4718 aSign = extractFloatx80Sign( a );
4719 switch ( STATUS(float_rounding_mode) ) {
4720 case float_round_nearest_even:
4721 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4722 ) {
4723 return
4724 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4725 }
4726 break;
4727 case float_round_down:
4728 return
4729 aSign ?
4730 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4731 : packFloatx80( 0, 0, 0 );
4732 case float_round_up:
4733 return
4734 aSign ? packFloatx80( 1, 0, 0 )
4735 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4736 }
4737 return packFloatx80( aSign, 0, 0 );
4738 }
4739 lastBitMask = 1;
4740 lastBitMask <<= 0x403E - aExp;
4741 roundBitsMask = lastBitMask - 1;
4742 z = a;
4743 roundingMode = STATUS(float_rounding_mode);
4744 if ( roundingMode == float_round_nearest_even ) {
4745 z.low += lastBitMask>>1;
4746 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4747 }
4748 else if ( roundingMode != float_round_to_zero ) {
4749 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4750 z.low += roundBitsMask;
4751 }
4752 }
4753 z.low &= ~ roundBitsMask;
4754 if ( z.low == 0 ) {
4755 ++z.high;
4756 z.low = LIT64( 0x8000000000000000 );
4757 }
4758 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4759 return z;
4760
4761 }
4762
4763 /*----------------------------------------------------------------------------
4764 | Returns the result of adding the absolute values of the extended double-
4765 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4766 | negated before being returned. `zSign' is ignored if the result is a NaN.
4767 | The addition is performed according to the IEC/IEEE Standard for Binary
4768 | Floating-Point Arithmetic.
4769 *----------------------------------------------------------------------------*/
4770
4771 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4772 {
4773 int32 aExp, bExp, zExp;
4774 uint64_t aSig, bSig, zSig0, zSig1;
4775 int32 expDiff;
4776
4777 aSig = extractFloatx80Frac( a );
4778 aExp = extractFloatx80Exp( a );
4779 bSig = extractFloatx80Frac( b );
4780 bExp = extractFloatx80Exp( b );
4781 expDiff = aExp - bExp;
4782 if ( 0 < expDiff ) {
4783 if ( aExp == 0x7FFF ) {
4784 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4785 return a;
4786 }
4787 if ( bExp == 0 ) --expDiff;
4788 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4789 zExp = aExp;
4790 }
4791 else if ( expDiff < 0 ) {
4792 if ( bExp == 0x7FFF ) {
4793 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4794 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4795 }
4796 if ( aExp == 0 ) ++expDiff;
4797 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4798 zExp = bExp;
4799 }
4800 else {
4801 if ( aExp == 0x7FFF ) {
4802 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4803 return propagateFloatx80NaN( a, b STATUS_VAR );
4804 }
4805 return a;
4806 }
4807 zSig1 = 0;
4808 zSig0 = aSig + bSig;
4809 if ( aExp == 0 ) {
4810 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4811 goto roundAndPack;
4812 }
4813 zExp = aExp;
4814 goto shiftRight1;
4815 }
4816 zSig0 = aSig + bSig;
4817 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4818 shiftRight1:
4819 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4820 zSig0 |= LIT64( 0x8000000000000000 );
4821 ++zExp;
4822 roundAndPack:
4823 return
4824 roundAndPackFloatx80(
4825 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4826
4827 }
4828
4829 /*----------------------------------------------------------------------------
4830 | Returns the result of subtracting the absolute values of the extended
4831 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4832 | difference is negated before being returned. `zSign' is ignored if the
4833 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4834 | Standard for Binary Floating-Point Arithmetic.
4835 *----------------------------------------------------------------------------*/
4836
4837 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4838 {
4839 int32 aExp, bExp, zExp;
4840 uint64_t aSig, bSig, zSig0, zSig1;
4841 int32 expDiff;
4842 floatx80 z;
4843
4844 aSig = extractFloatx80Frac( a );
4845 aExp = extractFloatx80Exp( a );
4846 bSig = extractFloatx80Frac( b );
4847 bExp = extractFloatx80Exp( b );
4848 expDiff = aExp - bExp;
4849 if ( 0 < expDiff ) goto aExpBigger;
4850 if ( expDiff < 0 ) goto bExpBigger;
4851 if ( aExp == 0x7FFF ) {
4852 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4853 return propagateFloatx80NaN( a, b STATUS_VAR );
4854 }
4855 float_raise( float_flag_invalid STATUS_VAR);
4856 z.low = floatx80_default_nan_low;
4857 z.high = floatx80_default_nan_high;
4858 return z;
4859 }
4860 if ( aExp == 0 ) {
4861 aExp = 1;
4862 bExp = 1;
4863 }
4864 zSig1 = 0;
4865 if ( bSig < aSig ) goto aBigger;
4866 if ( aSig < bSig ) goto bBigger;
4867 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4868 bExpBigger:
4869 if ( bExp == 0x7FFF ) {
4870 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4871 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4872 }
4873 if ( aExp == 0 ) ++expDiff;
4874 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4875 bBigger:
4876 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4877 zExp = bExp;
4878 zSign ^= 1;
4879 goto normalizeRoundAndPack;
4880 aExpBigger:
4881 if ( aExp == 0x7FFF ) {
4882 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4883 return a;
4884 }
4885 if ( bExp == 0 ) --expDiff;
4886 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4887 aBigger:
4888 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4889 zExp = aExp;
4890 normalizeRoundAndPack:
4891 return
4892 normalizeRoundAndPackFloatx80(
4893 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4894
4895 }
4896
4897 /*----------------------------------------------------------------------------
4898 | Returns the result of adding the extended double-precision floating-point
4899 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4900 | Standard for Binary Floating-Point Arithmetic.
4901 *----------------------------------------------------------------------------*/
4902
4903 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4904 {
4905 flag aSign, bSign;
4906
4907 aSign = extractFloatx80Sign( a );
4908 bSign = extractFloatx80Sign( b );
4909 if ( aSign == bSign ) {
4910 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4911 }
4912 else {
4913 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4914 }
4915
4916 }
4917
4918 /*----------------------------------------------------------------------------
4919 | Returns the result of subtracting the extended double-precision floating-
4920 | point values `a' and `b'. The operation is performed according to the
4921 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4922 *----------------------------------------------------------------------------*/
4923
4924 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4925 {
4926 flag aSign, bSign;
4927
4928 aSign = extractFloatx80Sign( a );
4929 bSign = extractFloatx80Sign( b );
4930 if ( aSign == bSign ) {
4931 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4932 }
4933 else {
4934 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4935 }
4936
4937 }
4938
4939 /*----------------------------------------------------------------------------
4940 | Returns the result of multiplying the extended double-precision floating-
4941 | point values `a' and `b'. The operation is performed according to the
4942 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4943 *----------------------------------------------------------------------------*/
4944
4945 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4946 {
4947 flag aSign, bSign, zSign;
4948 int32 aExp, bExp, zExp;
4949 uint64_t aSig, bSig, zSig0, zSig1;
4950 floatx80 z;
4951
4952 aSig = extractFloatx80Frac( a );
4953 aExp = extractFloatx80Exp( a );
4954 aSign = extractFloatx80Sign( a );
4955 bSig = extractFloatx80Frac( b );
4956 bExp = extractFloatx80Exp( b );
4957 bSign = extractFloatx80Sign( b );
4958 zSign = aSign ^ bSign;
4959 if ( aExp == 0x7FFF ) {
4960 if ( (uint64_t) ( aSig<<1 )
4961 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4962 return propagateFloatx80NaN( a, b STATUS_VAR );
4963 }
4964 if ( ( bExp | bSig ) == 0 ) goto invalid;
4965 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4966 }
4967 if ( bExp == 0x7FFF ) {
4968 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4969 if ( ( aExp | aSig ) == 0 ) {
4970 invalid:
4971 float_raise( float_flag_invalid STATUS_VAR);
4972 z.low = floatx80_default_nan_low;
4973 z.high = floatx80_default_nan_high;
4974 return z;
4975 }
4976 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4977 }
4978 if ( aExp == 0 ) {
4979 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4980 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4981 }
4982 if ( bExp == 0 ) {
4983 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4984 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4985 }
4986 zExp = aExp + bExp - 0x3FFE;
4987 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4988 if ( 0 < (int64_t) zSig0 ) {
4989 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4990 --zExp;
4991 }
4992 return
4993 roundAndPackFloatx80(
4994 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4995
4996 }
4997
4998 /*----------------------------------------------------------------------------
4999 | Returns the result of dividing the extended double-precision floating-point
5000 | value `a' by the corresponding value `b'. The operation is performed
5001 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5002 *----------------------------------------------------------------------------*/
5003
5004 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5005 {
5006 flag aSign, bSign, zSign;
5007 int32 aExp, bExp, zExp;
5008 uint64_t aSig, bSig, zSig0, zSig1;
5009 uint64_t rem0, rem1, rem2, term0, term1, term2;
5010 floatx80 z;
5011
5012 aSig = extractFloatx80Frac( a );
5013 aExp = extractFloatx80Exp( a );
5014 aSign = extractFloatx80Sign( a );
5015 bSig = extractFloatx80Frac( b );
5016 bExp = extractFloatx80Exp( b );
5017 bSign = extractFloatx80Sign( b );
5018 zSign = aSign ^ bSign;
5019 if ( aExp == 0x7FFF ) {
5020 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5021 if ( bExp == 0x7FFF ) {
5022 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5023 goto invalid;
5024 }
5025 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5026 }
5027 if ( bExp == 0x7FFF ) {
5028 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5029 return packFloatx80( zSign, 0, 0 );
5030 }
5031 if ( bExp == 0 ) {
5032 if ( bSig == 0 ) {
5033 if ( ( aExp | aSig ) == 0 ) {
5034 invalid:
5035 float_raise( float_flag_invalid STATUS_VAR);
5036 z.low = floatx80_default_nan_low;
5037 z.high = floatx80_default_nan_high;
5038 return z;
5039 }
5040 float_raise( float_flag_divbyzero STATUS_VAR);
5041 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5042 }
5043 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5044 }
5045 if ( aExp == 0 ) {
5046 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5047 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5048 }
5049 zExp = aExp - bExp + 0x3FFE;
5050 rem1 = 0;
5051 if ( bSig <= aSig ) {
5052 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5053 ++zExp;
5054 }
5055 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5056 mul64To128( bSig, zSig0, &term0, &term1 );
5057 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5058 while ( (int64_t) rem0 < 0 ) {
5059 --zSig0;
5060 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5061 }
5062 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5063 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5064 mul64To128( bSig, zSig1, &term1, &term2 );
5065 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5066 while ( (int64_t) rem1 < 0 ) {
5067 --zSig1;
5068 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5069 }
5070 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5071 }
5072 return
5073 roundAndPackFloatx80(
5074 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5075
5076 }
5077
5078 /*----------------------------------------------------------------------------
5079 | Returns the remainder of the extended double-precision floating-point value
5080 | `a' with respect to the corresponding value `b'. The operation is performed
5081 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5082 *----------------------------------------------------------------------------*/
5083
5084 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5085 {
5086 flag aSign, zSign;
5087 int32 aExp, bExp, expDiff;
5088 uint64_t aSig0, aSig1, bSig;
5089 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5090 floatx80 z;
5091
5092 aSig0 = extractFloatx80Frac( a );
5093 aExp = extractFloatx80Exp( a );
5094 aSign = extractFloatx80Sign( a );
5095 bSig = extractFloatx80Frac( b );
5096 bExp = extractFloatx80Exp( b );
5097 if ( aExp == 0x7FFF ) {
5098 if ( (uint64_t) ( aSig0<<1 )
5099 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5100 return propagateFloatx80NaN( a, b STATUS_VAR );
5101 }
5102 goto invalid;
5103 }
5104 if ( bExp == 0x7FFF ) {
5105 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5106 return a;
5107 }
5108 if ( bExp == 0 ) {
5109 if ( bSig == 0 ) {
5110 invalid:
5111 float_raise( float_flag_invalid STATUS_VAR);
5112 z.low = floatx80_default_nan_low;
5113 z.high = floatx80_default_nan_high;
5114 return z;
5115 }
5116 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5117 }
5118 if ( aExp == 0 ) {
5119 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5120 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5121 }
5122 bSig |= LIT64( 0x8000000000000000 );
5123 zSign = aSign;
5124 expDiff = aExp - bExp;
5125 aSig1 = 0;
5126 if ( expDiff < 0 ) {
5127 if ( expDiff < -1 ) return a;
5128 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5129 expDiff = 0;
5130 }
5131 q = ( bSig <= aSig0 );
5132 if ( q ) aSig0 -= bSig;
5133 expDiff -= 64;
5134 while ( 0 < expDiff ) {
5135 q = estimateDiv128To64( aSig0, aSig1, bSig );
5136 q = ( 2 < q ) ? q - 2 : 0;
5137 mul64To128( bSig, q, &term0, &term1 );
5138 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5139 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5140 expDiff -= 62;
5141 }
5142 expDiff += 64;
5143 if ( 0 < expDiff ) {
5144 q = estimateDiv128To64( aSig0, aSig1, bSig );
5145 q = ( 2 < q ) ? q - 2 : 0;
5146 q >>= 64 - expDiff;
5147 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5148 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5149 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5150 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5151 ++q;
5152 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5153 }
5154 }
5155 else {
5156 term1 = 0;
5157 term0 = bSig;
5158 }
5159 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5160 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5161 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5162 && ( q & 1 ) )
5163 ) {
5164 aSig0 = alternateASig0;
5165 aSig1 = alternateASig1;
5166 zSign = ! zSign;
5167 }
5168 return
5169 normalizeRoundAndPackFloatx80(
5170 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5171
5172 }
5173
5174 /*----------------------------------------------------------------------------
5175 | Returns the square root of the extended double-precision floating-point
5176 | value `a'. The operation is performed according to the IEC/IEEE Standard
5177 | for Binary Floating-Point Arithmetic.
5178 *----------------------------------------------------------------------------*/
5179
5180 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5181 {
5182 flag aSign;
5183 int32 aExp, zExp;
5184 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5185 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5186 floatx80 z;
5187
5188 aSig0 = extractFloatx80Frac( a );
5189 aExp = extractFloatx80Exp( a );
5190 aSign = extractFloatx80Sign( a );
5191 if ( aExp == 0x7FFF ) {
5192 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5193 if ( ! aSign ) return a;
5194 goto invalid;
5195 }
5196 if ( aSign ) {
5197 if ( ( aExp | aSig0 ) == 0 ) return a;
5198 invalid:
5199 float_raise( float_flag_invalid STATUS_VAR);
5200 z.low = floatx80_default_nan_low;
5201 z.high = floatx80_default_nan_high;
5202 return z;
5203 }
5204 if ( aExp == 0 ) {
5205 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5206 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5207 }
5208 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5209 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5210 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5211 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5212 doubleZSig0 = zSig0<<1;
5213 mul64To128( zSig0, zSig0, &term0, &term1 );
5214 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5215 while ( (int64_t) rem0 < 0 ) {
5216 --zSig0;
5217 doubleZSig0 -= 2;
5218 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5219 }
5220 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5221 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5222 if ( zSig1 == 0 ) zSig1 = 1;
5223 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5224 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5225 mul64To128( zSig1, zSig1, &term2, &term3 );
5226 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5227 while ( (int64_t) rem1 < 0 ) {
5228 --zSig1;
5229 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5230 term3 |= 1;
5231 term2 |= doubleZSig0;
5232 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5233 }
5234 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5235 }
5236 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5237 zSig0 |= doubleZSig0;
5238 return
5239 roundAndPackFloatx80(
5240 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5241
5242 }
5243
5244 /*----------------------------------------------------------------------------
5245 | Returns 1 if the extended double-precision floating-point value `a' is equal
5246 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5247 | raised if either operand is a NaN. Otherwise, the comparison is performed
5248 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5249 *----------------------------------------------------------------------------*/
5250
5251 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5252 {
5253
5254 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5255 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5256 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5257 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5258 ) {
5259 float_raise( float_flag_invalid STATUS_VAR);
5260 return 0;
5261 }
5262 return
5263 ( a.low == b.low )
5264 && ( ( a.high == b.high )
5265 || ( ( a.low == 0 )
5266 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5267 );
5268
5269 }
5270
5271 /*----------------------------------------------------------------------------
5272 | Returns 1 if the extended double-precision floating-point value `a' is
5273 | less than or equal to the corresponding value `b', and 0 otherwise. The
5274 | invalid exception is raised if either operand is a NaN. The comparison is
5275 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5276 | Arithmetic.
5277 *----------------------------------------------------------------------------*/
5278
5279 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5280 {
5281 flag aSign, bSign;
5282
5283 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5284 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5285 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5286 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5287 ) {
5288 float_raise( float_flag_invalid STATUS_VAR);
5289 return 0;
5290 }
5291 aSign = extractFloatx80Sign( a );
5292 bSign = extractFloatx80Sign( b );
5293 if ( aSign != bSign ) {
5294 return
5295 aSign
5296 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5297 == 0 );
5298 }
5299 return
5300 aSign ? le128( b.high, b.low, a.high, a.low )
5301 : le128( a.high, a.low, b.high, b.low );
5302
5303 }
5304
5305 /*----------------------------------------------------------------------------
5306 | Returns 1 if the extended double-precision floating-point value `a' is
5307 | less than the corresponding value `b', and 0 otherwise. The invalid
5308 | exception is raised if either operand is a NaN. The comparison is performed
5309 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5310 *----------------------------------------------------------------------------*/
5311
5312 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5313 {
5314 flag aSign, bSign;
5315
5316 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5317 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5318 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5319 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5320 ) {
5321 float_raise( float_flag_invalid STATUS_VAR);
5322 return 0;
5323 }
5324 aSign = extractFloatx80Sign( a );
5325 bSign = extractFloatx80Sign( b );
5326 if ( aSign != bSign ) {
5327 return
5328 aSign
5329 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5330 != 0 );
5331 }
5332 return
5333 aSign ? lt128( b.high, b.low, a.high, a.low )
5334 : lt128( a.high, a.low, b.high, b.low );
5335
5336 }
5337
5338 /*----------------------------------------------------------------------------
5339 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5340 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5341 | either operand is a NaN. The comparison is performed according to the
5342 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5343 *----------------------------------------------------------------------------*/
5344 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5345 {
5346 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5347 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5348 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5349 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5350 ) {
5351 float_raise( float_flag_invalid STATUS_VAR);
5352 return 1;
5353 }
5354 return 0;
5355 }
5356
5357 /*----------------------------------------------------------------------------
5358 | Returns 1 if the extended double-precision floating-point value `a' is
5359 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5360 | cause an exception. The comparison is performed according to the IEC/IEEE
5361 | Standard for Binary Floating-Point Arithmetic.
5362 *----------------------------------------------------------------------------*/
5363
5364 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5365 {
5366
5367 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5368 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5369 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5370 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5371 ) {
5372 if ( floatx80_is_signaling_nan( a )
5373 || floatx80_is_signaling_nan( b ) ) {
5374 float_raise( float_flag_invalid STATUS_VAR);
5375 }
5376 return 0;
5377 }
5378 return
5379 ( a.low == b.low )
5380 && ( ( a.high == b.high )
5381 || ( ( a.low == 0 )
5382 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5383 );
5384
5385 }
5386
5387 /*----------------------------------------------------------------------------
5388 | Returns 1 if the extended double-precision floating-point value `a' is less
5389 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5390 | do not cause an exception. Otherwise, the comparison is performed according
5391 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5392 *----------------------------------------------------------------------------*/
5393
5394 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5395 {
5396 flag aSign, bSign;
5397
5398 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5399 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5400 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5401 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5402 ) {
5403 if ( floatx80_is_signaling_nan( a )
5404 || floatx80_is_signaling_nan( b ) ) {
5405 float_raise( float_flag_invalid STATUS_VAR);
5406 }
5407 return 0;
5408 }
5409 aSign = extractFloatx80Sign( a );
5410 bSign = extractFloatx80Sign( b );
5411 if ( aSign != bSign ) {
5412 return
5413 aSign
5414 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5415 == 0 );
5416 }
5417 return
5418 aSign ? le128( b.high, b.low, a.high, a.low )
5419 : le128( a.high, a.low, b.high, b.low );
5420
5421 }
5422
5423 /*----------------------------------------------------------------------------
5424 | Returns 1 if the extended double-precision floating-point value `a' is less
5425 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5426 | an exception. Otherwise, the comparison is performed according to the
5427 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5428 *----------------------------------------------------------------------------*/
5429
5430 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5431 {
5432 flag aSign, bSign;
5433
5434 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5435 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5436 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5437 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5438 ) {
5439 if ( floatx80_is_signaling_nan( a )
5440 || floatx80_is_signaling_nan( b ) ) {
5441 float_raise( float_flag_invalid STATUS_VAR);
5442 }
5443 return 0;
5444 }
5445 aSign = extractFloatx80Sign( a );
5446 bSign = extractFloatx80Sign( b );
5447 if ( aSign != bSign ) {
5448 return
5449 aSign
5450 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5451 != 0 );
5452 }
5453 return
5454 aSign ? lt128( b.high, b.low, a.high, a.low )
5455 : lt128( a.high, a.low, b.high, b.low );
5456
5457 }
5458
5459 /*----------------------------------------------------------------------------
5460 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5461 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5462 | The comparison is performed according to the IEC/IEEE Standard for Binary
5463 | Floating-Point Arithmetic.
5464 *----------------------------------------------------------------------------*/
5465 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5466 {
5467 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5468 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5469 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5470 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5471 ) {
5472 if ( floatx80_is_signaling_nan( a )
5473 || floatx80_is_signaling_nan( b ) ) {
5474 float_raise( float_flag_invalid STATUS_VAR);
5475 }
5476 return 1;
5477 }
5478 return 0;
5479 }
5480
5481 /*----------------------------------------------------------------------------
5482 | Returns the result of converting the quadruple-precision floating-point
5483 | value `a' to the 32-bit two's complement integer format. The conversion
5484 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5485 | Arithmetic---which means in particular that the conversion is rounded
5486 | according to the current rounding mode. If `a' is a NaN, the largest
5487 | positive integer is returned. Otherwise, if the conversion overflows, the
5488 | largest integer with the same sign as `a' is returned.
5489 *----------------------------------------------------------------------------*/
5490
5491 int32 float128_to_int32( float128 a STATUS_PARAM )
5492 {
5493 flag aSign;
5494 int32 aExp, shiftCount;
5495 uint64_t aSig0, aSig1;
5496
5497 aSig1 = extractFloat128Frac1( a );
5498 aSig0 = extractFloat128Frac0( a );
5499 aExp = extractFloat128Exp( a );
5500 aSign = extractFloat128Sign( a );
5501 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5502 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5503 aSig0 |= ( aSig1 != 0 );
5504 shiftCount = 0x4028 - aExp;
5505 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5506 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5507
5508 }
5509
5510 /*----------------------------------------------------------------------------
5511 | Returns the result of converting the quadruple-precision floating-point
5512 | value `a' to the 32-bit two's complement integer format. The conversion
5513 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5514 | Arithmetic, except that the conversion is always rounded toward zero. If
5515 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5516 | conversion overflows, the largest integer with the same sign as `a' is
5517 | returned.
5518 *----------------------------------------------------------------------------*/
5519
5520 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5521 {
5522 flag aSign;
5523 int32 aExp, shiftCount;
5524 uint64_t aSig0, aSig1, savedASig;
5525 int32_t z;
5526
5527 aSig1 = extractFloat128Frac1( a );
5528 aSig0 = extractFloat128Frac0( a );
5529 aExp = extractFloat128Exp( a );
5530 aSign = extractFloat128Sign( a );
5531 aSig0 |= ( aSig1 != 0 );
5532 if ( 0x401E < aExp ) {
5533 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5534 goto invalid;
5535 }
5536 else if ( aExp < 0x3FFF ) {
5537 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5538 return 0;
5539 }
5540 aSig0 |= LIT64( 0x0001000000000000 );
5541 shiftCount = 0x402F - aExp;
5542 savedASig = aSig0;
5543 aSig0 >>= shiftCount;
5544 z = aSig0;
5545 if ( aSign ) z = - z;
5546 if ( ( z < 0 ) ^ aSign ) {
5547 invalid:
5548 float_raise( float_flag_invalid STATUS_VAR);
5549 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5550 }
5551 if ( ( aSig0<<shiftCount ) != savedASig ) {
5552 STATUS(float_exception_flags) |= float_flag_inexact;
5553 }
5554 return z;
5555
5556 }
5557
5558 /*----------------------------------------------------------------------------
5559 | Returns the result of converting the quadruple-precision floating-point
5560 | value `a' to the 64-bit two's complement integer format. The conversion
5561 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5562 | Arithmetic---which means in particular that the conversion is rounded
5563 | according to the current rounding mode. If `a' is a NaN, the largest
5564 | positive integer is returned. Otherwise, if the conversion overflows, the
5565 | largest integer with the same sign as `a' is returned.
5566 *----------------------------------------------------------------------------*/
5567
5568 int64 float128_to_int64( float128 a STATUS_PARAM )
5569 {
5570 flag aSign;
5571 int32 aExp, shiftCount;
5572 uint64_t aSig0, aSig1;
5573
5574 aSig1 = extractFloat128Frac1( a );
5575 aSig0 = extractFloat128Frac0( a );
5576 aExp = extractFloat128Exp( a );
5577 aSign = extractFloat128Sign( a );
5578 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5579 shiftCount = 0x402F - aExp;
5580 if ( shiftCount <= 0 ) {
5581 if ( 0x403E < aExp ) {
5582 float_raise( float_flag_invalid STATUS_VAR);
5583 if ( ! aSign
5584 || ( ( aExp == 0x7FFF )
5585 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5586 )
5587 ) {
5588 return LIT64( 0x7FFFFFFFFFFFFFFF );
5589 }
5590 return (int64_t) LIT64( 0x8000000000000000 );
5591 }
5592 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5593 }
5594 else {
5595 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5596 }
5597 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5598
5599 }
5600
5601 /*----------------------------------------------------------------------------
5602 | Returns the result of converting the quadruple-precision floating-point
5603 | value `a' to the 64-bit two's complement integer format. The conversion
5604 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5605 | Arithmetic, except that the conversion is always rounded toward zero.
5606 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5607 | the conversion overflows, the largest integer with the same sign as `a' is
5608 | returned.
5609 *----------------------------------------------------------------------------*/
5610
5611 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5612 {
5613 flag aSign;
5614 int32 aExp, shiftCount;
5615 uint64_t aSig0, aSig1;
5616 int64 z;
5617
5618 aSig1 = extractFloat128Frac1( a );
5619 aSig0 = extractFloat128Frac0( a );
5620 aExp = extractFloat128Exp( a );
5621 aSign = extractFloat128Sign( a );
5622 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5623 shiftCount = aExp - 0x402F;
5624 if ( 0 < shiftCount ) {
5625 if ( 0x403E <= aExp ) {
5626 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5627 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5628 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5629 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5630 }
5631 else {
5632 float_raise( float_flag_invalid STATUS_VAR);
5633 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5634 return LIT64( 0x7FFFFFFFFFFFFFFF );
5635 }
5636 }
5637 return (int64_t) LIT64( 0x8000000000000000 );
5638 }
5639 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5640 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5641 STATUS(float_exception_flags) |= float_flag_inexact;
5642 }
5643 }
5644 else {
5645 if ( aExp < 0x3FFF ) {
5646 if ( aExp | aSig0 | aSig1 ) {
5647 STATUS(float_exception_flags) |= float_flag_inexact;
5648 }
5649 return 0;
5650 }
5651 z = aSig0>>( - shiftCount );
5652 if ( aSig1
5653 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5654 STATUS(float_exception_flags) |= float_flag_inexact;
5655 }
5656 }
5657 if ( aSign ) z = - z;
5658 return z;
5659
5660 }
5661
5662 /*----------------------------------------------------------------------------
5663 | Returns the result of converting the quadruple-precision floating-point
5664 | value `a' to the single-precision floating-point format. The conversion
5665 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5666 | Arithmetic.
5667 *----------------------------------------------------------------------------*/
5668
5669 float32 float128_to_float32( float128 a STATUS_PARAM )
5670 {
5671 flag aSign;
5672 int32 aExp;
5673 uint64_t aSig0, aSig1;
5674 uint32_t zSig;
5675
5676 aSig1 = extractFloat128Frac1( a );
5677 aSig0 = extractFloat128Frac0( a );
5678 aExp = extractFloat128Exp( a );
5679 aSign = extractFloat128Sign( a );
5680 if ( aExp == 0x7FFF ) {
5681 if ( aSig0 | aSig1 ) {
5682 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5683 }
5684 return packFloat32( aSign, 0xFF, 0 );
5685 }
5686 aSig0 |= ( aSig1 != 0 );
5687 shift64RightJamming( aSig0, 18, &aSig0 );
5688 zSig = aSig0;
5689 if ( aExp || zSig ) {
5690 zSig |= 0x40000000;
5691 aExp -= 0x3F81;
5692 }
5693 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5694
5695 }
5696
5697 /*----------------------------------------------------------------------------
5698 | Returns the result of converting the quadruple-precision floating-point
5699 | value `a' to the double-precision floating-point format. The conversion
5700 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5701 | Arithmetic.
5702 *----------------------------------------------------------------------------*/
5703
5704 float64 float128_to_float64( float128 a STATUS_PARAM )
5705 {
5706 flag aSign;
5707 int32 aExp;
5708 uint64_t aSig0, aSig1;
5709
5710 aSig1 = extractFloat128Frac1( a );
5711 aSig0 = extractFloat128Frac0( a );
5712 aExp = extractFloat128Exp( a );
5713 aSign = extractFloat128Sign( a );
5714 if ( aExp == 0x7FFF ) {
5715 if ( aSig0 | aSig1 ) {
5716 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5717 }
5718 return packFloat64( aSign, 0x7FF, 0 );
5719 }
5720 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5721 aSig0 |= ( aSig1 != 0 );
5722 if ( aExp || aSig0 ) {
5723 aSig0 |= LIT64( 0x4000000000000000 );
5724 aExp -= 0x3C01;
5725 }
5726 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5727
5728 }
5729
5730 /*----------------------------------------------------------------------------
5731 | Returns the result of converting the quadruple-precision floating-point
5732 | value `a' to the extended double-precision floating-point format. The
5733 | conversion is performed according to the IEC/IEEE Standard for Binary
5734 | Floating-Point Arithmetic.
5735 *----------------------------------------------------------------------------*/
5736
5737 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5738 {
5739 flag aSign;
5740 int32 aExp;
5741 uint64_t aSig0, aSig1;
5742
5743 aSig1 = extractFloat128Frac1( a );
5744 aSig0 = extractFloat128Frac0( a );
5745 aExp = extractFloat128Exp( a );
5746 aSign = extractFloat128Sign( a );
5747 if ( aExp == 0x7FFF ) {
5748 if ( aSig0 | aSig1 ) {
5749 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5750 }
5751 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5752 }
5753 if ( aExp == 0 ) {
5754 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5755 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5756 }
5757 else {
5758 aSig0 |= LIT64( 0x0001000000000000 );
5759 }
5760 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5761 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5762
5763 }
5764
5765 /*----------------------------------------------------------------------------
5766 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5767 | returns the result as a quadruple-precision floating-point value. The
5768 | operation is performed according to the IEC/IEEE Standard for Binary
5769 | Floating-Point Arithmetic.
5770 *----------------------------------------------------------------------------*/
5771
5772 float128 float128_round_to_int( float128 a STATUS_PARAM )
5773 {
/* Rounds `a' to an integral value, returned in float128 format, using the
 * mode in STATUS(float_rounding_mode).  The inexact flag is set whenever the
 * returned value differs from `a'.  Three exponent ranges are handled:
 *   - aExp >= 0x402F: the rounding position lies in (or just above) z.low;
 *   - aExp <  0x3FFF: the result is a zero or has exponent 0x3FFF with a
 *     zero fraction;
 *   - otherwise:      the rounding position lies in z.high. */
5774 flag aSign;
5775 int32 aExp;
5776 uint64_t lastBitMask, roundBitsMask;
5777 int8 roundingMode;
5778 float128 z;
5779
5780 aExp = extractFloat128Exp( a );
5781 if ( 0x402F <= aExp ) {
5782 if ( 0x406F <= aExp ) {
/* Nothing to round at this exponent: a NaN is propagated, anything else
 * is returned unchanged. */
5783 if ( ( aExp == 0x7FFF )
5784 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5785 ) {
5786 return propagateFloat128NaN( a, a STATUS_VAR );
5787 }
5788 return a;
5789 }
/* lastBitMask selects the lowest bit kept in z.low.  The shift is done in
 * two steps so that a total distance of 64 (aExp == 0x402F) yields 0
 * rather than an out-of-range shift; in that case the lowest kept bit is
 * bit 0 of z.high and roundBitsMask covers all of z.low. */
5790 lastBitMask = 1;
5791 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5792 roundBitsMask = lastBitMask - 1;
5793 z = a;
5794 roundingMode = STATUS(float_rounding_mode);
5795 if ( roundingMode == float_round_nearest_even ) {
5796 if ( lastBitMask ) {
/* Add half of the last kept bit; if the discarded bits are then all
 * zero the input was an exact tie, so clear the last bit (ties to even). */
5797 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5798 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5799 }
5800 else {
/* All of z.low is discarded: round up when its top bit is set, and on
 * an exact tie (no lower bits set) force bit 0 of z.high to zero. */
5801 if ( (int64_t) z.low < 0 ) {
5802 ++z.high;
5803 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5804 }
5805 }
5806 }
5807 else if ( roundingMode != float_round_to_zero ) {
/* Directed rounding: the magnitude is bumped when the value is positive
 * in float_round_up mode, or negative in any other remaining mode. */
5808 if ( extractFloat128Sign( z )
5809 ^ ( roundingMode == float_round_up ) ) {
5810 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5811 }
5812 }
/* Discard the bits below the rounding position. */
5813 z.low &= ~ roundBitsMask;
5814 }
5815 else {
5816 if ( aExp < 0x3FFF ) {
/* Every bit except the sign is zero: return the zero unchanged. */
5817 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5818 STATUS(float_exception_flags) |= float_flag_inexact;
5819 aSign = extractFloat128Sign( a );
/* The result is either a signed zero or a value packed with exponent
 * 0x3FFF and zero fraction (presumably +/-1.0; confirm the exponent bias
 * against packFloat128). */
5820 switch ( STATUS(float_rounding_mode) ) {
5821 case float_round_nearest_even:
/* Only exponent 0x3FFE with a non-zero fraction rounds away from zero. */
5822 if ( ( aExp == 0x3FFE )
5823 && ( extractFloat128Frac0( a )
5824 | extractFloat128Frac1( a ) )
5825 ) {
5826 return packFloat128( aSign, 0x3FFF, 0, 0 );
5827 }
5828 break;
5829 case float_round_down:
5830 return
5831 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5832 : packFloat128( 0, 0, 0, 0 );
5833 case float_round_up:
5834 return
5835 aSign ? packFloat128( 1, 0, 0, 0 )
5836 : packFloat128( 0, 0x3FFF, 0, 0 );
5837 }
/* Remaining cases (including float_round_to_zero) give a signed zero. */
5838 return packFloat128( aSign, 0, 0, 0 );
5839 }
/* The rounding position lies inside the high word: the result's low word
 * is zero and a.low only contributes as a "non-zero" indicator. */
5840 lastBitMask = 1;
5841 lastBitMask <<= 0x402F - aExp;
5842 roundBitsMask = lastBitMask - 1;
5843 z.low = 0;
5844 z.high = a.high;
5845 roundingMode = STATUS(float_rounding_mode);
5846 if ( roundingMode == float_round_nearest_even ) {
/* Add half; an exact tie (no discarded bits left in either word) clears
 * the last kept bit. */
5847 z.high += lastBitMask>>1;
5848 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5849 z.high &= ~ lastBitMask;
5850 }
5851 }
5852 else if ( roundingMode != float_round_to_zero ) {
5853 if ( extractFloat128Sign( z )
5854 ^ ( roundingMode == float_round_up ) ) {
/* Fold a non-zero low word into bit 0 so that adding roundBitsMask
 * carries into the kept bits whenever any discarded bit was set. */
5855 z.high |= ( a.low != 0 );
5856 z.high += roundBitsMask;
5857 }
5858 }
5859 z.high &= ~ roundBitsMask;
5860 }
/* Any change relative to the input means the rounding was inexact. */
5861 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5862 STATUS(float_exception_flags) |= float_flag_inexact;
5863 }
5864 return z;
5865
5866 }
5867
5868 /*----------------------------------------------------------------------------
5869 | Returns the result of adding the absolute values of the quadruple-precision
5870 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5871 | before being returned. `zSign' is ignored if the result is a NaN.
5872 | The addition is performed according to the IEC/IEEE Standard for Binary
5873 | Floating-Point Arithmetic.
5874 *----------------------------------------------------------------------------*/
5875
5876 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5877 {
/* Adds the magnitudes of `a' and `b' and packs the sum with sign `zSign'.
 * The operand with the smaller exponent is shifted right to align it; the
 * third output word of that shift is kept in zSig2 and passed on to
 * roundAndPackFloat128 together with the 128-bit sum. */
5878 int32 aExp, bExp, zExp;
5879 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5880 int32 expDiff;
5881
5882 aSig1 = extractFloat128Frac1( a );
5883 aSig0 = extractFloat128Frac0( a );
5884 aExp = extractFloat128Exp( a );
5885 bSig1 = extractFloat128Frac1( b );
5886 bSig0 = extractFloat128Frac0( b );
5887 bExp = extractFloat128Exp( b );
5888 expDiff = aExp - bExp;
5889 if ( 0 < expDiff ) {
/* `a' has the larger exponent. */
5890 if ( aExp == 0x7FFF ) {
/* Maximum exponent: a non-zero fraction is a NaN; otherwise `a' itself
 * is the result. */
5891 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5892 return a;
5893 }
5894 if ( bExp == 0 ) {
/* A zero exponent field gets no implicit bit; the alignment distance is
 * reduced by one instead. */
5895 --expDiff;
5896 }
5897 else {
5898 bSig0 |= LIT64( 0x0001000000000000 );
5899 }
5900 shift128ExtraRightJamming(
5901 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5902 zExp = aExp;
5903 }
5904 else if ( expDiff < 0 ) {
/* `b' has the larger exponent: mirror image of the case above. */
5905 if ( bExp == 0x7FFF ) {
5906 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
/* Zero fraction: the result is packed from zSign, exponent 0x7FFF and a
 * zero fraction rather than returning `b' as is. */
5907 return packFloat128( zSign, 0x7FFF, 0, 0 );
5908 }
5909 if ( aExp == 0 ) {
5910 ++expDiff;
5911 }
5912 else {
5913 aSig0 |= LIT64( 0x0001000000000000 );
5914 }
5915 shift128ExtraRightJamming(
5916 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5917 zExp = bExp;
5918 }
5919 else {
/* Equal exponents: no alignment shift is needed. */
5920 if ( aExp == 0x7FFF ) {
5921 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5922 return propagateFloat128NaN( a, b STATUS_VAR );
5923 }
5924 return a;
5925 }
5926 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5927 if ( aExp == 0 ) {
/* Both exponent fields are zero: the raw sum is packed directly with a
 * zero exponent.  With flush_to_zero set, a zero is returned instead and
 * float_flag_output_denormal is raised if the sum was non-zero. */
5928 if (STATUS(flush_to_zero)) {
5929 if (zSig0 | zSig1) {
5930 float_raise(float_flag_output_denormal STATUS_VAR);
5931 }
5932 return packFloat128(zSign, 0, 0, 0);
5933 }
5934 return packFloat128( zSign, 0, zSig0, zSig1 );
5935 }
/* Neither operand had its implicit bit added, so the two of them are
 * accounted for together as 0x0002000000000000; the sum then always needs
 * the one-bit right shift at shiftRight1. */
5936 zSig2 = 0;
5937 zSig0 |= LIT64( 0x0002000000000000 );
5938 zExp = aExp;
5939 goto shiftRight1;
5940 }
/* Reached from both unequal-exponent cases.  NOTE(review): this OR appears
 * to supply the implicit bit of the larger-exponent operand -- directly for
 * `a', or, when `b' is larger, by setting the (presumably clear) bit 48 of
 * the shifted aSig0 on b's behalf; confirm before changing. */
5941 aSig0 |= LIT64( 0x0001000000000000 );
5942 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* zExp is decremented for the direct roundAndPack path and restored when
 * the sum reached 0x0002000000000000 and needs a one-bit right shift. */
5943 --zExp;
5944 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5945 ++zExp;
5946 shiftRight1:
5947 shift128ExtraRightJamming(
5948 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5949 roundAndPack:
5950 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5951
5952 }
5953
5954 /*----------------------------------------------------------------------------
5955 | Returns the result of subtracting the absolute values of the quadruple-
5956 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5957 | difference is negated before being returned. `zSign' is ignored if the
5958 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5959 | Standard for Binary Floating-Point Arithmetic.
5960 *----------------------------------------------------------------------------*/
5961
5962 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5963 {
/* Subtracts the magnitude of `b' from the magnitude of `a' and packs the
 * difference with sign `zSign' (inverted when `b' has the larger magnitude).
 * The smaller-exponent operand is aligned with shift128RightJamming, and the
 * result is normalized by normalizeRoundAndPackFloat128. */
5964 int32 aExp, bExp, zExp;
5965 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5966 int32 expDiff;
5967 float128 z;
5968
5969 aSig1 = extractFloat128Frac1( a );
5970 aSig0 = extractFloat128Frac0( a );
5971 aExp = extractFloat128Exp( a );
5972 bSig1 = extractFloat128Frac1( b );
5973 bSig0 = extractFloat128Frac0( b );
5974 bExp = extractFloat128Exp( b );
5975 expDiff = aExp - bExp;
/* Both fractions are moved up by 14 bits, which is why the implicit bit
 * used below is 0x4000000000000000 and why 14 is subtracted from the
 * exponent at the end. */
5976 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5977 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5978 if ( 0 < expDiff ) goto aExpBigger;
5979 if ( expDiff < 0 ) goto bExpBigger;
/* Equal exponents from here down to the bExpBigger label. */
5980 if ( aExp == 0x7FFF ) {
5981 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5982 return propagateFloat128NaN( a, b STATUS_VAR );
5983 }
/* Both operands have the maximum exponent and zero fractions: raise
 * invalid and return the default NaN. */
5984 float_raise( float_flag_invalid STATUS_VAR);
5985 z.low = float128_default_nan_low;
5986 z.high = float128_default_nan_high;
5987 return z;
5988 }
5989 if ( aExp == 0 ) {
/* Zero exponent fields are treated as exponent 1 (no implicit bit is
 * added on this path). */
5990 aExp = 1;
5991 bExp = 1;
5992 }
/* Compare the significands, high word first, to find the larger one. */
5993 if ( bSig0 < aSig0 ) goto aBigger;
5994 if ( aSig0 < bSig0 ) goto bBigger;
5995 if ( bSig1 < aSig1 ) goto aBigger;
5996 if ( aSig1 < bSig1 ) goto bBigger;
/* Exactly equal magnitudes: the result is a zero whose sign bit is set
 * only in float_round_down mode. */
5997 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5998 bExpBigger:
5999 if ( bExp == 0x7FFF ) {
6000 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
/* `b' has the maximum exponent and a zero fraction: the result keeps that
 * exponent with the opposite of zSign. */
6001 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6002 }
6003 if ( aExp == 0 ) {
/* A zero exponent field gets no implicit bit; the alignment distance is
 * reduced by one instead. */
6004 ++expDiff;
6005 }
6006 else {
6007 aSig0 |= LIT64( 0x4000000000000000 );
6008 }
6009 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6010 bSig0 |= LIT64( 0x4000000000000000 );
6011 bBigger:
/* |b| > |a|: compute b - a and flip the result sign. */
6012 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6013 zExp = bExp;
6014 zSign ^= 1;
6015 goto normalizeRoundAndPack;
6016 aExpBigger:
6017 if ( aExp == 0x7FFF ) {
6018 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6019 return a;
6020 }
6021 if ( bExp == 0 ) {
6022 --expDiff;
6023 }
6024 else {
6025 bSig0 |= LIT64( 0x4000000000000000 );
6026 }
6027 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6028 aSig0 |= LIT64( 0x4000000000000000 );
6029 aBigger:
/* |a| > |b|: compute a - b; the result sign stays zSign. */
6030 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6031 zExp = aExp;
6032 normalizeRoundAndPack:
/* The exponent is reduced by one here and by a further 14 in the call,
 * the latter matching the 14-bit left shift applied on entry. */
6033 --zExp;
6034 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6035
6036 }
6037
6038 /*----------------------------------------------------------------------------
6039 | Returns the result of adding the quadruple-precision floating-point values
6040 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6041 | for Binary Floating-Point Arithmetic.
6042 *----------------------------------------------------------------------------*/
6043
6044 float128 float128_add( float128 a, float128 b STATUS_PARAM )
6045 {
6046 flag aSign, bSign;
6047
6048 aSign = extractFloat128Sign( a );
6049 bSign = extractFloat128Sign( b );
6050 if ( aSign == bSign ) {
6051 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6052 }
6053 else {
6054 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6055 }
6056
6057 }
6058
6059 /*----------------------------------------------------------------------------
6060 | Returns the result of subtracting the quadruple-precision floating-point
6061 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6062 | Standard for Binary Floating-Point Arithmetic.
6063 *----------------------------------------------------------------------------*/
6064
6065 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6066 {
6067 flag aSign, bSign;
6068
6069 aSign = extractFloat128Sign( a );
6070 bSign = extractFloat128Sign( b );
6071 if ( aSign == bSign ) {
6072 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6073 }
6074 else {
6075 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6076 }
6077
6078 }
6079
6080 /*----------------------------------------------------------------------------
6081 | Returns the result of multiplying the quadruple-precision floating-point
6082 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6083 | Standard for Binary Floating-Point Arithmetic.
6084 *----------------------------------------------------------------------------*/
6085
6086 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6087 {
6088 flag aSign, bSign, zSign;
6089 int32 aExp, bExp, zExp;
6090 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6091 float128 z;
6092
6093 aSig1 = extractFloat128Frac1( a );
6094 aSig0 = extractFloat128Frac0( a );
6095 aExp = extractFloat128Exp( a );
6096 aSign = extractFloat128Sign( a );
6097 bSig1 = extractFloat128Frac1( b );
6098 bSig0 = extractFloat128Frac0( b );
6099 bExp = extractFloat128Exp( b );
6100 bSign = extractFloat128Sign( b );
6101 zSign = aSign ^ bSign;
6102 if ( aExp == 0x7FFF ) {
6103 if ( ( aSig0 | aSig1 )
6104 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6105 return propagateFloat128NaN( a, b STATUS_VAR );
6106 }
6107 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6108 return packFloat128( zSign, 0x7FFF, 0, 0 );
6109 }
6110 if ( bExp == 0x7FFF ) {
6111 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6112 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6113 invalid:
6114 float_raise( float_flag_invalid STATUS_VAR);
6115 z.low = float128_default_nan_low;
6116 z.high = float128_default_nan_high;
6117 return z;
6118 }
6119 return packFloat128( zSign, 0x7FFF, 0, 0 );
6120 }
6121 if ( aExp == 0 ) {
6122 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6123 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6124 }
6125 if ( bExp == 0 ) {
6126 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6127 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6128 }
6129 zExp = aExp + bExp - 0x4000;
6130 aSig0 |= LIT64( 0x0001000000000000 );
6131 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6132 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6133 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6134 zSig2 |= ( zSig3 != 0 );
6135 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6136 shift128ExtraRightJamming(
6137 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6138 ++zExp;
6139 }
6140 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6141
6142 }
6143
6144 /*----------------------------------------------------------------------------
6145 | Returns the result of dividing the quadruple-precision floating-point value
6146 | `a' by the corresponding value `b'. The operation is performed according to
6147 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6148 *----------------------------------------------------------------------------*/
6149
6150 float128 float128_div( float128 a, float128 b STATUS_PARAM )
6151 {
/* Overview: unpack both operands, dispose of the special cases (maximum
 * exponent, zero, zero-exponent inputs), then build the quotient from two
 * 64-bit estimates that are corrected against the running remainder. */
6152 flag aSign, bSign, zSign;
6153 int32 aExp, bExp, zExp;
6154 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6155 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6156 float128 z;
6157
/* Unpack sign, exponent and the two fraction words of each operand. */
6158 aSig1 = extractFloat128Frac1( a );
6159 aSig0 = extractFloat128Frac0( a );
6160 aExp = extractFloat128Exp( a );
6161 aSign = extractFloat128Sign( a );
6162 bSig1 = extractFloat128Frac1( b );
6163 bSig0 = extractFloat128Frac0( b );
6164 bExp = extractFloat128Exp( b );
6165 bSign = extractFloat128Sign( b );
/* The quotient's sign is the XOR of the operand signs. */
6166 zSign = aSign ^ bSign;
/* a has exponent 0x7FFF: a nonzero fraction goes through NaN propagation. */
6167 if ( aExp == 0x7FFF ) {
6168 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6169 if ( bExp == 0x7FFF ) {
6170 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
/* Both operands have exponent 0x7FFF and zero fractions: invalid. */
6171 goto invalid;
6172 }
/* Otherwise the result keeps exponent 0x7FFF with a zero fraction. */
6173 return packFloat128( zSign, 0x7FFF, 0, 0 );
6174 }
/* b has exponent 0x7FFF: propagate a NaN, else return a signed zero. */
6175 if ( bExp == 0x7FFF ) {
6176 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6177 return packFloat128( zSign, 0, 0, 0 );
6178 }
6179 if ( bExp == 0 ) {
/* b is zero. */
6180 if ( ( bSig0 | bSig1 ) == 0 ) {
/* a is zero as well: raise invalid and return the default NaN. */
6181 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6182 invalid:
6183 float_raise( float_flag_invalid STATUS_VAR);
6184 z.low = float128_default_nan_low;
6185 z.high = float128_default_nan_high;
6186 return z;
6187 }
/* Nonzero a divided by zero: raise divbyzero; the result has exponent
 * 0x7FFF, a zero fraction and the quotient's sign. */
6188 float_raise( float_flag_divbyzero STATUS_VAR);
6189 return packFloat128( zSign, 0x7FFF, 0, 0 );
6190 }
/* b has a zero exponent but a nonzero fraction: normalize it. */
6191 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6192 }
6193 if ( aExp == 0 ) {
/* Zero dividend yields a signed zero; otherwise normalize a. */
6194 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6195 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6196 }
/* Tentative result exponent. */
6197 zExp = aExp - bExp + 0x3FFD;
/* OR bit 48 into the first fraction word of each operand (presumably the
 * implicit leading one) and shift both significands left by 15 bits. */
6198 shortShift128Left(
6199 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6200 shortShift128Left(
6201 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* If b's significand is <= a's, halve a's significand and bump zExp. */
6202 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6203 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6204 ++zExp;
6205 }
/* First quotient word: estimate it, form the remainder, and decrement the
 * estimate while the remainder is negative. */
6206 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6207 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6208 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6209 while ( (int64_t) rem0 < 0 ) {
6210 --zSig0;
6211 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6212 }
/* Second quotient word.  The exact correction is only performed when the
 * low 14 bits of the estimate are <= 4. */
6213 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6214 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6215 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6216 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6217 while ( (int64_t) rem1 < 0 ) {
6218 --zSig1;
6219 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6220 }
/* Record a nonzero final remainder in the lowest quotient bit. */
6221 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6222 }
/* Shift the quotient right by 15 (with jamming into zSig2), then round and
 * pack the result. */
6223 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6224 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6225
6226 }
6227
6228 /*----------------------------------------------------------------------------
6229 | Returns the remainder of the quadruple-precision floating-point value `a'
6230 | with respect to the corresponding value `b'. The operation is performed
6231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6232 *----------------------------------------------------------------------------*/
6233
6234 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6235 {
/* Overview: after the special cases, a's significand is reduced against
 * b's in chunks driven by the exponent difference; the final step picks
 * between the last two candidate remainders. */
6236 flag aSign, zSign;
6237 int32 aExp, bExp, expDiff;
6238 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6239 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6240 int64_t sigMean0;
6241 float128 z;
6242
/* Unpack the operands; b's sign is not extracted. */
6243 aSig1 = extractFloat128Frac1( a );
6244 aSig0 = extractFloat128Frac0( a );
6245 aExp = extractFloat128Exp( a );
6246 aSign = extractFloat128Sign( a );
6247 bSig1 = extractFloat128Frac1( b );
6248 bSig0 = extractFloat128Frac0( b );
6249 bExp = extractFloat128Exp( b );
/* a has exponent 0x7FFF: propagate if either operand is a NaN, otherwise
 * the operation is invalid. */
6250 if ( aExp == 0x7FFF ) {
6251 if ( ( aSig0 | aSig1 )
6252 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6253 return propagateFloat128NaN( a, b STATUS_VAR );
6254 }
6255 goto invalid;
6256 }
/* b has exponent 0x7FFF: propagate a NaN, else a is returned unchanged. */
6257 if ( bExp == 0x7FFF ) {
6258 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6259 return a;
6260 }
6261 if ( bExp == 0 ) {
/* b is zero: raise invalid and return the default NaN. */
6262 if ( ( bSig0 | bSig1 ) == 0 ) {
6263 invalid:
6264 float_raise( float_flag_invalid STATUS_VAR);
6265 z.low = float128_default_nan_low;
6266 z.high = float128_default_nan_high;
6267 return z;
6268 }
/* b has a zero exponent but a nonzero fraction: normalize it. */
6269 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6270 }
6271 if ( aExp == 0 ) {
/* A zero a is returned as is; otherwise normalize a. */
6272 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6273 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6274 }
6275 expDiff = aExp - bExp;
/* a's exponent is more than one below b's: a is returned unchanged. */
6276 if ( expDiff < -1 ) return a;
/* OR bit 48 into the first fraction word and shift left by 15 bits; a is
 * shifted by only 14 when expDiff is negative. */
6277 shortShift128Left(
6278 aSig0 | LIT64( 0x0001000000000000 ),
6279 aSig1,
6280 15 - ( expDiff < 0 ),
6281 &aSig0,
6282 &aSig1
6283 );
6284 shortShift128Left(
6285 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* Initial quotient bit: subtract b once if it does not exceed a. */
6286 q = le128( bSig0, bSig1, aSig0, aSig1 );
6287 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6288 expDiff -= 64;
/* Main reduction loop: each pass lowers expDiff by 61.  The quotient
 * estimate is reduced by 4 (floored at 0) before it is used. */
6289 while ( 0 < expDiff ) {
6290 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6291 q = ( 4 < q ) ? q - 4 : 0;
6292 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6293 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6294 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6295 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6296 expDiff -= 61;
6297 }
/* Final partial step: the estimate is shifted right by -expDiff, b is
 * shifted right by 12, and a is realigned by expDiff + 52 before q * b is
 * subtracted. */
6298 if ( -64 < expDiff ) {
6299 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6300 q = ( 4 < q ) ? q - 4 : 0;
6301 q >>= - expDiff;
6302 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6303 expDiff += 52;
6304 if ( expDiff < 0 ) {
6305 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6306 }
6307 else {
6308 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6309 }
6310 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6311 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6312 }
6313 else {
/* No partial step: just shift both significands right by 12. */
6314 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6315 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6316 }
/* Keep subtracting b (counting in q) until the remainder turns negative;
 * the value before the last subtraction is kept in alternateASig*. */
6317 do {
6318 alternateASig0 = aSig0;
6319 alternateASig1 = aSig1;
6320 ++q;
6321 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6322 } while ( 0 <= (int64_t) aSig0 );
/* Sum the two candidates.  Take the earlier (non-negative) one when the
 * sum is negative, or when the sum is exactly zero and q is odd. */
6323 add128(
6324 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6325 if ( ( sigMean0 < 0 )
6326 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6327 aSig0 = alternateASig0;
6328 aSig1 = alternateASig1;
6329 }
/* A negative remainder is negated and flips the result's sign. */
6330 zSign = ( (int64_t) aSig0 < 0 );
6331 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6332 return
6333 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6334
6335 }
6336
6337 /*----------------------------------------------------------------------------
6338 | Returns the square root of the quadruple-precision floating-point value `a'.
6339 | The operation is performed according to the IEC/IEEE Standard for Binary
6340 | Floating-Point Arithmetic.
6341 *----------------------------------------------------------------------------*/
6342
6343 float128 float128_sqrt( float128 a STATUS_PARAM )
6344 {
/* Overview: after the special cases, the root is built from a 32-bit
 * estimate refined into two 64-bit words, each corrected against the
 * running remainder. */
6345 flag aSign;
6346 int32 aExp, zExp;
6347 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6348 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6349 float128 z;
6350
6351 aSig1 = extractFloat128Frac1( a );
6352 aSig0 = extractFloat128Frac0( a );
6353 aExp = extractFloat128Exp( a );
6354 aSign = extractFloat128Sign( a );
/* Exponent 0x7FFF: propagate a NaN; with a zero fraction, return a when
 * the sign is clear and treat a set sign as invalid. */
6355 if ( aExp == 0x7FFF ) {
6356 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6357 if ( ! aSign ) return a;
6358 goto invalid;
6359 }
6360 if ( aSign ) {
/* Sign set: a zero is returned unchanged; anything else raises invalid
 * and returns the default NaN. */
6361 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6362 invalid:
6363 float_raise( float_flag_invalid STATUS_VAR);
6364 z.low = float128_default_nan_low;
6365 z.high = float128_default_nan_high;
6366 return z;
6367 }
6368 if ( aExp == 0 ) {
/* Zero input gives an all-zero result; otherwise normalize a. */
6369 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6370 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6371 }
/* Result exponent: half of (aExp - 0x3FFF), re-biased with 0x3FFE. */
6372 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6373 aSig0 |= LIT64( 0x0001000000000000 );
/* Initial 32-bit estimate, then align the significand (shift by 13, or by
 * 12 when aExp is odd) and refine the estimate to 64 bits. */
6374 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6375 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6376 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6377 doubleZSig0 = zSig0<<1;
/* Remainder a - zSig0^2; decrement zSig0 while that remainder is negative. */
6378 mul64To128( zSig0, zSig0, &term0, &term1 );
6379 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6380 while ( (int64_t) rem0 < 0 ) {
6381 --zSig0;
6382 doubleZSig0 -= 2;
6383 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6384 }
/* Second word of the root.  The exact correction is only performed when
 * the low 13 bits of the estimate are <= 5. */
6385 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6386 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6387 if ( zSig1 == 0 ) zSig1 = 1;
6388 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6389 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6390 mul64To128( zSig1, zSig1, &term2, &term3 );
6391 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6392 while ( (int64_t) rem1 < 0 ) {
6393 --zSig1;
6394 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6395 term3 |= 1;
6396 term2 |= doubleZSig0;
6397 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6398 }
/* Record a nonzero final remainder in the lowest result bit. */
6399 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6400 }
/* Shift right by 14 (with jamming into zSig2), then round and pack with a
 * clear sign. */
6401 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6402 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6403
6404 }
6405
6406 /*----------------------------------------------------------------------------
6407 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6408 | the corresponding value `b', and 0 otherwise. The invalid exception is
6409 | raised if either operand is a NaN. Otherwise, the comparison is performed
6410 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6411 *----------------------------------------------------------------------------*/
6412
6413 int float128_eq( float128 a, float128 b STATUS_PARAM )
6414 {
6415
6416 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6417 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6418 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6419 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6420 ) {
6421 float_raise( float_flag_invalid STATUS_VAR);
6422 return 0;
6423 }
6424 return
6425 ( a.low == b.low )
6426 && ( ( a.high == b.high )
6427 || ( ( a.low == 0 )
6428 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6429 );
6430
6431 }
6432
6433 /*----------------------------------------------------------------------------
6434 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6435 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6436 | exception is raised if either operand is a NaN. The comparison is performed
6437 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6438 *----------------------------------------------------------------------------*/
6439
6440 int float128_le( float128 a, float128 b STATUS_PARAM )
6441 {
6442 flag aSign, bSign;
6443
6444 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6445 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6446 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6447 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6448 ) {
6449 float_raise( float_flag_invalid STATUS_VAR);
6450 return 0;
6451 }
6452 aSign = extractFloat128Sign( a );
6453 bSign = extractFloat128Sign( b );
6454 if ( aSign != bSign ) {
6455 return
6456 aSign
6457 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6458 == 0 );
6459 }
6460 return
6461 aSign ? le128( b.high, b.low, a.high, a.low )
6462 : le128( a.high, a.low, b.high, b.low );
6463
6464 }
6465
6466 /*----------------------------------------------------------------------------
6467 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6468 | the corresponding value `b', and 0 otherwise. The invalid exception is
6469 | raised if either operand is a NaN. The comparison is performed according
6470 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6471 *----------------------------------------------------------------------------*/
6472
6473 int float128_lt( float128 a, float128 b STATUS_PARAM )
6474 {
6475 flag aSign, bSign;
6476
6477 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6478 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6479 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6480 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6481 ) {
6482 float_raise( float_flag_invalid STATUS_VAR);
6483 return 0;
6484 }
6485 aSign = extractFloat128Sign( a );
6486 bSign = extractFloat128Sign( b );
6487 if ( aSign != bSign ) {
6488 return
6489 aSign
6490 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6491 != 0 );
6492 }
6493 return
6494 aSign ? lt128( b.high, b.low, a.high, a.low )
6495 : lt128( a.high, a.low, b.high, b.low );
6496
6497 }
6498
6499 /*----------------------------------------------------------------------------
6500 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6501 | be compared, and 0 otherwise. The invalid exception is raised if either
6502 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6503 | Standard for Binary Floating-Point Arithmetic.
6504 *----------------------------------------------------------------------------*/
6505
6506 int float128_unordered( float128 a, float128 b STATUS_PARAM )
6507 {
6508 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6509 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6510 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6511 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6512 ) {
6513 float_raise( float_flag_invalid STATUS_VAR);
6514 return 1;
6515 }
6516 return 0;
6517 }
6518
6519 /*----------------------------------------------------------------------------
6520 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6521 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6522 | exception. The comparison is performed according to the IEC/IEEE Standard
6523 | for Binary Floating-Point Arithmetic.
6524 *----------------------------------------------------------------------------*/
6525
6526 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6527 {
6528
6529 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6530 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6531 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6532 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6533 ) {
6534 if ( float128_is_signaling_nan( a )
6535 || float128_is_signaling_nan( b ) ) {
6536 float_raise( float_flag_invalid STATUS_VAR);
6537 }
6538 return 0;
6539 }
6540 return
6541 ( a.low == b.low )
6542 && ( ( a.high == b.high )
6543 || ( ( a.low == 0 )
6544 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6545 );
6546
6547 }
6548
6549 /*----------------------------------------------------------------------------
6550 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6551 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6552 | cause an exception. Otherwise, the comparison is performed according to the
6553 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6554 *----------------------------------------------------------------------------*/
6555
6556 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6557 {
6558 flag aSign, bSign;
6559
6560 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6561 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6562 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6563 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6564 ) {
6565 if ( float128_is_signaling_nan( a )
6566 || float128_is_signaling_nan( b ) ) {
6567 float_raise( float_flag_invalid STATUS_VAR);
6568 }
6569 return 0;
6570 }
6571 aSign = extractFloat128Sign( a );
6572 bSign = extractFloat128Sign( b );
6573 if ( aSign != bSign ) {
6574 return
6575 aSign
6576 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6577 == 0 );
6578 }
6579 return
6580 aSign ? le128( b.high, b.low, a.high, a.low )
6581 : le128( a.high, a.low, b.high, b.low );
6582
6583 }
6584
6585 /*----------------------------------------------------------------------------
6586 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6587 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6588 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6589 | Standard for Binary Floating-Point Arithmetic.
6590 *----------------------------------------------------------------------------*/
6591
6592 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6593 {
6594 flag aSign, bSign;
6595
6596 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6597 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6598 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6599 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6600 ) {
6601 if ( float128_is_signaling_nan( a )
6602 || float128_is_signaling_nan( b ) ) {
6603 float_raise( float_flag_invalid STATUS_VAR);
6604 }
6605 return 0;
6606 }
6607 aSign = extractFloat128Sign( a );
6608 bSign = extractFloat128Sign( b );
6609 if ( aSign != bSign ) {
6610 return
6611 aSign
6612 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6613 != 0 );
6614 }
6615 return
6616 aSign ? lt128( b.high, b.low, a.high, a.low )
6617 : lt128( a.high, a.low, b.high, b.low );
6618
6619 }
6620
6621 /*----------------------------------------------------------------------------
6622 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6623 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6624 | comparison is performed according to the IEC/IEEE Standard for Binary
6625 | Floating-Point Arithmetic.
6626 *----------------------------------------------------------------------------*/
6627
6628 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6629 {
6630 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6631 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6632 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6633 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6634 ) {
6635 if ( float128_is_signaling_nan( a )
6636 || float128_is_signaling_nan( b ) ) {
6637 float_raise( float_flag_invalid STATUS_VAR);
6638 }
6639 return 1;
6640 }
6641 return 0;
6642 }
6643
6644 /* misc functions */
6645 float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6646 {
6647 return int64_to_float32(a STATUS_VAR);
6648 }
6649
6650 float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6651 {
6652 return int64_to_float64(a STATUS_VAR);
6653 }
6654
6655 uint32 float32_to_uint32( float32 a STATUS_PARAM )
6656 {
6657 int64_t v;
6658 uint32 res;
6659 int old_exc_flags = get_float_exception_flags(status);
6660
6661 v = float32_to_int64(a STATUS_VAR);
6662 if (v < 0) {
6663 res = 0;
6664 } else if (v > 0xffffffff) {
6665 res = 0xffffffff;
6666 } else {
6667 return v;
6668 }
6669 set_float_exception_flags(old_exc_flags, status);
6670 float_raise(float_flag_invalid STATUS_VAR);
6671 return res;
6672 }
6673
6674 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6675 {
6676 int64_t v;
6677 uint32 res;
6678 int old_exc_flags = get_float_exception_flags(status);
6679
6680 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6681 if (v < 0) {
6682 res = 0;
6683 } else if (v > 0xffffffff) {
6684 res = 0xffffffff;
6685 } else {
6686 return v;
6687 }
6688 set_float_exception_flags(old_exc_flags, status);
6689 float_raise(float_flag_invalid STATUS_VAR);
6690 return res;
6691 }
6692
6693 int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6694 {
6695 int32_t v;
6696 int_fast16_t res;
6697 int old_exc_flags = get_float_exception_flags(status);
6698
6699 v = float32_to_int32(a STATUS_VAR);
6700 if (v < -0x8000) {
6701 res = -0x8000;
6702 } else if (v > 0x7fff) {
6703 res = 0x7fff;
6704 } else {
6705 return v;
6706 }
6707
6708 set_float_exception_flags(old_exc_flags, status);
6709 float_raise(float_flag_invalid STATUS_VAR);
6710 return res;
6711 }
6712
6713 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6714 {
6715 int32_t v;
6716 uint_fast16_t res;
6717 int old_exc_flags = get_float_exception_flags(status);
6718
6719 v = float32_to_int32(a STATUS_VAR);
6720 if (v < 0) {
6721 res = 0;
6722 } else if (v > 0xffff) {
6723 res = 0xffff;
6724 } else {
6725 return v;
6726 }
6727
6728 set_float_exception_flags(old_exc_flags, status);
6729 float_raise(float_flag_invalid STATUS_VAR);
6730 return res;
6731 }
6732
6733 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6734 {
6735 int64_t v;
6736 uint_fast16_t res;
6737 int old_exc_flags = get_float_exception_flags(status);
6738
6739 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6740 if (v < 0) {
6741 res = 0;
6742 } else if (v > 0xffff) {
6743 res = 0xffff;
6744 } else {
6745 return v;
6746 }
6747 set_float_exception_flags(old_exc_flags, status);
6748 float_raise(float_flag_invalid STATUS_VAR);
6749 return res;
6750 }
6751
6752 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6753 {
6754 uint64_t v;
6755 uint32 res;
6756 int old_exc_flags = get_float_exception_flags(status);
6757
6758 v = float64_to_uint64(a STATUS_VAR);
6759 if (v > 0xffffffff) {
6760 res = 0xffffffff;
6761 } else {
6762 return v;
6763 }
6764 set_float_exception_flags(old_exc_flags, status);
6765 float_raise(float_flag_invalid STATUS_VAR);
6766 return res;
6767 }
6768
6769 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6770 {
6771 uint64_t v;
6772 uint32 res;
6773 int old_exc_flags = get_float_exception_flags(status);
6774
6775 v = float64_to_uint64_round_to_zero(a STATUS_VAR);
6776 if (v > 0xffffffff) {
6777 res = 0xffffffff;
6778 } else {
6779 return v;
6780 }
6781 set_float_exception_flags(old_exc_flags, status);
6782 float_raise(float_flag_invalid STATUS_VAR);
6783 return res;
6784 }
6785
6786 int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6787 {
6788 int64_t v;
6789 int_fast16_t res;
6790 int old_exc_flags = get_float_exception_flags(status);
6791
6792 v = float64_to_int32(a STATUS_VAR);
6793 if (v < -0x8000) {
6794 res = -0x8000;
6795 } else if (v > 0x7fff) {
6796 res = 0x7fff;
6797 } else {
6798 return v;
6799 }
6800
6801 set_float_exception_flags(old_exc_flags, status);
6802 float_raise(float_flag_invalid STATUS_VAR);
6803 return res;
6804 }
6805
6806 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6807 {
6808 int64_t v;
6809 uint_fast16_t res;
6810 int old_exc_flags = get_float_exception_flags(status);
6811
6812 v = float64_to_int32(a STATUS_VAR);
6813 if (v < 0) {
6814 res = 0;
6815 } else if (v > 0xffff) {
6816 res = 0xffff;
6817 } else {
6818 return v;
6819 }
6820
6821 set_float_exception_flags(old_exc_flags, status);
6822 float_raise(float_flag_invalid STATUS_VAR);
6823 return res;
6824 }
6825
6826 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6827 {
6828 int64_t v;
6829 uint_fast16_t res;
6830 int old_exc_flags = get_float_exception_flags(status);
6831
6832 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6833 if (v < 0) {
6834 res = 0;
6835 } else if (v > 0xffff) {
6836 res = 0xffff;
6837 } else {
6838 return v;
6839 }
6840 set_float_exception_flags(old_exc_flags, status);
6841 float_raise(float_flag_invalid STATUS_VAR);
6842 return res;
6843 }
6844
6845 /*----------------------------------------------------------------------------
6846 | Returns the result of converting the double-precision floating-point value
6847 | `a' to the 64-bit unsigned integer format. The conversion is
6848 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6849 | Arithmetic---which means in particular that the conversion is rounded
6850 | according to the current rounding mode. If `a' is a NaN, the largest
6851 | positive integer is returned. If the conversion overflows, the
6852 | largest unsigned integer is returned. If 'a' is negative, the value is
6853 | rounded and zero is returned; negative values that do not round to zero
6854 | will raise the inexact exception.
6855 *----------------------------------------------------------------------------*/
6856
6857 uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6858 {
6859 flag aSign;
6860 int_fast16_t aExp, shiftCount;
6861 uint64_t aSig, aSigExtra;
6862 a = float64_squash_input_denormal(a STATUS_VAR);
6863
6864 aSig = extractFloat64Frac(a);
6865 aExp = extractFloat64Exp(a);
6866 aSign = extractFloat64Sign(a);
6867 if (aSign && (aExp > 1022)) {
6868 float_raise(float_flag_invalid STATUS_VAR);
6869 if (float64_is_any_nan(a)) {
6870 return LIT64(0xFFFFFFFFFFFFFFFF);
6871 } else {
6872 return 0;
6873 }
6874 }
6875 if (aExp) {
6876 aSig |= LIT64(0x0010000000000000);
6877 }
6878 shiftCount = 0x433 - aExp;
6879 if (shiftCount <= 0) {
6880 if (0x43E < aExp) {
6881 float_raise(float_flag_invalid STATUS_VAR);
6882 return LIT64(0xFFFFFFFFFFFFFFFF);
6883 }
6884 aSigExtra = 0;
6885 aSig <<= -shiftCount;
6886 } else {
6887 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
6888 }
6889 return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
6890 }
6891
6892 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6893 {
6894 signed char current_rounding_mode = STATUS(float_rounding_mode);
6895 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
6896 int64_t v = float64_to_uint64(a STATUS_VAR);
6897 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
6898 return v;
6899 }
6900
6901 #define COMPARE(s, nan_exp) \
6902 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6903 int is_quiet STATUS_PARAM ) \
6904 { \
6905 flag aSign, bSign; \
6906 uint ## s ## _t av, bv; \
6907 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6908 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6909 \
6910 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6911 extractFloat ## s ## Frac( a ) ) || \
6912 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6913 extractFloat ## s ## Frac( b ) )) { \
6914 if (!is_quiet || \
6915 float ## s ## _is_signaling_nan( a ) || \
6916 float ## s ## _is_signaling_nan( b ) ) { \
6917 float_raise( float_flag_invalid STATUS_VAR); \
6918 } \
6919 return float_relation_unordered; \
6920 } \
6921 aSign = extractFloat ## s ## Sign( a ); \
6922 bSign = extractFloat ## s ## Sign( b ); \
6923 av = float ## s ## _val(a); \
6924 bv = float ## s ## _val(b); \
6925 if ( aSign != bSign ) { \
6926 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6927 /* zero case */ \
6928 return float_relation_equal; \
6929 } else { \
6930 return 1 - (2 * aSign); \
6931 } \
6932 } else { \
6933 if (av == bv) { \
6934 return float_relation_equal; \
6935 } else { \
6936 return 1 - 2 * (aSign ^ ( av < bv )); \
6937 } \
6938 } \
6939 } \
6940 \
6941 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6942 { \
6943 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6944 } \
6945 \
6946 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6947 { \
6948 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6949 }
6950
6951 COMPARE(32, 0xff)
6952 COMPARE(64, 0x7ff)
6953
6954 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6955 int is_quiet STATUS_PARAM )
6956 {
6957 flag aSign, bSign;
6958
6959 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6960 ( extractFloatx80Frac( a )<<1 ) ) ||
6961 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6962 ( extractFloatx80Frac( b )<<1 ) )) {
6963 if (!is_quiet ||
6964 floatx80_is_signaling_nan( a ) ||
6965 floatx80_is_signaling_nan( b ) ) {
6966 float_raise( float_flag_invalid STATUS_VAR);
6967 }
6968 return float_relation_unordered;
6969 }
6970 aSign = extractFloatx80Sign( a );
6971 bSign = extractFloatx80Sign( b );
6972 if ( aSign != bSign ) {
6973
6974 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6975 ( ( a.low | b.low ) == 0 ) ) {
6976 /* zero case */
6977 return float_relation_equal;
6978 } else {
6979 return 1 - (2 * aSign);
6980 }
6981 } else {
6982 if (a.low == b.low && a.high == b.high) {
6983 return float_relation_equal;
6984 } else {
6985 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6986 }
6987 }
6988 }
6989
6990 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6991 {
6992 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6993 }
6994
6995 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6996 {
6997 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6998 }
6999
7000 INLINE int float128_compare_internal( float128 a, float128 b,
7001 int is_quiet STATUS_PARAM )
7002 {
7003 flag aSign, bSign;
7004
7005 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7006 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7007 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7008 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7009 if (!is_quiet ||
7010 float128_is_signaling_nan( a ) ||
7011 float128_is_signaling_nan( b ) ) {
7012 float_raise( float_flag_invalid STATUS_VAR);
7013 }
7014 return float_relation_unordered;
7015 }
7016 aSign = extractFloat128Sign( a );
7017 bSign = extractFloat128Sign( b );
7018 if ( aSign != bSign ) {
7019 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7020 /* zero case */
7021 return float_relation_equal;
7022 } else {
7023 return 1 - (2 * aSign);
7024 }
7025 } else {
7026 if (a.low == b.low && a.high == b.high) {
7027 return float_relation_equal;
7028 } else {
7029 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7030 }
7031 }
7032 }
7033
7034 int float128_compare( float128 a, float128 b STATUS_PARAM )
7035 {
7036 return float128_compare_internal(a, b, 0 STATUS_VAR);
7037 }
7038
7039 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
7040 {
7041 return float128_compare_internal(a, b, 1 STATUS_VAR);
7042 }
7043
7044 /* min() and max() functions. These can't be implemented as
7045 * 'compare and pick one input' because that would mishandle
7046 * NaNs and +0 vs -0.
7047 *
7048 * minnum() and maxnum() functions. These are similar to the min()
7049 * and max() functions but if one of the arguments is a QNaN and
7050 * the other is numerical then the numerical argument is returned.
7051 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7052 * and maxNum() operations. min() and max() are the typical min/max
7053 * semantics provided by many CPUs which predate that specification.
7054 */
7055 #define MINMAX(s) \
7056 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7057 int ismin, int isieee STATUS_PARAM) \
7058 { \
7059 flag aSign, bSign; \
7060 uint ## s ## _t av, bv; \
7061 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7062 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7063 if (float ## s ## _is_any_nan(a) || \
7064 float ## s ## _is_any_nan(b)) { \
7065 if (isieee) { \
7066 if (float ## s ## _is_quiet_nan(a) && \
7067 !float ## s ##_is_any_nan(b)) { \
7068 return b; \
7069 } else if (float ## s ## _is_quiet_nan(b) && \
7070 !float ## s ## _is_any_nan(a)) { \
7071 return a; \
7072 } \
7073 } \
7074 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
7075 } \
7076 aSign = extractFloat ## s ## Sign(a); \
7077 bSign = extractFloat ## s ## Sign(b); \
7078 av = float ## s ## _val(a); \
7079 bv = float ## s ## _val(b); \
7080 if (aSign != bSign) { \
7081 if (ismin) { \
7082 return aSign ? a : b; \
7083 } else { \
7084 return aSign ? b : a; \
7085 } \
7086 } else { \
7087 if (ismin) { \
7088 return (aSign ^ (av < bv)) ? a : b; \
7089 } else { \
7090 return (aSign ^ (av < bv)) ? b : a; \
7091 } \
7092 } \
7093 } \
7094 \
7095 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
7096 { \
7097 return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
7098 } \
7099 \
7100 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7101 { \
7102 return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
7103 } \
7104 \
7105 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7106 { \
7107 return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
7108 } \
7109 \
7110 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7111 { \
7112 return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
7113 }
7114
7115 MINMAX(32)
7116 MINMAX(64)
7117
7118
7119 /* Multiply A by 2 raised to the power N. */
7120 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7121 {
7122 flag aSign;
7123 int16_t aExp;
7124 uint32_t aSig;
7125
7126 a = float32_squash_input_denormal(a STATUS_VAR);
7127 aSig = extractFloat32Frac( a );
7128 aExp = extractFloat32Exp( a );
7129 aSign = extractFloat32Sign( a );
7130
7131 if ( aExp == 0xFF ) {
7132 if ( aSig ) {
7133 return propagateFloat32NaN( a, a STATUS_VAR );
7134 }
7135 return a;
7136 }
7137 if (aExp != 0) {
7138 aSig |= 0x00800000;
7139 } else if (aSig == 0) {
7140 return a;
7141 } else {
7142 aExp++;
7143 }
7144
7145 if (n > 0x200) {
7146 n = 0x200;
7147 } else if (n < -0x200) {
7148 n = -0x200;
7149 }
7150
7151 aExp += n - 1;
7152 aSig <<= 7;
7153 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7154 }
7155
7156 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7157 {
7158 flag aSign;
7159 int16_t aExp;
7160 uint64_t aSig;
7161
7162 a = float64_squash_input_denormal(a STATUS_VAR);
7163 aSig = extractFloat64Frac( a );
7164 aExp = extractFloat64Exp( a );
7165 aSign = extractFloat64Sign( a );
7166
7167 if ( aExp == 0x7FF ) {
7168 if ( aSig ) {
7169 return propagateFloat64NaN( a, a STATUS_VAR );
7170 }
7171 return a;
7172 }
7173 if (aExp != 0) {
7174 aSig |= LIT64( 0x0010000000000000 );
7175 } else if (aSig == 0) {
7176 return a;
7177 } else {
7178 aExp++;
7179 }
7180
7181 if (n > 0x1000) {
7182 n = 0x1000;
7183 } else if (n < -0x1000) {
7184 n = -0x1000;
7185 }
7186
7187 aExp += n - 1;
7188 aSig <<= 10;
7189 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7190 }
7191
7192 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7193 {
7194 flag aSign;
7195 int32_t aExp;
7196 uint64_t aSig;
7197
7198 aSig = extractFloatx80Frac( a );
7199 aExp = extractFloatx80Exp( a );
7200 aSign = extractFloatx80Sign( a );
7201
7202 if ( aExp == 0x7FFF ) {
7203 if ( aSig<<1 ) {
7204 return propagateFloatx80NaN( a, a STATUS_VAR );
7205 }
7206 return a;
7207 }
7208
7209 if (aExp == 0) {
7210 if (aSig == 0) {
7211 return a;
7212 }
7213 aExp++;
7214 }
7215
7216 if (n > 0x10000) {
7217 n = 0x10000;
7218 } else if (n < -0x10000) {
7219 n = -0x10000;
7220 }
7221
7222 aExp += n;
7223 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7224 aSign, aExp, aSig, 0 STATUS_VAR );
7225 }
7226
7227 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7228 {
7229 flag aSign;
7230 int32_t aExp;
7231 uint64_t aSig0, aSig1;
7232
7233 aSig1 = extractFloat128Frac1( a );
7234 aSig0 = extractFloat128Frac0( a );
7235 aExp = extractFloat128Exp( a );
7236 aSign = extractFloat128Sign( a );
7237 if ( aExp == 0x7FFF ) {
7238 if ( aSig0 | aSig1 ) {
7239 return propagateFloat128NaN( a, a STATUS_VAR );
7240 }
7241 return a;
7242 }
7243 if (aExp != 0) {
7244 aSig0 |= LIT64( 0x0001000000000000 );
7245 } else if (aSig0 == 0 && aSig1 == 0) {
7246 return a;
7247 } else {
7248 aExp++;
7249 }
7250
7251 if (n > 0x10000) {
7252 n = 0x10000;
7253 } else if (n < -0x10000) {
7254 n = -0x10000;
7255 }
7256
7257 aExp += n - 1;
7258 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7259 STATUS_VAR );
7260
7261 }