]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
contrib: Clean up includes
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
16017c48
PM
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
8d725fac 16 */
158142c2 17
a7d1ac78
PM
18/*
19===============================================================================
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
158142c2
FB
22
23Written by John R. Hauser. This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704. Funding was partially provided by the
26National Science Foundation under grant MIP-9311980. The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek. More information
a7d1ac78 30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
158142c2
FB
31arithmetic/SoftFloat.html'.
32
a7d1ac78
PM
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
158142c2
FB
38
39Derivative works are acceptable, even for commercial purposes, so long as
a7d1ac78
PM
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
158142c2 43
a7d1ac78
PM
44===============================================================================
45*/
158142c2 46
16017c48
PM
47/* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78/* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
2ac8bd03
PM
82/* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85#include "config.h"
86
6b4c305c 87#include "fpu/softfloat.h"
158142c2 88
dc355b76
PM
89/* We only need stdlib for abort() */
90#include <stdlib.h>
91
158142c2
FB
92/*----------------------------------------------------------------------------
93| Primitive arithmetic functions, including multi-word arithmetic, and
94| division and square root approximations. (Can be specialized to target if
95| desired.)
96*----------------------------------------------------------------------------*/
97#include "softfloat-macros.h"
98
99/*----------------------------------------------------------------------------
100| Functions and definitions to determine: (1) whether tininess for underflow
101| is detected before or after rounding by default, (2) what (if anything)
102| happens when exceptions are raised, (3) how signaling NaNs are distinguished
103| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104| are propagated from function inputs to output. These details are target-
105| specific.
106*----------------------------------------------------------------------------*/
107#include "softfloat-specialize.h"
108
bb4d4bb3
PM
109/*----------------------------------------------------------------------------
110| Returns the fraction bits of the half-precision floating-point value `a'.
111*----------------------------------------------------------------------------*/
112
a49db98d 113static inline uint32_t extractFloat16Frac(float16 a)
bb4d4bb3
PM
114{
115 return float16_val(a) & 0x3ff;
116}
117
118/*----------------------------------------------------------------------------
119| Returns the exponent bits of the half-precision floating-point value `a'.
120*----------------------------------------------------------------------------*/
121
a49db98d 122static inline int_fast16_t extractFloat16Exp(float16 a)
bb4d4bb3
PM
123{
124 return (float16_val(a) >> 10) & 0x1f;
125}
126
127/*----------------------------------------------------------------------------
128| Returns the sign bit of the half-precision floating-point value `a'.
129*----------------------------------------------------------------------------*/
130
a49db98d 131static inline flag extractFloat16Sign(float16 a)
bb4d4bb3
PM
132{
133 return float16_val(a)>>15;
134}
135
158142c2
FB
136/*----------------------------------------------------------------------------
137| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
138| and 7, and returns the properly rounded 32-bit integer corresponding to the
139| input. If `zSign' is 1, the input is negated before being converted to an
140| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
141| is simply rounded to an integer, with the inexact exception raised if the
142| input cannot be represented exactly as an integer. However, if the fixed-
143| point input is too large, the invalid exception is raised and the largest
144| positive or negative integer is returned.
145*----------------------------------------------------------------------------*/
146
f4014512 147static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
158142c2 148{
8f506c70 149 int8_t roundingMode;
158142c2 150 flag roundNearestEven;
8f506c70 151 int8_t roundIncrement, roundBits;
760e1416 152 int32_t z;
158142c2 153
a2f2d288 154 roundingMode = status->float_rounding_mode;
158142c2 155 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
156 switch (roundingMode) {
157 case float_round_nearest_even:
f9288a76 158 case float_round_ties_away:
dc355b76
PM
159 roundIncrement = 0x40;
160 break;
161 case float_round_to_zero:
162 roundIncrement = 0;
163 break;
164 case float_round_up:
165 roundIncrement = zSign ? 0 : 0x7f;
166 break;
167 case float_round_down:
168 roundIncrement = zSign ? 0x7f : 0;
169 break;
170 default:
171 abort();
158142c2
FB
172 }
173 roundBits = absZ & 0x7F;
174 absZ = ( absZ + roundIncrement )>>7;
175 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
176 z = absZ;
177 if ( zSign ) z = - z;
178 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
ff32e16e 179 float_raise(float_flag_invalid, status);
bb98fe42 180 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2 181 }
a2f2d288
PM
182 if (roundBits) {
183 status->float_exception_flags |= float_flag_inexact;
184 }
158142c2
FB
185 return z;
186
187}
188
189/*----------------------------------------------------------------------------
190| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
191| `absZ1', with binary point between bits 63 and 64 (between the input words),
192| and returns the properly rounded 64-bit integer corresponding to the input.
193| If `zSign' is 1, the input is negated before being converted to an integer.
194| Ordinarily, the fixed-point input is simply rounded to an integer, with
195| the inexact exception raised if the input cannot be represented exactly as
196| an integer. However, if the fixed-point input is too large, the invalid
197| exception is raised and the largest positive or negative integer is
198| returned.
199*----------------------------------------------------------------------------*/
200
f42c2224 201static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
e5a41ffa 202 float_status *status)
158142c2 203{
8f506c70 204 int8_t roundingMode;
158142c2 205 flag roundNearestEven, increment;
760e1416 206 int64_t z;
158142c2 207
a2f2d288 208 roundingMode = status->float_rounding_mode;
158142c2 209 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
210 switch (roundingMode) {
211 case float_round_nearest_even:
f9288a76 212 case float_round_ties_away:
dc355b76
PM
213 increment = ((int64_t) absZ1 < 0);
214 break;
215 case float_round_to_zero:
216 increment = 0;
217 break;
218 case float_round_up:
219 increment = !zSign && absZ1;
220 break;
221 case float_round_down:
222 increment = zSign && absZ1;
223 break;
224 default:
225 abort();
158142c2
FB
226 }
227 if ( increment ) {
228 ++absZ0;
229 if ( absZ0 == 0 ) goto overflow;
bb98fe42 230 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
231 }
232 z = absZ0;
233 if ( zSign ) z = - z;
234 if ( z && ( ( z < 0 ) ^ zSign ) ) {
235 overflow:
ff32e16e 236 float_raise(float_flag_invalid, status);
158142c2 237 return
bb98fe42 238 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
239 : LIT64( 0x7FFFFFFFFFFFFFFF );
240 }
a2f2d288
PM
241 if (absZ1) {
242 status->float_exception_flags |= float_flag_inexact;
243 }
158142c2
FB
244 return z;
245
246}
247
fb3ea83a
TM
248/*----------------------------------------------------------------------------
249| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
250| `absZ1', with binary point between bits 63 and 64 (between the input words),
251| and returns the properly rounded 64-bit unsigned integer corresponding to the
252| input. Ordinarily, the fixed-point input is simply rounded to an integer,
253| with the inexact exception raised if the input cannot be represented exactly
254| as an integer. However, if the fixed-point input is too large, the invalid
255| exception is raised and the largest unsigned integer is returned.
256*----------------------------------------------------------------------------*/
257
f42c2224 258static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
e5a41ffa 259 uint64_t absZ1, float_status *status)
fb3ea83a 260{
8f506c70 261 int8_t roundingMode;
fb3ea83a
TM
262 flag roundNearestEven, increment;
263
a2f2d288 264 roundingMode = status->float_rounding_mode;
fb3ea83a 265 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
266 switch (roundingMode) {
267 case float_round_nearest_even:
f9288a76 268 case float_round_ties_away:
dc355b76
PM
269 increment = ((int64_t)absZ1 < 0);
270 break;
271 case float_round_to_zero:
272 increment = 0;
273 break;
274 case float_round_up:
275 increment = !zSign && absZ1;
276 break;
277 case float_round_down:
278 increment = zSign && absZ1;
279 break;
280 default:
281 abort();
fb3ea83a
TM
282 }
283 if (increment) {
284 ++absZ0;
285 if (absZ0 == 0) {
ff32e16e 286 float_raise(float_flag_invalid, status);
fb3ea83a
TM
287 return LIT64(0xFFFFFFFFFFFFFFFF);
288 }
289 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
290 }
291
292 if (zSign && absZ0) {
ff32e16e 293 float_raise(float_flag_invalid, status);
fb3ea83a
TM
294 return 0;
295 }
296
297 if (absZ1) {
a2f2d288 298 status->float_exception_flags |= float_flag_inexact;
fb3ea83a
TM
299 }
300 return absZ0;
301}
302
158142c2
FB
303/*----------------------------------------------------------------------------
304| Returns the fraction bits of the single-precision floating-point value `a'.
305*----------------------------------------------------------------------------*/
306
a49db98d 307static inline uint32_t extractFloat32Frac( float32 a )
158142c2
FB
308{
309
f090c9d4 310 return float32_val(a) & 0x007FFFFF;
158142c2
FB
311
312}
313
314/*----------------------------------------------------------------------------
315| Returns the exponent bits of the single-precision floating-point value `a'.
316*----------------------------------------------------------------------------*/
317
a49db98d 318static inline int_fast16_t extractFloat32Exp(float32 a)
158142c2
FB
319{
320
f090c9d4 321 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
322
323}
324
325/*----------------------------------------------------------------------------
326| Returns the sign bit of the single-precision floating-point value `a'.
327*----------------------------------------------------------------------------*/
328
a49db98d 329static inline flag extractFloat32Sign( float32 a )
158142c2
FB
330{
331
f090c9d4 332 return float32_val(a)>>31;
158142c2
FB
333
334}
335
37d18660
PM
336/*----------------------------------------------------------------------------
337| If `a' is denormal and we are in flush-to-zero mode then set the
338| input-denormal exception and return zero. Otherwise just return the value.
339*----------------------------------------------------------------------------*/
e5a41ffa 340float32 float32_squash_input_denormal(float32 a, float_status *status)
37d18660 341{
a2f2d288 342 if (status->flush_inputs_to_zero) {
37d18660 343 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
ff32e16e 344 float_raise(float_flag_input_denormal, status);
37d18660
PM
345 return make_float32(float32_val(a) & 0x80000000);
346 }
347 }
348 return a;
349}
350
158142c2
FB
351/*----------------------------------------------------------------------------
352| Normalizes the subnormal single-precision floating-point value represented
353| by the denormalized significand `aSig'. The normalized exponent and
354| significand are stored at the locations pointed to by `zExpPtr' and
355| `zSigPtr', respectively.
356*----------------------------------------------------------------------------*/
357
static void
 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that leaves exactly 8 leading zero bits in the result. */
    int8_t lshift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    /* Each bit shifted left lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - lshift;
}
368
369/*----------------------------------------------------------------------------
370| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
371| single-precision floating-point value, returning the result. After being
372| shifted into the proper positions, the three fields are simply added
373| together to form the result. This means that any integer portion of `zSig'
374| will be added into the exponent. Since a properly normalized significand
375| will have an integer portion equal to 1, the `zExp' input should be 1 less
376| than the desired result exponent whenever `zSig' is a complete, normalized
377| significand.
378*----------------------------------------------------------------------------*/
379
a49db98d 380static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
158142c2
FB
381{
382
f090c9d4 383 return make_float32(
bb98fe42 384 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
385
386}
387
388/*----------------------------------------------------------------------------
389| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
390| and significand `zSig', and returns the proper single-precision floating-
391| point value corresponding to the abstract input. Ordinarily, the abstract
392| value is simply rounded and packed into the single-precision format, with
393| the inexact exception raised if the abstract input cannot be represented
394| exactly. However, if the abstract value is too large, the overflow and
395| inexact exceptions are raised and an infinity or maximal finite value is
396| returned. If the abstract value is too small, the input value is rounded to
397| a subnormal number, and the underflow and inexact exceptions are raised if
398| the abstract input cannot be represented exactly as a subnormal single-
399| precision floating-point number.
400| The input significand `zSig' has its binary point between bits 30
401| and 29, which is 7 bits to the left of the usual location. This shifted
402| significand must be normalized or smaller. If `zSig' is not normalized,
403| `zExp' must be 0; in that case, the result returned is a subnormal number,
404| and it must not require rounding. In the usual case that `zSig' is
405| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
406| The handling of underflow and overflow follows the IEC/IEEE Standard for
407| Binary Floating-Point Arithmetic.
408*----------------------------------------------------------------------------*/
409
e5a41ffa
PM
410static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
411 float_status *status)
158142c2 412{
8f506c70 413 int8_t roundingMode;
158142c2 414 flag roundNearestEven;
8f506c70 415 int8_t roundIncrement, roundBits;
158142c2
FB
416 flag isTiny;
417
a2f2d288 418 roundingMode = status->float_rounding_mode;
158142c2 419 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
420 switch (roundingMode) {
421 case float_round_nearest_even:
f9288a76 422 case float_round_ties_away:
dc355b76
PM
423 roundIncrement = 0x40;
424 break;
425 case float_round_to_zero:
426 roundIncrement = 0;
427 break;
428 case float_round_up:
429 roundIncrement = zSign ? 0 : 0x7f;
430 break;
431 case float_round_down:
432 roundIncrement = zSign ? 0x7f : 0;
433 break;
434 default:
435 abort();
436 break;
158142c2
FB
437 }
438 roundBits = zSig & 0x7F;
bb98fe42 439 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
440 if ( ( 0xFD < zExp )
441 || ( ( zExp == 0xFD )
bb98fe42 442 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 443 ) {
ff32e16e 444 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 445 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
446 }
447 if ( zExp < 0 ) {
a2f2d288 448 if (status->flush_to_zero) {
ff32e16e 449 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
450 return packFloat32(zSign, 0, 0);
451 }
158142c2 452 isTiny =
a2f2d288
PM
453 (status->float_detect_tininess
454 == float_tininess_before_rounding)
158142c2
FB
455 || ( zExp < -1 )
456 || ( zSig + roundIncrement < 0x80000000 );
457 shift32RightJamming( zSig, - zExp, &zSig );
458 zExp = 0;
459 roundBits = zSig & 0x7F;
ff32e16e
PM
460 if (isTiny && roundBits) {
461 float_raise(float_flag_underflow, status);
462 }
158142c2
FB
463 }
464 }
a2f2d288
PM
465 if (roundBits) {
466 status->float_exception_flags |= float_flag_inexact;
467 }
158142c2
FB
468 zSig = ( zSig + roundIncrement )>>7;
469 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
470 if ( zSig == 0 ) zExp = 0;
471 return packFloat32( zSign, zExp, zSig );
472
473}
474
475/*----------------------------------------------------------------------------
476| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
477| and significand `zSig', and returns the proper single-precision floating-
478| point value corresponding to the abstract input. This routine is just like
479| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
480| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
481| floating-point exponent.
482*----------------------------------------------------------------------------*/
483
484static float32
e5a41ffa
PM
485 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
486 float_status *status)
158142c2 487{
8f506c70 488 int8_t shiftCount;
158142c2
FB
489
490 shiftCount = countLeadingZeros32( zSig ) - 1;
ff32e16e
PM
491 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
492 status);
158142c2
FB
493
494}
495
496/*----------------------------------------------------------------------------
497| Returns the fraction bits of the double-precision floating-point value `a'.
498*----------------------------------------------------------------------------*/
499
a49db98d 500static inline uint64_t extractFloat64Frac( float64 a )
158142c2
FB
501{
502
f090c9d4 503 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
504
505}
506
507/*----------------------------------------------------------------------------
508| Returns the exponent bits of the double-precision floating-point value `a'.
509*----------------------------------------------------------------------------*/
510
a49db98d 511static inline int_fast16_t extractFloat64Exp(float64 a)
158142c2
FB
512{
513
f090c9d4 514 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
515
516}
517
518/*----------------------------------------------------------------------------
519| Returns the sign bit of the double-precision floating-point value `a'.
520*----------------------------------------------------------------------------*/
521
a49db98d 522static inline flag extractFloat64Sign( float64 a )
158142c2
FB
523{
524
f090c9d4 525 return float64_val(a)>>63;
158142c2
FB
526
527}
528
37d18660
PM
529/*----------------------------------------------------------------------------
530| If `a' is denormal and we are in flush-to-zero mode then set the
531| input-denormal exception and return zero. Otherwise just return the value.
532*----------------------------------------------------------------------------*/
e5a41ffa 533float64 float64_squash_input_denormal(float64 a, float_status *status)
37d18660 534{
a2f2d288 535 if (status->flush_inputs_to_zero) {
37d18660 536 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
ff32e16e 537 float_raise(float_flag_input_denormal, status);
37d18660
PM
538 return make_float64(float64_val(a) & (1ULL << 63));
539 }
540 }
541 return a;
542}
543
158142c2
FB
544/*----------------------------------------------------------------------------
545| Normalizes the subnormal double-precision floating-point value represented
546| by the denormalized significand `aSig'. The normalized exponent and
547| significand are stored at the locations pointed to by `zExpPtr' and
548| `zSigPtr', respectively.
549*----------------------------------------------------------------------------*/
550
static void
 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
{
    /* Left shift that leaves exactly 11 leading zero bits in the result. */
    int8_t lshift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    /* Each bit shifted left lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - lshift;
}
561
562/*----------------------------------------------------------------------------
563| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
564| double-precision floating-point value, returning the result. After being
565| shifted into the proper positions, the three fields are simply added
566| together to form the result. This means that any integer portion of `zSig'
567| will be added into the exponent. Since a properly normalized significand
568| will have an integer portion equal to 1, the `zExp' input should be 1 less
569| than the desired result exponent whenever `zSig' is a complete, normalized
570| significand.
571*----------------------------------------------------------------------------*/
572
a49db98d 573static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
158142c2
FB
574{
575
f090c9d4 576 return make_float64(
bb98fe42 577 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
578
579}
580
581/*----------------------------------------------------------------------------
582| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
583| and significand `zSig', and returns the proper double-precision floating-
584| point value corresponding to the abstract input. Ordinarily, the abstract
585| value is simply rounded and packed into the double-precision format, with
586| the inexact exception raised if the abstract input cannot be represented
587| exactly. However, if the abstract value is too large, the overflow and
588| inexact exceptions are raised and an infinity or maximal finite value is
a7d1ac78
PM
589| returned. If the abstract value is too small, the input value is rounded to
590| a subnormal number, and the underflow and inexact exceptions are raised if
591| the abstract input cannot be represented exactly as a subnormal double-
158142c2
FB
592| precision floating-point number.
593| The input significand `zSig' has its binary point between bits 62
594| and 61, which is 10 bits to the left of the usual location. This shifted
595| significand must be normalized or smaller. If `zSig' is not normalized,
596| `zExp' must be 0; in that case, the result returned is a subnormal number,
597| and it must not require rounding. In the usual case that `zSig' is
598| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
599| The handling of underflow and overflow follows the IEC/IEEE Standard for
600| Binary Floating-Point Arithmetic.
601*----------------------------------------------------------------------------*/
602
e5a41ffa
PM
603static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
604 float_status *status)
158142c2 605{
8f506c70 606 int8_t roundingMode;
158142c2 607 flag roundNearestEven;
94a49d86 608 int_fast16_t roundIncrement, roundBits;
158142c2
FB
609 flag isTiny;
610
a2f2d288 611 roundingMode = status->float_rounding_mode;
158142c2 612 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
613 switch (roundingMode) {
614 case float_round_nearest_even:
f9288a76 615 case float_round_ties_away:
dc355b76
PM
616 roundIncrement = 0x200;
617 break;
618 case float_round_to_zero:
619 roundIncrement = 0;
620 break;
621 case float_round_up:
622 roundIncrement = zSign ? 0 : 0x3ff;
623 break;
624 case float_round_down:
625 roundIncrement = zSign ? 0x3ff : 0;
626 break;
627 default:
628 abort();
158142c2
FB
629 }
630 roundBits = zSig & 0x3FF;
bb98fe42 631 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
632 if ( ( 0x7FD < zExp )
633 || ( ( zExp == 0x7FD )
bb98fe42 634 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 635 ) {
ff32e16e 636 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 637 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
638 }
639 if ( zExp < 0 ) {
a2f2d288 640 if (status->flush_to_zero) {
ff32e16e 641 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
642 return packFloat64(zSign, 0, 0);
643 }
158142c2 644 isTiny =
a2f2d288
PM
645 (status->float_detect_tininess
646 == float_tininess_before_rounding)
158142c2
FB
647 || ( zExp < -1 )
648 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
649 shift64RightJamming( zSig, - zExp, &zSig );
650 zExp = 0;
651 roundBits = zSig & 0x3FF;
ff32e16e
PM
652 if (isTiny && roundBits) {
653 float_raise(float_flag_underflow, status);
654 }
158142c2
FB
655 }
656 }
a2f2d288
PM
657 if (roundBits) {
658 status->float_exception_flags |= float_flag_inexact;
659 }
158142c2
FB
660 zSig = ( zSig + roundIncrement )>>10;
661 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
662 if ( zSig == 0 ) zExp = 0;
663 return packFloat64( zSign, zExp, zSig );
664
665}
666
667/*----------------------------------------------------------------------------
668| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
669| and significand `zSig', and returns the proper double-precision floating-
670| point value corresponding to the abstract input. This routine is just like
671| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
672| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
673| floating-point exponent.
674*----------------------------------------------------------------------------*/
675
676static float64
e5a41ffa
PM
677 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
678 float_status *status)
158142c2 679{
8f506c70 680 int8_t shiftCount;
158142c2
FB
681
682 shiftCount = countLeadingZeros64( zSig ) - 1;
ff32e16e
PM
683 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
684 status);
158142c2
FB
685
686}
687
158142c2
FB
688/*----------------------------------------------------------------------------
689| Returns the fraction bits of the extended double-precision floating-point
690| value `a'.
691*----------------------------------------------------------------------------*/
692
a49db98d 693static inline uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
694{
695
696 return a.low;
697
698}
699
700/*----------------------------------------------------------------------------
701| Returns the exponent bits of the extended double-precision floating-point
702| value `a'.
703*----------------------------------------------------------------------------*/
704
f4014512 705static inline int32_t extractFloatx80Exp( floatx80 a )
158142c2
FB
706{
707
708 return a.high & 0x7FFF;
709
710}
711
712/*----------------------------------------------------------------------------
713| Returns the sign bit of the extended double-precision floating-point value
714| `a'.
715*----------------------------------------------------------------------------*/
716
a49db98d 717static inline flag extractFloatx80Sign( floatx80 a )
158142c2
FB
718{
719
720 return a.high>>15;
721
722}
723
724/*----------------------------------------------------------------------------
725| Normalizes the subnormal extended double-precision floating-point value
726| represented by the denormalized significand `aSig'. The normalized exponent
727| and significand are stored at the locations pointed to by `zExpPtr' and
728| `zSigPtr', respectively.
729*----------------------------------------------------------------------------*/
730
static void
 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
{
    /* Left shift that brings the leading 1 of aSig up to bit 63. */
    int8_t lshift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << lshift;
    /* Each bit shifted left lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - lshift;
}
741
742/*----------------------------------------------------------------------------
743| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
744| extended double-precision floating-point value, returning the result.
745*----------------------------------------------------------------------------*/
746
f4014512 747static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
158142c2
FB
748{
749 floatx80 z;
750
751 z.low = zSig;
bb98fe42 752 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
753 return z;
754
755}
756
757/*----------------------------------------------------------------------------
758| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
759| and extended significand formed by the concatenation of `zSig0' and `zSig1',
760| and returns the proper extended double-precision floating-point value
761| corresponding to the abstract input. Ordinarily, the abstract value is
762| rounded and packed into the extended double-precision format, with the
763| inexact exception raised if the abstract input cannot be represented
764| exactly. However, if the abstract value is too large, the overflow and
765| inexact exceptions are raised and an infinity or maximal finite value is
766| returned. If the abstract value is too small, the input value is rounded to
767| a subnormal number, and the underflow and inexact exceptions are raised if
768| the abstract input cannot be represented exactly as a subnormal extended
769| double-precision floating-point number.
770| If `roundingPrecision' is 32 or 64, the result is rounded to the same
771| number of bits as single or double precision, respectively. Otherwise, the
772| result is rounded to the full precision of the extended double-precision
773| format.
774| The input significand must be normalized or smaller. If the input
775| significand is not normalized, `zExp' must be 0; in that case, the result
776| returned is a subnormal number, and it must not require rounding. The
777| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
778| Floating-Point Arithmetic.
779*----------------------------------------------------------------------------*/
780
8f506c70 781static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
f4014512 782 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
e5a41ffa 783 float_status *status)
158142c2 784{
8f506c70 785 int8_t roundingMode;
158142c2 786 flag roundNearestEven, increment, isTiny;
f42c2224 787 int64_t roundIncrement, roundMask, roundBits;
158142c2 788
a2f2d288 789 roundingMode = status->float_rounding_mode;
158142c2
FB
790 roundNearestEven = ( roundingMode == float_round_nearest_even );
791 if ( roundingPrecision == 80 ) goto precision80;
792 if ( roundingPrecision == 64 ) {
793 roundIncrement = LIT64( 0x0000000000000400 );
794 roundMask = LIT64( 0x00000000000007FF );
795 }
796 else if ( roundingPrecision == 32 ) {
797 roundIncrement = LIT64( 0x0000008000000000 );
798 roundMask = LIT64( 0x000000FFFFFFFFFF );
799 }
800 else {
801 goto precision80;
802 }
803 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
804 switch (roundingMode) {
805 case float_round_nearest_even:
f9288a76 806 case float_round_ties_away:
dc355b76
PM
807 break;
808 case float_round_to_zero:
809 roundIncrement = 0;
810 break;
811 case float_round_up:
812 roundIncrement = zSign ? 0 : roundMask;
813 break;
814 case float_round_down:
815 roundIncrement = zSign ? roundMask : 0;
816 break;
817 default:
818 abort();
158142c2
FB
819 }
820 roundBits = zSig0 & roundMask;
bb98fe42 821 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
822 if ( ( 0x7FFE < zExp )
823 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
824 ) {
825 goto overflow;
826 }
827 if ( zExp <= 0 ) {
a2f2d288 828 if (status->flush_to_zero) {
ff32e16e 829 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
830 return packFloatx80(zSign, 0, 0);
831 }
158142c2 832 isTiny =
a2f2d288
PM
833 (status->float_detect_tininess
834 == float_tininess_before_rounding)
158142c2
FB
835 || ( zExp < 0 )
836 || ( zSig0 <= zSig0 + roundIncrement );
837 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
838 zExp = 0;
839 roundBits = zSig0 & roundMask;
ff32e16e
PM
840 if (isTiny && roundBits) {
841 float_raise(float_flag_underflow, status);
842 }
a2f2d288
PM
843 if (roundBits) {
844 status->float_exception_flags |= float_flag_inexact;
845 }
158142c2 846 zSig0 += roundIncrement;
bb98fe42 847 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
848 roundIncrement = roundMask + 1;
849 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
850 roundMask |= roundIncrement;
851 }
852 zSig0 &= ~ roundMask;
853 return packFloatx80( zSign, zExp, zSig0 );
854 }
855 }
a2f2d288
PM
856 if (roundBits) {
857 status->float_exception_flags |= float_flag_inexact;
858 }
158142c2
FB
859 zSig0 += roundIncrement;
860 if ( zSig0 < roundIncrement ) {
861 ++zExp;
862 zSig0 = LIT64( 0x8000000000000000 );
863 }
864 roundIncrement = roundMask + 1;
865 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
866 roundMask |= roundIncrement;
867 }
868 zSig0 &= ~ roundMask;
869 if ( zSig0 == 0 ) zExp = 0;
870 return packFloatx80( zSign, zExp, zSig0 );
871 precision80:
dc355b76
PM
872 switch (roundingMode) {
873 case float_round_nearest_even:
f9288a76 874 case float_round_ties_away:
dc355b76
PM
875 increment = ((int64_t)zSig1 < 0);
876 break;
877 case float_round_to_zero:
878 increment = 0;
879 break;
880 case float_round_up:
881 increment = !zSign && zSig1;
882 break;
883 case float_round_down:
884 increment = zSign && zSig1;
885 break;
886 default:
887 abort();
158142c2 888 }
bb98fe42 889 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
890 if ( ( 0x7FFE < zExp )
891 || ( ( zExp == 0x7FFE )
892 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
893 && increment
894 )
895 ) {
896 roundMask = 0;
897 overflow:
ff32e16e 898 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
899 if ( ( roundingMode == float_round_to_zero )
900 || ( zSign && ( roundingMode == float_round_up ) )
901 || ( ! zSign && ( roundingMode == float_round_down ) )
902 ) {
903 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
904 }
905 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
906 }
907 if ( zExp <= 0 ) {
908 isTiny =
a2f2d288
PM
909 (status->float_detect_tininess
910 == float_tininess_before_rounding)
158142c2
FB
911 || ( zExp < 0 )
912 || ! increment
913 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
914 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
915 zExp = 0;
ff32e16e
PM
916 if (isTiny && zSig1) {
917 float_raise(float_flag_underflow, status);
918 }
a2f2d288
PM
919 if (zSig1) {
920 status->float_exception_flags |= float_flag_inexact;
921 }
dc355b76
PM
922 switch (roundingMode) {
923 case float_round_nearest_even:
f9288a76 924 case float_round_ties_away:
dc355b76
PM
925 increment = ((int64_t)zSig1 < 0);
926 break;
927 case float_round_to_zero:
928 increment = 0;
929 break;
930 case float_round_up:
931 increment = !zSign && zSig1;
932 break;
933 case float_round_down:
934 increment = zSign && zSig1;
935 break;
936 default:
937 abort();
158142c2
FB
938 }
939 if ( increment ) {
940 ++zSig0;
941 zSig0 &=
bb98fe42
AF
942 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
943 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
944 }
945 return packFloatx80( zSign, zExp, zSig0 );
946 }
947 }
a2f2d288
PM
948 if (zSig1) {
949 status->float_exception_flags |= float_flag_inexact;
950 }
158142c2
FB
951 if ( increment ) {
952 ++zSig0;
953 if ( zSig0 == 0 ) {
954 ++zExp;
955 zSig0 = LIT64( 0x8000000000000000 );
956 }
957 else {
bb98fe42 958 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
959 }
960 }
961 else {
962 if ( zSig0 == 0 ) zExp = 0;
963 }
964 return packFloatx80( zSign, zExp, zSig0 );
965
966}
967
968/*----------------------------------------------------------------------------
969| Takes an abstract floating-point value having sign `zSign', exponent
970| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
971| and returns the proper extended double-precision floating-point value
972| corresponding to the abstract input. This routine is just like
973| `roundAndPackFloatx80' except that the input significand does not have to be
974| normalized.
975*----------------------------------------------------------------------------*/
976
8f506c70 977static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
f4014512 978 flag zSign, int32_t zExp,
e5a41ffa
PM
979 uint64_t zSig0, uint64_t zSig1,
980 float_status *status)
158142c2 981{
8f506c70 982 int8_t shiftCount;
158142c2
FB
983
984 if ( zSig0 == 0 ) {
985 zSig0 = zSig1;
986 zSig1 = 0;
987 zExp -= 64;
988 }
989 shiftCount = countLeadingZeros64( zSig0 );
990 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
991 zExp -= shiftCount;
ff32e16e
PM
992 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
993 zSig0, zSig1, status);
158142c2
FB
994
995}
996
158142c2
FB
997/*----------------------------------------------------------------------------
998| Returns the least-significant 64 fraction bits of the quadruple-precision
999| floating-point value `a'.
1000*----------------------------------------------------------------------------*/
1001
a49db98d 1002static inline uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
1003{
1004
1005 return a.low;
1006
1007}
1008
1009/*----------------------------------------------------------------------------
1010| Returns the most-significant 48 fraction bits of the quadruple-precision
1011| floating-point value `a'.
1012*----------------------------------------------------------------------------*/
1013
a49db98d 1014static inline uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
1015{
1016
1017 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1018
1019}
1020
1021/*----------------------------------------------------------------------------
1022| Returns the exponent bits of the quadruple-precision floating-point value
1023| `a'.
1024*----------------------------------------------------------------------------*/
1025
f4014512 1026static inline int32_t extractFloat128Exp( float128 a )
158142c2
FB
1027{
1028
1029 return ( a.high>>48 ) & 0x7FFF;
1030
1031}
1032
1033/*----------------------------------------------------------------------------
1034| Returns the sign bit of the quadruple-precision floating-point value `a'.
1035*----------------------------------------------------------------------------*/
1036
a49db98d 1037static inline flag extractFloat128Sign( float128 a )
158142c2
FB
1038{
1039
1040 return a.high>>63;
1041
1042}
1043
1044/*----------------------------------------------------------------------------
1045| Normalizes the subnormal quadruple-precision floating-point value
1046| represented by the denormalized significand formed by the concatenation of
1047| `aSig0' and `aSig1'. The normalized exponent is stored at the location
1048| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1049| significand are stored at the location pointed to by `zSig0Ptr', and the
1050| least significant 64 bits of the normalized significand are stored at the
1051| location pointed to by `zSig1Ptr'.
1052*----------------------------------------------------------------------------*/
1053
static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr,
                                       uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading set bit is in the high word: move it to bit 48. */
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word empty: the low word supplies every significant bit. */
    shift = countLeadingZeros64(aSig1) - 15;
    if (shift < 0) {
        /* Fewer than 15 leading zeros: the value straddles both words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    } else {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    }
    *zExpPtr = -shift - 63;
}
1084
1085/*----------------------------------------------------------------------------
1086| Packs the sign `zSign', the exponent `zExp', and the significand formed
1087| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1088| floating-point value, returning the result. After being shifted into the
1089| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1090| added together to form the most significant 64 bits of the result. This
1091| means that any integer portion of `zSig0' will be added into the exponent.
1092| Since a properly normalized significand will have an integer portion equal
1093| to 1, the `zExp' input should be 1 less than the desired result exponent
1094| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1095| significand.
1096*----------------------------------------------------------------------------*/
1097
a49db98d 1098static inline float128
f4014512 1099 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
1100{
1101 float128 z;
1102
1103 z.low = zSig1;
bb98fe42 1104 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
1105 return z;
1106
1107}
1108
1109/*----------------------------------------------------------------------------
1110| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1111| and extended significand formed by the concatenation of `zSig0', `zSig1',
1112| and `zSig2', and returns the proper quadruple-precision floating-point value
1113| corresponding to the abstract input. Ordinarily, the abstract value is
1114| simply rounded and packed into the quadruple-precision format, with the
1115| inexact exception raised if the abstract input cannot be represented
1116| exactly. However, if the abstract value is too large, the overflow and
1117| inexact exceptions are raised and an infinity or maximal finite value is
1118| returned. If the abstract value is too small, the input value is rounded to
1119| a subnormal number, and the underflow and inexact exceptions are raised if
1120| the abstract input cannot be represented exactly as a subnormal quadruple-
1121| precision floating-point number.
1122| The input significand must be normalized or smaller. If the input
1123| significand is not normalized, `zExp' must be 0; in that case, the result
1124| returned is a subnormal number, and it must not require rounding. In the
1125| usual case that the input significand is normalized, `zExp' must be 1 less
1126| than the ``true'' floating-point exponent. The handling of underflow and
1127| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1128*----------------------------------------------------------------------------*/
1129
f4014512 1130static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
1131 uint64_t zSig0, uint64_t zSig1,
1132 uint64_t zSig2, float_status *status)
158142c2 1133{
8f506c70 1134 int8_t roundingMode;
158142c2
FB
1135 flag roundNearestEven, increment, isTiny;
1136
a2f2d288 1137 roundingMode = status->float_rounding_mode;
158142c2 1138 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1139 switch (roundingMode) {
1140 case float_round_nearest_even:
f9288a76 1141 case float_round_ties_away:
dc355b76
PM
1142 increment = ((int64_t)zSig2 < 0);
1143 break;
1144 case float_round_to_zero:
1145 increment = 0;
1146 break;
1147 case float_round_up:
1148 increment = !zSign && zSig2;
1149 break;
1150 case float_round_down:
1151 increment = zSign && zSig2;
1152 break;
1153 default:
1154 abort();
158142c2 1155 }
bb98fe42 1156 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1157 if ( ( 0x7FFD < zExp )
1158 || ( ( zExp == 0x7FFD )
1159 && eq128(
1160 LIT64( 0x0001FFFFFFFFFFFF ),
1161 LIT64( 0xFFFFFFFFFFFFFFFF ),
1162 zSig0,
1163 zSig1
1164 )
1165 && increment
1166 )
1167 ) {
ff32e16e 1168 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
1169 if ( ( roundingMode == float_round_to_zero )
1170 || ( zSign && ( roundingMode == float_round_up ) )
1171 || ( ! zSign && ( roundingMode == float_round_down ) )
1172 ) {
1173 return
1174 packFloat128(
1175 zSign,
1176 0x7FFE,
1177 LIT64( 0x0000FFFFFFFFFFFF ),
1178 LIT64( 0xFFFFFFFFFFFFFFFF )
1179 );
1180 }
1181 return packFloat128( zSign, 0x7FFF, 0, 0 );
1182 }
1183 if ( zExp < 0 ) {
a2f2d288 1184 if (status->flush_to_zero) {
ff32e16e 1185 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1186 return packFloat128(zSign, 0, 0, 0);
1187 }
158142c2 1188 isTiny =
a2f2d288
PM
1189 (status->float_detect_tininess
1190 == float_tininess_before_rounding)
158142c2
FB
1191 || ( zExp < -1 )
1192 || ! increment
1193 || lt128(
1194 zSig0,
1195 zSig1,
1196 LIT64( 0x0001FFFFFFFFFFFF ),
1197 LIT64( 0xFFFFFFFFFFFFFFFF )
1198 );
1199 shift128ExtraRightJamming(
1200 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1201 zExp = 0;
ff32e16e
PM
1202 if (isTiny && zSig2) {
1203 float_raise(float_flag_underflow, status);
1204 }
dc355b76
PM
1205 switch (roundingMode) {
1206 case float_round_nearest_even:
f9288a76 1207 case float_round_ties_away:
dc355b76
PM
1208 increment = ((int64_t)zSig2 < 0);
1209 break;
1210 case float_round_to_zero:
1211 increment = 0;
1212 break;
1213 case float_round_up:
1214 increment = !zSign && zSig2;
1215 break;
1216 case float_round_down:
1217 increment = zSign && zSig2;
1218 break;
1219 default:
1220 abort();
158142c2
FB
1221 }
1222 }
1223 }
a2f2d288
PM
1224 if (zSig2) {
1225 status->float_exception_flags |= float_flag_inexact;
1226 }
158142c2
FB
1227 if ( increment ) {
1228 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1229 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1230 }
1231 else {
1232 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1233 }
1234 return packFloat128( zSign, zExp, zSig0, zSig1 );
1235
1236}
1237
1238/*----------------------------------------------------------------------------
1239| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1240| and significand formed by the concatenation of `zSig0' and `zSig1', and
1241| returns the proper quadruple-precision floating-point value corresponding
1242| to the abstract input. This routine is just like `roundAndPackFloat128'
1243| except that the input significand has fewer bits and does not have to be
1244| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1245| point exponent.
1246*----------------------------------------------------------------------------*/
1247
f4014512 1248static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
1249 uint64_t zSig0, uint64_t zSig1,
1250 float_status *status)
158142c2 1251{
8f506c70 1252 int8_t shiftCount;
bb98fe42 1253 uint64_t zSig2;
158142c2
FB
1254
1255 if ( zSig0 == 0 ) {
1256 zSig0 = zSig1;
1257 zSig1 = 0;
1258 zExp -= 64;
1259 }
1260 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1261 if ( 0 <= shiftCount ) {
1262 zSig2 = 0;
1263 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1264 }
1265 else {
1266 shift128ExtraRightJamming(
1267 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1268 }
1269 zExp -= shiftCount;
ff32e16e 1270 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
1271
1272}
1273
158142c2
FB
1274/*----------------------------------------------------------------------------
1275| Returns the result of converting the 32-bit two's complement integer `a'
1276| to the single-precision floating-point format. The conversion is performed
1277| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1278*----------------------------------------------------------------------------*/
1279
e5a41ffa 1280float32 int32_to_float32(int32_t a, float_status *status)
158142c2
FB
1281{
1282 flag zSign;
1283
f090c9d4 1284 if ( a == 0 ) return float32_zero;
bb98fe42 1285 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2 1286 zSign = ( a < 0 );
ff32e16e 1287 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
158142c2
FB
1288}
1289
1290/*----------------------------------------------------------------------------
1291| Returns the result of converting the 32-bit two's complement integer `a'
1292| to the double-precision floating-point format. The conversion is performed
1293| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1294*----------------------------------------------------------------------------*/
1295
e5a41ffa 1296float64 int32_to_float64(int32_t a, float_status *status)
158142c2
FB
1297{
1298 flag zSign;
3a87d009 1299 uint32_t absA;
8f506c70 1300 int8_t shiftCount;
bb98fe42 1301 uint64_t zSig;
158142c2 1302
f090c9d4 1303 if ( a == 0 ) return float64_zero;
158142c2
FB
1304 zSign = ( a < 0 );
1305 absA = zSign ? - a : a;
1306 shiftCount = countLeadingZeros32( absA ) + 21;
1307 zSig = absA;
1308 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1309
1310}
1311
158142c2
FB
1312/*----------------------------------------------------------------------------
1313| Returns the result of converting the 32-bit two's complement integer `a'
1314| to the extended double-precision floating-point format. The conversion
1315| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1316| Arithmetic.
1317*----------------------------------------------------------------------------*/
1318
e5a41ffa 1319floatx80 int32_to_floatx80(int32_t a, float_status *status)
158142c2
FB
1320{
1321 flag zSign;
3a87d009 1322 uint32_t absA;
8f506c70 1323 int8_t shiftCount;
bb98fe42 1324 uint64_t zSig;
158142c2
FB
1325
1326 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1327 zSign = ( a < 0 );
1328 absA = zSign ? - a : a;
1329 shiftCount = countLeadingZeros32( absA ) + 32;
1330 zSig = absA;
1331 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1332
1333}
1334
158142c2
FB
1335/*----------------------------------------------------------------------------
1336| Returns the result of converting the 32-bit two's complement integer `a' to
1337| the quadruple-precision floating-point format. The conversion is performed
1338| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1339*----------------------------------------------------------------------------*/
1340
e5a41ffa 1341float128 int32_to_float128(int32_t a, float_status *status)
158142c2
FB
1342{
1343 flag zSign;
3a87d009 1344 uint32_t absA;
8f506c70 1345 int8_t shiftCount;
bb98fe42 1346 uint64_t zSig0;
158142c2
FB
1347
1348 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1349 zSign = ( a < 0 );
1350 absA = zSign ? - a : a;
1351 shiftCount = countLeadingZeros32( absA ) + 17;
1352 zSig0 = absA;
1353 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1354
1355}
1356
158142c2
FB
1357/*----------------------------------------------------------------------------
1358| Returns the result of converting the 64-bit two's complement integer `a'
1359| to the single-precision floating-point format. The conversion is performed
1360| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1361*----------------------------------------------------------------------------*/
1362
e5a41ffa 1363float32 int64_to_float32(int64_t a, float_status *status)
158142c2
FB
1364{
1365 flag zSign;
182f42fd 1366 uint64_t absA;
8f506c70 1367 int8_t shiftCount;
158142c2 1368
f090c9d4 1369 if ( a == 0 ) return float32_zero;
158142c2
FB
1370 zSign = ( a < 0 );
1371 absA = zSign ? - a : a;
1372 shiftCount = countLeadingZeros64( absA ) - 40;
1373 if ( 0 <= shiftCount ) {
1374 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1375 }
1376 else {
1377 shiftCount += 7;
1378 if ( shiftCount < 0 ) {
1379 shift64RightJamming( absA, - shiftCount, &absA );
1380 }
1381 else {
1382 absA <<= shiftCount;
1383 }
ff32e16e 1384 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
158142c2
FB
1385 }
1386
1387}
1388
1389/*----------------------------------------------------------------------------
1390| Returns the result of converting the 64-bit two's complement integer `a'
1391| to the double-precision floating-point format. The conversion is performed
1392| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1393*----------------------------------------------------------------------------*/
1394
e5a41ffa 1395float64 int64_to_float64(int64_t a, float_status *status)
158142c2
FB
1396{
1397 flag zSign;
1398
f090c9d4 1399 if ( a == 0 ) return float64_zero;
bb98fe42 1400 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1401 return packFloat64( 1, 0x43E, 0 );
1402 }
1403 zSign = ( a < 0 );
ff32e16e 1404 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
158142c2
FB
1405}
1406
158142c2
FB
1407/*----------------------------------------------------------------------------
1408| Returns the result of converting the 64-bit two's complement integer `a'
1409| to the extended double-precision floating-point format. The conversion
1410| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1411| Arithmetic.
1412*----------------------------------------------------------------------------*/
1413
e5a41ffa 1414floatx80 int64_to_floatx80(int64_t a, float_status *status)
158142c2
FB
1415{
1416 flag zSign;
182f42fd 1417 uint64_t absA;
8f506c70 1418 int8_t shiftCount;
158142c2
FB
1419
1420 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1421 zSign = ( a < 0 );
1422 absA = zSign ? - a : a;
1423 shiftCount = countLeadingZeros64( absA );
1424 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1425
1426}
1427
158142c2
FB
1428/*----------------------------------------------------------------------------
1429| Returns the result of converting the 64-bit two's complement integer `a' to
1430| the quadruple-precision floating-point format. The conversion is performed
1431| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1432*----------------------------------------------------------------------------*/
1433
e5a41ffa 1434float128 int64_to_float128(int64_t a, float_status *status)
158142c2
FB
1435{
1436 flag zSign;
182f42fd 1437 uint64_t absA;
8f506c70 1438 int8_t shiftCount;
f4014512 1439 int32_t zExp;
bb98fe42 1440 uint64_t zSig0, zSig1;
158142c2
FB
1441
1442 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1443 zSign = ( a < 0 );
1444 absA = zSign ? - a : a;
1445 shiftCount = countLeadingZeros64( absA ) + 49;
1446 zExp = 0x406E - shiftCount;
1447 if ( 64 <= shiftCount ) {
1448 zSig1 = 0;
1449 zSig0 = absA;
1450 shiftCount -= 64;
1451 }
1452 else {
1453 zSig1 = absA;
1454 zSig0 = 0;
1455 }
1456 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1457 return packFloat128( zSign, zExp, zSig0, zSig1 );
1458
1459}
1460
6bb8e0f1
PM
1461/*----------------------------------------------------------------------------
1462| Returns the result of converting the 64-bit unsigned integer `a'
1463| to the single-precision floating-point format. The conversion is performed
1464| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1465*----------------------------------------------------------------------------*/
1466
e5a41ffa 1467float32 uint64_to_float32(uint64_t a, float_status *status)
6bb8e0f1
PM
1468{
1469 int shiftcount;
1470
1471 if (a == 0) {
1472 return float32_zero;
1473 }
1474
1475 /* Determine (left) shift needed to put first set bit into bit posn 23
1476 * (since packFloat32() expects the binary point between bits 23 and 22);
1477 * this is the fast case for smallish numbers.
1478 */
1479 shiftcount = countLeadingZeros64(a) - 40;
1480 if (shiftcount >= 0) {
1481 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1482 }
1483 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1484 * expects the binary point between bits 30 and 29, hence the + 7.
1485 */
1486 shiftcount += 7;
1487 if (shiftcount < 0) {
1488 shift64RightJamming(a, -shiftcount, &a);
1489 } else {
1490 a <<= shiftcount;
1491 }
1492
ff32e16e 1493 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
6bb8e0f1
PM
1494}
1495
1496/*----------------------------------------------------------------------------
1497| Returns the result of converting the 64-bit unsigned integer `a'
1498| to the double-precision floating-point format. The conversion is performed
1499| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1500*----------------------------------------------------------------------------*/
1501
e5a41ffa 1502float64 uint64_to_float64(uint64_t a, float_status *status)
6bb8e0f1
PM
1503{
1504 int exp = 0x43C;
1505 int shiftcount;
1506
1507 if (a == 0) {
1508 return float64_zero;
1509 }
1510
1511 shiftcount = countLeadingZeros64(a) - 1;
1512 if (shiftcount < 0) {
1513 shift64RightJamming(a, -shiftcount, &a);
1514 } else {
1515 a <<= shiftcount;
1516 }
ff32e16e 1517 return roundAndPackFloat64(0, exp - shiftcount, a, status);
6bb8e0f1
PM
1518}
1519
1520/*----------------------------------------------------------------------------
1521| Returns the result of converting the 64-bit unsigned integer `a'
1522| to the quadruple-precision floating-point format. The conversion is performed
1523| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1524*----------------------------------------------------------------------------*/
1525
e5a41ffa 1526float128 uint64_to_float128(uint64_t a, float_status *status)
1e397ead
RH
1527{
1528 if (a == 0) {
1529 return float128_zero;
1530 }
ff32e16e 1531 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1e397ead
RH
1532}
1533
158142c2
FB
1534/*----------------------------------------------------------------------------
1535| Returns the result of converting the single-precision floating-point value
1536| `a' to the 32-bit two's complement integer format. The conversion is
1537| performed according to the IEC/IEEE Standard for Binary Floating-Point
1538| Arithmetic---which means in particular that the conversion is rounded
1539| according to the current rounding mode. If `a' is a NaN, the largest
1540| positive integer is returned. Otherwise, if the conversion overflows, the
1541| largest integer with the same sign as `a' is returned.
1542*----------------------------------------------------------------------------*/
1543
f4014512 1544int32_t float32_to_int32(float32 a, float_status *status)
158142c2
FB
1545{
1546 flag aSign;
94a49d86 1547 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1548 uint32_t aSig;
1549 uint64_t aSig64;
158142c2 1550
ff32e16e 1551 a = float32_squash_input_denormal(a, status);
158142c2
FB
1552 aSig = extractFloat32Frac( a );
1553 aExp = extractFloat32Exp( a );
1554 aSign = extractFloat32Sign( a );
1555 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1556 if ( aExp ) aSig |= 0x00800000;
1557 shiftCount = 0xAF - aExp;
1558 aSig64 = aSig;
1559 aSig64 <<= 32;
1560 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
ff32e16e 1561 return roundAndPackInt32(aSign, aSig64, status);
158142c2
FB
1562
1563}
1564
1565/*----------------------------------------------------------------------------
1566| Returns the result of converting the single-precision floating-point value
1567| `a' to the 32-bit two's complement integer format. The conversion is
1568| performed according to the IEC/IEEE Standard for Binary Floating-Point
1569| Arithmetic, except that the conversion is always rounded toward zero.
1570| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1571| the conversion overflows, the largest integer with the same sign as `a' is
1572| returned.
1573*----------------------------------------------------------------------------*/
1574
f4014512 1575int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
158142c2
FB
1576{
1577 flag aSign;
94a49d86 1578 int_fast16_t aExp, shiftCount;
bb98fe42 1579 uint32_t aSig;
b3a6a2e0 1580 int32_t z;
ff32e16e 1581 a = float32_squash_input_denormal(a, status);
158142c2
FB
1582
1583 aSig = extractFloat32Frac( a );
1584 aExp = extractFloat32Exp( a );
1585 aSign = extractFloat32Sign( a );
1586 shiftCount = aExp - 0x9E;
1587 if ( 0 <= shiftCount ) {
f090c9d4 1588 if ( float32_val(a) != 0xCF000000 ) {
ff32e16e 1589 float_raise(float_flag_invalid, status);
158142c2
FB
1590 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1591 }
bb98fe42 1592 return (int32_t) 0x80000000;
158142c2
FB
1593 }
1594 else if ( aExp <= 0x7E ) {
a2f2d288
PM
1595 if (aExp | aSig) {
1596 status->float_exception_flags |= float_flag_inexact;
1597 }
158142c2
FB
1598 return 0;
1599 }
1600 aSig = ( aSig | 0x00800000 )<<8;
1601 z = aSig>>( - shiftCount );
bb98fe42 1602 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
a2f2d288 1603 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
1604 }
1605 if ( aSign ) z = - z;
1606 return z;
1607
1608}
1609
cbcef455
PM
1610/*----------------------------------------------------------------------------
1611| Returns the result of converting the single-precision floating-point value
1612| `a' to the 16-bit two's complement integer format. The conversion is
1613| performed according to the IEC/IEEE Standard for Binary Floating-Point
1614| Arithmetic, except that the conversion is always rounded toward zero.
1615| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1616| the conversion overflows, the largest integer with the same sign as `a' is
1617| returned.
1618*----------------------------------------------------------------------------*/
1619
e5a41ffa 1620int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
1621{
1622 flag aSign;
94a49d86 1623 int_fast16_t aExp, shiftCount;
bb98fe42 1624 uint32_t aSig;
f4014512 1625 int32_t z;
cbcef455
PM
1626
1627 aSig = extractFloat32Frac( a );
1628 aExp = extractFloat32Exp( a );
1629 aSign = extractFloat32Sign( a );
1630 shiftCount = aExp - 0x8E;
1631 if ( 0 <= shiftCount ) {
1632 if ( float32_val(a) != 0xC7000000 ) {
ff32e16e 1633 float_raise(float_flag_invalid, status);
cbcef455
PM
1634 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1635 return 0x7FFF;
1636 }
1637 }
bb98fe42 1638 return (int32_t) 0xffff8000;
cbcef455
PM
1639 }
1640 else if ( aExp <= 0x7E ) {
1641 if ( aExp | aSig ) {
a2f2d288 1642 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
1643 }
1644 return 0;
1645 }
1646 shiftCount -= 0x10;
1647 aSig = ( aSig | 0x00800000 )<<8;
1648 z = aSig>>( - shiftCount );
bb98fe42 1649 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
a2f2d288 1650 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
1651 }
1652 if ( aSign ) {
1653 z = - z;
1654 }
1655 return z;
1656
1657}
1658
158142c2
FB
1659/*----------------------------------------------------------------------------
1660| Returns the result of converting the single-precision floating-point value
1661| `a' to the 64-bit two's complement integer format. The conversion is
1662| performed according to the IEC/IEEE Standard for Binary Floating-Point
1663| Arithmetic---which means in particular that the conversion is rounded
1664| according to the current rounding mode. If `a' is a NaN, the largest
1665| positive integer is returned. Otherwise, if the conversion overflows, the
1666| largest integer with the same sign as `a' is returned.
1667*----------------------------------------------------------------------------*/
1668
f42c2224 1669int64_t float32_to_int64(float32 a, float_status *status)
158142c2
FB
1670{
1671 flag aSign;
94a49d86 1672 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1673 uint32_t aSig;
1674 uint64_t aSig64, aSigExtra;
ff32e16e 1675 a = float32_squash_input_denormal(a, status);
158142c2
FB
1676
1677 aSig = extractFloat32Frac( a );
1678 aExp = extractFloat32Exp( a );
1679 aSign = extractFloat32Sign( a );
1680 shiftCount = 0xBE - aExp;
1681 if ( shiftCount < 0 ) {
ff32e16e 1682 float_raise(float_flag_invalid, status);
158142c2
FB
1683 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1684 return LIT64( 0x7FFFFFFFFFFFFFFF );
1685 }
bb98fe42 1686 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1687 }
1688 if ( aExp ) aSig |= 0x00800000;
1689 aSig64 = aSig;
1690 aSig64 <<= 40;
1691 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
ff32e16e 1692 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
158142c2
FB
1693
1694}
1695
2f18bbf9
TM
1696/*----------------------------------------------------------------------------
1697| Returns the result of converting the single-precision floating-point value
1698| `a' to the 64-bit unsigned integer format. The conversion is
1699| performed according to the IEC/IEEE Standard for Binary Floating-Point
1700| Arithmetic---which means in particular that the conversion is rounded
1701| according to the current rounding mode. If `a' is a NaN, the largest
1702| unsigned integer is returned. Otherwise, if the conversion overflows, the
1703| largest unsigned integer is returned. If the 'a' is negative, the result
1704| is rounded and zero is returned; values that do not round to zero will
1705| raise the inexact exception flag.
1706*----------------------------------------------------------------------------*/
1707
182f42fd 1708uint64_t float32_to_uint64(float32 a, float_status *status)
2f18bbf9
TM
1709{
1710 flag aSign;
1711 int_fast16_t aExp, shiftCount;
1712 uint32_t aSig;
1713 uint64_t aSig64, aSigExtra;
ff32e16e 1714 a = float32_squash_input_denormal(a, status);
2f18bbf9
TM
1715
1716 aSig = extractFloat32Frac(a);
1717 aExp = extractFloat32Exp(a);
1718 aSign = extractFloat32Sign(a);
1719 if ((aSign) && (aExp > 126)) {
ff32e16e 1720 float_raise(float_flag_invalid, status);
2f18bbf9
TM
1721 if (float32_is_any_nan(a)) {
1722 return LIT64(0xFFFFFFFFFFFFFFFF);
1723 } else {
1724 return 0;
1725 }
1726 }
1727 shiftCount = 0xBE - aExp;
1728 if (aExp) {
1729 aSig |= 0x00800000;
1730 }
1731 if (shiftCount < 0) {
ff32e16e 1732 float_raise(float_flag_invalid, status);
2f18bbf9
TM
1733 return LIT64(0xFFFFFFFFFFFFFFFF);
1734 }
1735
1736 aSig64 = aSig;
1737 aSig64 <<= 40;
1738 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
ff32e16e 1739 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
2f18bbf9
TM
1740}
1741
a13d4489
TM
1742/*----------------------------------------------------------------------------
1743| Returns the result of converting the single-precision floating-point value
1744| `a' to the 64-bit unsigned integer format. The conversion is
1745| performed according to the IEC/IEEE Standard for Binary Floating-Point
1746| Arithmetic, except that the conversion is always rounded toward zero. If
1747| `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1748| conversion overflows, the largest unsigned integer is returned. If the
1749| 'a' is negative, the result is rounded and zero is returned; values that do
1750| not round to zero will raise the inexact flag.
1751*----------------------------------------------------------------------------*/
1752
182f42fd 1753uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
a13d4489 1754{
a2f2d288 1755 signed char current_rounding_mode = status->float_rounding_mode;
ff32e16e
PM
1756 set_float_rounding_mode(float_round_to_zero, status);
1757 int64_t v = float32_to_uint64(a, status);
1758 set_float_rounding_mode(current_rounding_mode, status);
a13d4489
TM
1759 return v;
1760}
1761
158142c2
FB
1762/*----------------------------------------------------------------------------
1763| Returns the result of converting the single-precision floating-point value
1764| `a' to the 64-bit two's complement integer format. The conversion is
1765| performed according to the IEC/IEEE Standard for Binary Floating-Point
1766| Arithmetic, except that the conversion is always rounded toward zero. If
1767| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1768| conversion overflows, the largest integer with the same sign as `a' is
1769| returned.
1770*----------------------------------------------------------------------------*/
1771
f42c2224 1772int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
158142c2
FB
1773{
1774 flag aSign;
94a49d86 1775 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1776 uint32_t aSig;
1777 uint64_t aSig64;
f42c2224 1778 int64_t z;
ff32e16e 1779 a = float32_squash_input_denormal(a, status);
158142c2
FB
1780
1781 aSig = extractFloat32Frac( a );
1782 aExp = extractFloat32Exp( a );
1783 aSign = extractFloat32Sign( a );
1784 shiftCount = aExp - 0xBE;
1785 if ( 0 <= shiftCount ) {
f090c9d4 1786 if ( float32_val(a) != 0xDF000000 ) {
ff32e16e 1787 float_raise(float_flag_invalid, status);
158142c2
FB
1788 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1789 return LIT64( 0x7FFFFFFFFFFFFFFF );
1790 }
1791 }
bb98fe42 1792 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1793 }
1794 else if ( aExp <= 0x7E ) {
a2f2d288
PM
1795 if (aExp | aSig) {
1796 status->float_exception_flags |= float_flag_inexact;
1797 }
158142c2
FB
1798 return 0;
1799 }
1800 aSig64 = aSig | 0x00800000;
1801 aSig64 <<= 40;
1802 z = aSig64>>( - shiftCount );
bb98fe42 1803 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
a2f2d288 1804 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
1805 }
1806 if ( aSign ) z = - z;
1807 return z;
1808
1809}
1810
1811/*----------------------------------------------------------------------------
1812| Returns the result of converting the single-precision floating-point value
1813| `a' to the double-precision floating-point format. The conversion is
1814| performed according to the IEC/IEEE Standard for Binary Floating-Point
1815| Arithmetic.
1816*----------------------------------------------------------------------------*/
1817
e5a41ffa 1818float64 float32_to_float64(float32 a, float_status *status)
158142c2
FB
1819{
1820 flag aSign;
94a49d86 1821 int_fast16_t aExp;
bb98fe42 1822 uint32_t aSig;
ff32e16e 1823 a = float32_squash_input_denormal(a, status);
158142c2
FB
1824
1825 aSig = extractFloat32Frac( a );
1826 aExp = extractFloat32Exp( a );
1827 aSign = extractFloat32Sign( a );
1828 if ( aExp == 0xFF ) {
ff32e16e
PM
1829 if (aSig) {
1830 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1831 }
158142c2
FB
1832 return packFloat64( aSign, 0x7FF, 0 );
1833 }
1834 if ( aExp == 0 ) {
1835 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1836 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1837 --aExp;
1838 }
bb98fe42 1839 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1840
1841}
1842
158142c2
FB
1843/*----------------------------------------------------------------------------
1844| Returns the result of converting the single-precision floating-point value
1845| `a' to the extended double-precision floating-point format. The conversion
1846| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1847| Arithmetic.
1848*----------------------------------------------------------------------------*/
1849
e5a41ffa 1850floatx80 float32_to_floatx80(float32 a, float_status *status)
158142c2
FB
1851{
1852 flag aSign;
94a49d86 1853 int_fast16_t aExp;
bb98fe42 1854 uint32_t aSig;
158142c2 1855
ff32e16e 1856 a = float32_squash_input_denormal(a, status);
158142c2
FB
1857 aSig = extractFloat32Frac( a );
1858 aExp = extractFloat32Exp( a );
1859 aSign = extractFloat32Sign( a );
1860 if ( aExp == 0xFF ) {
ff32e16e
PM
1861 if (aSig) {
1862 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1863 }
158142c2
FB
1864 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1865 }
1866 if ( aExp == 0 ) {
1867 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1868 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1869 }
1870 aSig |= 0x00800000;
bb98fe42 1871 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1872
1873}
1874
158142c2
FB
1875/*----------------------------------------------------------------------------
1876| Returns the result of converting the single-precision floating-point value
1877| `a' to the double-precision floating-point format. The conversion is
1878| performed according to the IEC/IEEE Standard for Binary Floating-Point
1879| Arithmetic.
1880*----------------------------------------------------------------------------*/
1881
e5a41ffa 1882float128 float32_to_float128(float32 a, float_status *status)
158142c2
FB
1883{
1884 flag aSign;
94a49d86 1885 int_fast16_t aExp;
bb98fe42 1886 uint32_t aSig;
158142c2 1887
ff32e16e 1888 a = float32_squash_input_denormal(a, status);
158142c2
FB
1889 aSig = extractFloat32Frac( a );
1890 aExp = extractFloat32Exp( a );
1891 aSign = extractFloat32Sign( a );
1892 if ( aExp == 0xFF ) {
ff32e16e
PM
1893 if (aSig) {
1894 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1895 }
158142c2
FB
1896 return packFloat128( aSign, 0x7FFF, 0, 0 );
1897 }
1898 if ( aExp == 0 ) {
1899 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1900 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1901 --aExp;
1902 }
bb98fe42 1903 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1904
1905}
1906
158142c2
FB
1907/*----------------------------------------------------------------------------
1908| Rounds the single-precision floating-point value `a' to an integer, and
1909| returns the result as a single-precision floating-point value. The
1910| operation is performed according to the IEC/IEEE Standard for Binary
1911| Floating-Point Arithmetic.
1912*----------------------------------------------------------------------------*/
1913
e5a41ffa 1914float32 float32_round_to_int(float32 a, float_status *status)
158142c2
FB
1915{
1916 flag aSign;
94a49d86 1917 int_fast16_t aExp;
bb98fe42 1918 uint32_t lastBitMask, roundBitsMask;
bb98fe42 1919 uint32_t z;
ff32e16e 1920 a = float32_squash_input_denormal(a, status);
158142c2
FB
1921
1922 aExp = extractFloat32Exp( a );
1923 if ( 0x96 <= aExp ) {
1924 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
ff32e16e 1925 return propagateFloat32NaN(a, a, status);
158142c2
FB
1926 }
1927 return a;
1928 }
1929 if ( aExp <= 0x7E ) {
bb98fe42 1930 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
a2f2d288 1931 status->float_exception_flags |= float_flag_inexact;
158142c2 1932 aSign = extractFloat32Sign( a );
a2f2d288 1933 switch (status->float_rounding_mode) {
158142c2
FB
1934 case float_round_nearest_even:
1935 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1936 return packFloat32( aSign, 0x7F, 0 );
1937 }
1938 break;
f9288a76
PM
1939 case float_round_ties_away:
1940 if (aExp == 0x7E) {
1941 return packFloat32(aSign, 0x7F, 0);
1942 }
1943 break;
158142c2 1944 case float_round_down:
f090c9d4 1945 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1946 case float_round_up:
f090c9d4 1947 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1948 }
1949 return packFloat32( aSign, 0, 0 );
1950 }
1951 lastBitMask = 1;
1952 lastBitMask <<= 0x96 - aExp;
1953 roundBitsMask = lastBitMask - 1;
f090c9d4 1954 z = float32_val(a);
a2f2d288 1955 switch (status->float_rounding_mode) {
dc355b76 1956 case float_round_nearest_even:
158142c2 1957 z += lastBitMask>>1;
dc355b76
PM
1958 if ((z & roundBitsMask) == 0) {
1959 z &= ~lastBitMask;
1960 }
1961 break;
f9288a76
PM
1962 case float_round_ties_away:
1963 z += lastBitMask >> 1;
1964 break;
dc355b76
PM
1965 case float_round_to_zero:
1966 break;
1967 case float_round_up:
1968 if (!extractFloat32Sign(make_float32(z))) {
1969 z += roundBitsMask;
1970 }
1971 break;
1972 case float_round_down:
1973 if (extractFloat32Sign(make_float32(z))) {
158142c2
FB
1974 z += roundBitsMask;
1975 }
dc355b76
PM
1976 break;
1977 default:
1978 abort();
158142c2
FB
1979 }
1980 z &= ~ roundBitsMask;
a2f2d288
PM
1981 if (z != float32_val(a)) {
1982 status->float_exception_flags |= float_flag_inexact;
1983 }
f090c9d4 1984 return make_float32(z);
158142c2
FB
1985
1986}
1987
1988/*----------------------------------------------------------------------------
1989| Returns the result of adding the absolute values of the single-precision
1990| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1991| before being returned. `zSign' is ignored if the result is a NaN.
1992| The addition is performed according to the IEC/IEEE Standard for Binary
1993| Floating-Point Arithmetic.
1994*----------------------------------------------------------------------------*/
1995
e5a41ffa
PM
1996static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
1997 float_status *status)
158142c2 1998{
94a49d86 1999 int_fast16_t aExp, bExp, zExp;
bb98fe42 2000 uint32_t aSig, bSig, zSig;
94a49d86 2001 int_fast16_t expDiff;
158142c2
FB
2002
2003 aSig = extractFloat32Frac( a );
2004 aExp = extractFloat32Exp( a );
2005 bSig = extractFloat32Frac( b );
2006 bExp = extractFloat32Exp( b );
2007 expDiff = aExp - bExp;
2008 aSig <<= 6;
2009 bSig <<= 6;
2010 if ( 0 < expDiff ) {
2011 if ( aExp == 0xFF ) {
ff32e16e
PM
2012 if (aSig) {
2013 return propagateFloat32NaN(a, b, status);
2014 }
158142c2
FB
2015 return a;
2016 }
2017 if ( bExp == 0 ) {
2018 --expDiff;
2019 }
2020 else {
2021 bSig |= 0x20000000;
2022 }
2023 shift32RightJamming( bSig, expDiff, &bSig );
2024 zExp = aExp;
2025 }
2026 else if ( expDiff < 0 ) {
2027 if ( bExp == 0xFF ) {
ff32e16e
PM
2028 if (bSig) {
2029 return propagateFloat32NaN(a, b, status);
2030 }
158142c2
FB
2031 return packFloat32( zSign, 0xFF, 0 );
2032 }
2033 if ( aExp == 0 ) {
2034 ++expDiff;
2035 }
2036 else {
2037 aSig |= 0x20000000;
2038 }
2039 shift32RightJamming( aSig, - expDiff, &aSig );
2040 zExp = bExp;
2041 }
2042 else {
2043 if ( aExp == 0xFF ) {
ff32e16e
PM
2044 if (aSig | bSig) {
2045 return propagateFloat32NaN(a, b, status);
2046 }
158142c2
FB
2047 return a;
2048 }
fe76d976 2049 if ( aExp == 0 ) {
a2f2d288 2050 if (status->flush_to_zero) {
e6afc87f 2051 if (aSig | bSig) {
ff32e16e 2052 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2053 }
2054 return packFloat32(zSign, 0, 0);
2055 }
fe76d976
PB
2056 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2057 }
158142c2
FB
2058 zSig = 0x40000000 + aSig + bSig;
2059 zExp = aExp;
2060 goto roundAndPack;
2061 }
2062 aSig |= 0x20000000;
2063 zSig = ( aSig + bSig )<<1;
2064 --zExp;
bb98fe42 2065 if ( (int32_t) zSig < 0 ) {
158142c2
FB
2066 zSig = aSig + bSig;
2067 ++zExp;
2068 }
2069 roundAndPack:
ff32e16e 2070 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2071
2072}
2073
2074/*----------------------------------------------------------------------------
2075| Returns the result of subtracting the absolute values of the single-
2076| precision floating-point values `a' and `b'. If `zSign' is 1, the
2077| difference is negated before being returned. `zSign' is ignored if the
2078| result is a NaN. The subtraction is performed according to the IEC/IEEE
2079| Standard for Binary Floating-Point Arithmetic.
2080*----------------------------------------------------------------------------*/
2081
e5a41ffa
PM
2082static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2083 float_status *status)
158142c2 2084{
94a49d86 2085 int_fast16_t aExp, bExp, zExp;
bb98fe42 2086 uint32_t aSig, bSig, zSig;
94a49d86 2087 int_fast16_t expDiff;
158142c2
FB
2088
2089 aSig = extractFloat32Frac( a );
2090 aExp = extractFloat32Exp( a );
2091 bSig = extractFloat32Frac( b );
2092 bExp = extractFloat32Exp( b );
2093 expDiff = aExp - bExp;
2094 aSig <<= 7;
2095 bSig <<= 7;
2096 if ( 0 < expDiff ) goto aExpBigger;
2097 if ( expDiff < 0 ) goto bExpBigger;
2098 if ( aExp == 0xFF ) {
ff32e16e
PM
2099 if (aSig | bSig) {
2100 return propagateFloat32NaN(a, b, status);
2101 }
2102 float_raise(float_flag_invalid, status);
158142c2
FB
2103 return float32_default_nan;
2104 }
2105 if ( aExp == 0 ) {
2106 aExp = 1;
2107 bExp = 1;
2108 }
2109 if ( bSig < aSig ) goto aBigger;
2110 if ( aSig < bSig ) goto bBigger;
a2f2d288 2111 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
2112 bExpBigger:
2113 if ( bExp == 0xFF ) {
ff32e16e
PM
2114 if (bSig) {
2115 return propagateFloat32NaN(a, b, status);
2116 }
158142c2
FB
2117 return packFloat32( zSign ^ 1, 0xFF, 0 );
2118 }
2119 if ( aExp == 0 ) {
2120 ++expDiff;
2121 }
2122 else {
2123 aSig |= 0x40000000;
2124 }
2125 shift32RightJamming( aSig, - expDiff, &aSig );
2126 bSig |= 0x40000000;
2127 bBigger:
2128 zSig = bSig - aSig;
2129 zExp = bExp;
2130 zSign ^= 1;
2131 goto normalizeRoundAndPack;
2132 aExpBigger:
2133 if ( aExp == 0xFF ) {
ff32e16e
PM
2134 if (aSig) {
2135 return propagateFloat32NaN(a, b, status);
2136 }
158142c2
FB
2137 return a;
2138 }
2139 if ( bExp == 0 ) {
2140 --expDiff;
2141 }
2142 else {
2143 bSig |= 0x40000000;
2144 }
2145 shift32RightJamming( bSig, expDiff, &bSig );
2146 aSig |= 0x40000000;
2147 aBigger:
2148 zSig = aSig - bSig;
2149 zExp = aExp;
2150 normalizeRoundAndPack:
2151 --zExp;
ff32e16e 2152 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2153
2154}
2155
2156/*----------------------------------------------------------------------------
2157| Returns the result of adding the single-precision floating-point values `a'
2158| and `b'. The operation is performed according to the IEC/IEEE Standard for
2159| Binary Floating-Point Arithmetic.
2160*----------------------------------------------------------------------------*/
2161
e5a41ffa 2162float32 float32_add(float32 a, float32 b, float_status *status)
158142c2
FB
2163{
2164 flag aSign, bSign;
ff32e16e
PM
2165 a = float32_squash_input_denormal(a, status);
2166 b = float32_squash_input_denormal(b, status);
158142c2
FB
2167
2168 aSign = extractFloat32Sign( a );
2169 bSign = extractFloat32Sign( b );
2170 if ( aSign == bSign ) {
ff32e16e 2171 return addFloat32Sigs(a, b, aSign, status);
158142c2
FB
2172 }
2173 else {
ff32e16e 2174 return subFloat32Sigs(a, b, aSign, status);
158142c2
FB
2175 }
2176
2177}
2178
2179/*----------------------------------------------------------------------------
2180| Returns the result of subtracting the single-precision floating-point values
2181| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2182| for Binary Floating-Point Arithmetic.
2183*----------------------------------------------------------------------------*/
2184
e5a41ffa 2185float32 float32_sub(float32 a, float32 b, float_status *status)
158142c2
FB
2186{
2187 flag aSign, bSign;
ff32e16e
PM
2188 a = float32_squash_input_denormal(a, status);
2189 b = float32_squash_input_denormal(b, status);
158142c2
FB
2190
2191 aSign = extractFloat32Sign( a );
2192 bSign = extractFloat32Sign( b );
2193 if ( aSign == bSign ) {
ff32e16e 2194 return subFloat32Sigs(a, b, aSign, status);
158142c2
FB
2195 }
2196 else {
ff32e16e 2197 return addFloat32Sigs(a, b, aSign, status);
158142c2
FB
2198 }
2199
2200}
2201
2202/*----------------------------------------------------------------------------
2203| Returns the result of multiplying the single-precision floating-point values
2204| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2205| for Binary Floating-Point Arithmetic.
2206*----------------------------------------------------------------------------*/
2207
e5a41ffa 2208float32 float32_mul(float32 a, float32 b, float_status *status)
158142c2
FB
2209{
2210 flag aSign, bSign, zSign;
94a49d86 2211 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
2212 uint32_t aSig, bSig;
2213 uint64_t zSig64;
2214 uint32_t zSig;
158142c2 2215
ff32e16e
PM
2216 a = float32_squash_input_denormal(a, status);
2217 b = float32_squash_input_denormal(b, status);
37d18660 2218
158142c2
FB
2219 aSig = extractFloat32Frac( a );
2220 aExp = extractFloat32Exp( a );
2221 aSign = extractFloat32Sign( a );
2222 bSig = extractFloat32Frac( b );
2223 bExp = extractFloat32Exp( b );
2224 bSign = extractFloat32Sign( b );
2225 zSign = aSign ^ bSign;
2226 if ( aExp == 0xFF ) {
2227 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 2228 return propagateFloat32NaN(a, b, status);
158142c2
FB
2229 }
2230 if ( ( bExp | bSig ) == 0 ) {
ff32e16e 2231 float_raise(float_flag_invalid, status);
158142c2
FB
2232 return float32_default_nan;
2233 }
2234 return packFloat32( zSign, 0xFF, 0 );
2235 }
2236 if ( bExp == 0xFF ) {
ff32e16e
PM
2237 if (bSig) {
2238 return propagateFloat32NaN(a, b, status);
2239 }
158142c2 2240 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 2241 float_raise(float_flag_invalid, status);
158142c2
FB
2242 return float32_default_nan;
2243 }
2244 return packFloat32( zSign, 0xFF, 0 );
2245 }
2246 if ( aExp == 0 ) {
2247 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2248 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2249 }
2250 if ( bExp == 0 ) {
2251 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2252 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2253 }
2254 zExp = aExp + bExp - 0x7F;
2255 aSig = ( aSig | 0x00800000 )<<7;
2256 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 2257 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 2258 zSig = zSig64;
bb98fe42 2259 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
2260 zSig <<= 1;
2261 --zExp;
2262 }
ff32e16e 2263 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2264
2265}
2266
2267/*----------------------------------------------------------------------------
2268| Returns the result of dividing the single-precision floating-point value `a'
2269| by the corresponding value `b'. The operation is performed according to the
2270| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2271*----------------------------------------------------------------------------*/
2272
e5a41ffa 2273float32 float32_div(float32 a, float32 b, float_status *status)
158142c2
FB
2274{
2275 flag aSign, bSign, zSign;
94a49d86 2276 int_fast16_t aExp, bExp, zExp;
bb98fe42 2277 uint32_t aSig, bSig, zSig;
ff32e16e
PM
2278 a = float32_squash_input_denormal(a, status);
2279 b = float32_squash_input_denormal(b, status);
158142c2
FB
2280
2281 aSig = extractFloat32Frac( a );
2282 aExp = extractFloat32Exp( a );
2283 aSign = extractFloat32Sign( a );
2284 bSig = extractFloat32Frac( b );
2285 bExp = extractFloat32Exp( b );
2286 bSign = extractFloat32Sign( b );
2287 zSign = aSign ^ bSign;
2288 if ( aExp == 0xFF ) {
ff32e16e
PM
2289 if (aSig) {
2290 return propagateFloat32NaN(a, b, status);
2291 }
158142c2 2292 if ( bExp == 0xFF ) {
ff32e16e
PM
2293 if (bSig) {
2294 return propagateFloat32NaN(a, b, status);
2295 }
2296 float_raise(float_flag_invalid, status);
158142c2
FB
2297 return float32_default_nan;
2298 }
2299 return packFloat32( zSign, 0xFF, 0 );
2300 }
2301 if ( bExp == 0xFF ) {
ff32e16e
PM
2302 if (bSig) {
2303 return propagateFloat32NaN(a, b, status);
2304 }
158142c2
FB
2305 return packFloat32( zSign, 0, 0 );
2306 }
2307 if ( bExp == 0 ) {
2308 if ( bSig == 0 ) {
2309 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 2310 float_raise(float_flag_invalid, status);
158142c2
FB
2311 return float32_default_nan;
2312 }
ff32e16e 2313 float_raise(float_flag_divbyzero, status);
158142c2
FB
2314 return packFloat32( zSign, 0xFF, 0 );
2315 }
2316 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2317 }
2318 if ( aExp == 0 ) {
2319 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2320 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2321 }
2322 zExp = aExp - bExp + 0x7D;
2323 aSig = ( aSig | 0x00800000 )<<7;
2324 bSig = ( bSig | 0x00800000 )<<8;
2325 if ( bSig <= ( aSig + aSig ) ) {
2326 aSig >>= 1;
2327 ++zExp;
2328 }
bb98fe42 2329 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2330 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2331 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2 2332 }
ff32e16e 2333 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2334
2335}
2336
2337/*----------------------------------------------------------------------------
2338| Returns the remainder of the single-precision floating-point value `a'
2339| with respect to the corresponding value `b'. The operation is performed
2340| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2341*----------------------------------------------------------------------------*/
2342
e5a41ffa 2343float32 float32_rem(float32 a, float32 b, float_status *status)
158142c2 2344{
ed086f3d 2345 flag aSign, zSign;
94a49d86 2346 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
2347 uint32_t aSig, bSig;
2348 uint32_t q;
2349 uint64_t aSig64, bSig64, q64;
2350 uint32_t alternateASig;
2351 int32_t sigMean;
ff32e16e
PM
2352 a = float32_squash_input_denormal(a, status);
2353 b = float32_squash_input_denormal(b, status);
158142c2
FB
2354
2355 aSig = extractFloat32Frac( a );
2356 aExp = extractFloat32Exp( a );
2357 aSign = extractFloat32Sign( a );
2358 bSig = extractFloat32Frac( b );
2359 bExp = extractFloat32Exp( b );
158142c2
FB
2360 if ( aExp == 0xFF ) {
2361 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 2362 return propagateFloat32NaN(a, b, status);
158142c2 2363 }
ff32e16e 2364 float_raise(float_flag_invalid, status);
158142c2
FB
2365 return float32_default_nan;
2366 }
2367 if ( bExp == 0xFF ) {
ff32e16e
PM
2368 if (bSig) {
2369 return propagateFloat32NaN(a, b, status);
2370 }
158142c2
FB
2371 return a;
2372 }
2373 if ( bExp == 0 ) {
2374 if ( bSig == 0 ) {
ff32e16e 2375 float_raise(float_flag_invalid, status);
158142c2
FB
2376 return float32_default_nan;
2377 }
2378 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2379 }
2380 if ( aExp == 0 ) {
2381 if ( aSig == 0 ) return a;
2382 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2383 }
2384 expDiff = aExp - bExp;
2385 aSig |= 0x00800000;
2386 bSig |= 0x00800000;
2387 if ( expDiff < 32 ) {
2388 aSig <<= 8;
2389 bSig <<= 8;
2390 if ( expDiff < 0 ) {
2391 if ( expDiff < -1 ) return a;
2392 aSig >>= 1;
2393 }
2394 q = ( bSig <= aSig );
2395 if ( q ) aSig -= bSig;
2396 if ( 0 < expDiff ) {
bb98fe42 2397 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2398 q >>= 32 - expDiff;
2399 bSig >>= 2;
2400 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2401 }
2402 else {
2403 aSig >>= 2;
2404 bSig >>= 2;
2405 }
2406 }
2407 else {
2408 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2409 aSig64 = ( (uint64_t) aSig )<<40;
2410 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2411 expDiff -= 64;
2412 while ( 0 < expDiff ) {
2413 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2414 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2415 aSig64 = - ( ( bSig * q64 )<<38 );
2416 expDiff -= 62;
2417 }
2418 expDiff += 64;
2419 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2420 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2421 q = q64>>( 64 - expDiff );
2422 bSig <<= 6;
2423 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2424 }
2425 do {
2426 alternateASig = aSig;
2427 ++q;
2428 aSig -= bSig;
bb98fe42 2429 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2430 sigMean = aSig + alternateASig;
2431 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2432 aSig = alternateASig;
2433 }
bb98fe42 2434 zSign = ( (int32_t) aSig < 0 );
158142c2 2435 if ( zSign ) aSig = - aSig;
ff32e16e 2436 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
2437}
2438
369be8f6
PM
2439/*----------------------------------------------------------------------------
2440| Returns the result of multiplying the single-precision floating-point values
2441| `a' and `b' then adding 'c', with no intermediate rounding step after the
2442| multiplication. The operation is performed according to the IEC/IEEE
2443| Standard for Binary Floating-Point Arithmetic 754-2008.
2444| The flags argument allows the caller to select negation of the
2445| addend, the intermediate product, or the final result. (The difference
2446| between this and having the caller do a separate negation is that negating
2447| externally will flip the sign bit on NaNs.)
2448*----------------------------------------------------------------------------*/
2449
e5a41ffa
PM
2450float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2451 float_status *status)
369be8f6
PM
2452{
2453 flag aSign, bSign, cSign, zSign;
94a49d86 2454 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2455 uint32_t aSig, bSig, cSig;
2456 flag pInf, pZero, pSign;
2457 uint64_t pSig64, cSig64, zSig64;
2458 uint32_t pSig;
2459 int shiftcount;
2460 flag signflip, infzero;
2461
ff32e16e
PM
2462 a = float32_squash_input_denormal(a, status);
2463 b = float32_squash_input_denormal(b, status);
2464 c = float32_squash_input_denormal(c, status);
369be8f6
PM
2465 aSig = extractFloat32Frac(a);
2466 aExp = extractFloat32Exp(a);
2467 aSign = extractFloat32Sign(a);
2468 bSig = extractFloat32Frac(b);
2469 bExp = extractFloat32Exp(b);
2470 bSign = extractFloat32Sign(b);
2471 cSig = extractFloat32Frac(c);
2472 cExp = extractFloat32Exp(c);
2473 cSign = extractFloat32Sign(c);
2474
2475 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2476 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2477
2478 /* It is implementation-defined whether the cases of (0,inf,qnan)
2479 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2480 * they return if they do), so we have to hand this information
2481 * off to the target-specific pick-a-NaN routine.
2482 */
2483 if (((aExp == 0xff) && aSig) ||
2484 ((bExp == 0xff) && bSig) ||
2485 ((cExp == 0xff) && cSig)) {
ff32e16e 2486 return propagateFloat32MulAddNaN(a, b, c, infzero, status);
369be8f6
PM
2487 }
2488
2489 if (infzero) {
ff32e16e 2490 float_raise(float_flag_invalid, status);
369be8f6
PM
2491 return float32_default_nan;
2492 }
2493
2494 if (flags & float_muladd_negate_c) {
2495 cSign ^= 1;
2496 }
2497
2498 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2499
2500 /* Work out the sign and type of the product */
2501 pSign = aSign ^ bSign;
2502 if (flags & float_muladd_negate_product) {
2503 pSign ^= 1;
2504 }
2505 pInf = (aExp == 0xff) || (bExp == 0xff);
2506 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2507
2508 if (cExp == 0xff) {
2509 if (pInf && (pSign ^ cSign)) {
2510 /* addition of opposite-signed infinities => InvalidOperation */
ff32e16e 2511 float_raise(float_flag_invalid, status);
369be8f6
PM
2512 return float32_default_nan;
2513 }
2514 /* Otherwise generate an infinity of the same sign */
2515 return packFloat32(cSign ^ signflip, 0xff, 0);
2516 }
2517
2518 if (pInf) {
2519 return packFloat32(pSign ^ signflip, 0xff, 0);
2520 }
2521
2522 if (pZero) {
2523 if (cExp == 0) {
2524 if (cSig == 0) {
2525 /* Adding two exact zeroes */
2526 if (pSign == cSign) {
2527 zSign = pSign;
a2f2d288 2528 } else if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
2529 zSign = 1;
2530 } else {
2531 zSign = 0;
2532 }
2533 return packFloat32(zSign ^ signflip, 0, 0);
2534 }
2535 /* Exact zero plus a denorm */
a2f2d288 2536 if (status->flush_to_zero) {
ff32e16e 2537 float_raise(float_flag_output_denormal, status);
369be8f6
PM
2538 return packFloat32(cSign ^ signflip, 0, 0);
2539 }
2540 }
2541 /* Zero plus something non-zero : just return the something */
67d43538
PM
2542 if (flags & float_muladd_halve_result) {
2543 if (cExp == 0) {
2544 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2545 }
2546 /* Subtract one to halve, and one again because roundAndPackFloat32
2547 * wants one less than the true exponent.
2548 */
2549 cExp -= 2;
2550 cSig = (cSig | 0x00800000) << 7;
ff32e16e 2551 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
67d43538 2552 }
a6e7c184 2553 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2554 }
2555
2556 if (aExp == 0) {
2557 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2558 }
2559 if (bExp == 0) {
2560 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2561 }
2562
2563 /* Calculate the actual result a * b + c */
2564
2565 /* Multiply first; this is easy. */
2566 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2567 * because we want the true exponent, not the "one-less-than"
2568 * flavour that roundAndPackFloat32() takes.
2569 */
2570 pExp = aExp + bExp - 0x7e;
2571 aSig = (aSig | 0x00800000) << 7;
2572 bSig = (bSig | 0x00800000) << 8;
2573 pSig64 = (uint64_t)aSig * bSig;
2574 if ((int64_t)(pSig64 << 1) >= 0) {
2575 pSig64 <<= 1;
2576 pExp--;
2577 }
2578
2579 zSign = pSign ^ signflip;
2580
2581 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2582 * position 62.
2583 */
2584 if (cExp == 0) {
2585 if (!cSig) {
2586 /* Throw out the special case of c being an exact zero now */
2587 shift64RightJamming(pSig64, 32, &pSig64);
2588 pSig = pSig64;
67d43538
PM
2589 if (flags & float_muladd_halve_result) {
2590 pExp--;
2591 }
369be8f6 2592 return roundAndPackFloat32(zSign, pExp - 1,
ff32e16e 2593 pSig, status);
369be8f6
PM
2594 }
2595 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2596 }
2597
2598 cSig64 = (uint64_t)cSig << (62 - 23);
2599 cSig64 |= LIT64(0x4000000000000000);
2600 expDiff = pExp - cExp;
2601
2602 if (pSign == cSign) {
2603 /* Addition */
2604 if (expDiff > 0) {
2605 /* scale c to match p */
2606 shift64RightJamming(cSig64, expDiff, &cSig64);
2607 zExp = pExp;
2608 } else if (expDiff < 0) {
2609 /* scale p to match c */
2610 shift64RightJamming(pSig64, -expDiff, &pSig64);
2611 zExp = cExp;
2612 } else {
2613 /* no scaling needed */
2614 zExp = cExp;
2615 }
2616 /* Add significands and make sure explicit bit ends up in posn 62 */
2617 zSig64 = pSig64 + cSig64;
2618 if ((int64_t)zSig64 < 0) {
2619 shift64RightJamming(zSig64, 1, &zSig64);
2620 } else {
2621 zExp--;
2622 }
2623 } else {
2624 /* Subtraction */
2625 if (expDiff > 0) {
2626 shift64RightJamming(cSig64, expDiff, &cSig64);
2627 zSig64 = pSig64 - cSig64;
2628 zExp = pExp;
2629 } else if (expDiff < 0) {
2630 shift64RightJamming(pSig64, -expDiff, &pSig64);
2631 zSig64 = cSig64 - pSig64;
2632 zExp = cExp;
2633 zSign ^= 1;
2634 } else {
2635 zExp = pExp;
2636 if (cSig64 < pSig64) {
2637 zSig64 = pSig64 - cSig64;
2638 } else if (pSig64 < cSig64) {
2639 zSig64 = cSig64 - pSig64;
2640 zSign ^= 1;
2641 } else {
2642 /* Exact zero */
2643 zSign = signflip;
a2f2d288 2644 if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
2645 zSign ^= 1;
2646 }
2647 return packFloat32(zSign, 0, 0);
2648 }
2649 }
2650 --zExp;
2651 /* Normalize to put the explicit bit back into bit 62. */
2652 shiftcount = countLeadingZeros64(zSig64) - 1;
2653 zSig64 <<= shiftcount;
2654 zExp -= shiftcount;
2655 }
67d43538
PM
2656 if (flags & float_muladd_halve_result) {
2657 zExp--;
2658 }
2659
369be8f6 2660 shift64RightJamming(zSig64, 32, &zSig64);
ff32e16e 2661 return roundAndPackFloat32(zSign, zExp, zSig64, status);
369be8f6
PM
2662}
2663
2664
158142c2
FB
2665/*----------------------------------------------------------------------------
2666| Returns the square root of the single-precision floating-point value `a'.
2667| The operation is performed according to the IEC/IEEE Standard for Binary
2668| Floating-Point Arithmetic.
2669*----------------------------------------------------------------------------*/
2670
e5a41ffa 2671float32 float32_sqrt(float32 a, float_status *status)
158142c2
FB
2672{
2673 flag aSign;
94a49d86 2674 int_fast16_t aExp, zExp;
bb98fe42
AF
2675 uint32_t aSig, zSig;
2676 uint64_t rem, term;
ff32e16e 2677 a = float32_squash_input_denormal(a, status);
158142c2
FB
2678
2679 aSig = extractFloat32Frac( a );
2680 aExp = extractFloat32Exp( a );
2681 aSign = extractFloat32Sign( a );
2682 if ( aExp == 0xFF ) {
ff32e16e
PM
2683 if (aSig) {
2684 return propagateFloat32NaN(a, float32_zero, status);
2685 }
158142c2 2686 if ( ! aSign ) return a;
ff32e16e 2687 float_raise(float_flag_invalid, status);
158142c2
FB
2688 return float32_default_nan;
2689 }
2690 if ( aSign ) {
2691 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 2692 float_raise(float_flag_invalid, status);
158142c2
FB
2693 return float32_default_nan;
2694 }
2695 if ( aExp == 0 ) {
f090c9d4 2696 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2697 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2698 }
2699 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2700 aSig = ( aSig | 0x00800000 )<<8;
2701 zSig = estimateSqrt32( aExp, aSig ) + 2;
2702 if ( ( zSig & 0x7F ) <= 5 ) {
2703 if ( zSig < 2 ) {
2704 zSig = 0x7FFFFFFF;
2705 goto roundAndPack;
2706 }
2707 aSig >>= aExp & 1;
bb98fe42
AF
2708 term = ( (uint64_t) zSig ) * zSig;
2709 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2710 while ( (int64_t) rem < 0 ) {
158142c2 2711 --zSig;
bb98fe42 2712 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2713 }
2714 zSig |= ( rem != 0 );
2715 }
2716 shift32RightJamming( zSig, 1, &zSig );
2717 roundAndPack:
ff32e16e 2718 return roundAndPackFloat32(0, zExp, zSig, status);
158142c2
FB
2719
2720}
2721
8229c991
AJ
2722/*----------------------------------------------------------------------------
2723| Returns the binary exponential of the single-precision floating-point value
2724| `a'. The operation is performed according to the IEC/IEEE Standard for
2725| Binary Floating-Point Arithmetic.
2726|
2727| Uses the following identities:
2728|
2729| 1. -------------------------------------------------------------------------
2730| x x*ln(2)
2731| 2 = e
2732|
2733| 2. -------------------------------------------------------------------------
2734| 2 3 4 5 n
2735| x x x x x x x
2736| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2737| 1! 2! 3! 4! 5! n!
2738*----------------------------------------------------------------------------*/
2739
2740static const float64 float32_exp2_coefficients[15] =
2741{
d5138cf4
PM
2742 const_float64( 0x3ff0000000000000ll ), /* 1 */
2743 const_float64( 0x3fe0000000000000ll ), /* 2 */
2744 const_float64( 0x3fc5555555555555ll ), /* 3 */
2745 const_float64( 0x3fa5555555555555ll ), /* 4 */
2746 const_float64( 0x3f81111111111111ll ), /* 5 */
2747 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2748 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2749 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2750 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2751 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2752 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2753 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2754 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2755 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2756 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2757};
2758
e5a41ffa 2759float32 float32_exp2(float32 a, float_status *status)
8229c991
AJ
2760{
2761 flag aSign;
94a49d86 2762 int_fast16_t aExp;
bb98fe42 2763 uint32_t aSig;
8229c991
AJ
2764 float64 r, x, xn;
2765 int i;
ff32e16e 2766 a = float32_squash_input_denormal(a, status);
8229c991
AJ
2767
2768 aSig = extractFloat32Frac( a );
2769 aExp = extractFloat32Exp( a );
2770 aSign = extractFloat32Sign( a );
2771
2772 if ( aExp == 0xFF) {
ff32e16e
PM
2773 if (aSig) {
2774 return propagateFloat32NaN(a, float32_zero, status);
2775 }
8229c991
AJ
2776 return (aSign) ? float32_zero : a;
2777 }
2778 if (aExp == 0) {
2779 if (aSig == 0) return float32_one;
2780 }
2781
ff32e16e 2782 float_raise(float_flag_inexact, status);
8229c991
AJ
2783
2784 /* ******************************* */
2785 /* using float64 for approximation */
2786 /* ******************************* */
ff32e16e
PM
2787 x = float32_to_float64(a, status);
2788 x = float64_mul(x, float64_ln2, status);
8229c991
AJ
2789
2790 xn = x;
2791 r = float64_one;
2792 for (i = 0 ; i < 15 ; i++) {
2793 float64 f;
2794
ff32e16e
PM
2795 f = float64_mul(xn, float32_exp2_coefficients[i], status);
2796 r = float64_add(r, f, status);
8229c991 2797
ff32e16e 2798 xn = float64_mul(xn, x, status);
8229c991
AJ
2799 }
2800
2801 return float64_to_float32(r, status);
2802}
2803
374dfc33
AJ
2804/*----------------------------------------------------------------------------
2805| Returns the binary log of the single-precision floating-point value `a'.
2806| The operation is performed according to the IEC/IEEE Standard for Binary
2807| Floating-Point Arithmetic.
2808*----------------------------------------------------------------------------*/
e5a41ffa 2809float32 float32_log2(float32 a, float_status *status)
374dfc33
AJ
2810{
2811 flag aSign, zSign;
94a49d86 2812 int_fast16_t aExp;
bb98fe42 2813 uint32_t aSig, zSig, i;
374dfc33 2814
ff32e16e 2815 a = float32_squash_input_denormal(a, status);
374dfc33
AJ
2816 aSig = extractFloat32Frac( a );
2817 aExp = extractFloat32Exp( a );
2818 aSign = extractFloat32Sign( a );
2819
2820 if ( aExp == 0 ) {
2821 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2822 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2823 }
2824 if ( aSign ) {
ff32e16e 2825 float_raise(float_flag_invalid, status);
374dfc33
AJ
2826 return float32_default_nan;
2827 }
2828 if ( aExp == 0xFF ) {
ff32e16e
PM
2829 if (aSig) {
2830 return propagateFloat32NaN(a, float32_zero, status);
2831 }
374dfc33
AJ
2832 return a;
2833 }
2834
2835 aExp -= 0x7F;
2836 aSig |= 0x00800000;
2837 zSign = aExp < 0;
2838 zSig = aExp << 23;
2839
2840 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2841 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2842 if ( aSig & 0x01000000 ) {
2843 aSig >>= 1;
2844 zSig |= i;
2845 }
2846 }
2847
2848 if ( zSign )
2849 zSig = -zSig;
2850
ff32e16e 2851 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
374dfc33
AJ
2852}
2853
158142c2
FB
2854/*----------------------------------------------------------------------------
2855| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2856| the corresponding value `b', and 0 otherwise. The invalid exception is
2857| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2858| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2859*----------------------------------------------------------------------------*/
2860
e5a41ffa 2861int float32_eq(float32 a, float32 b, float_status *status)
158142c2 2862{
b689362d 2863 uint32_t av, bv;
ff32e16e
PM
2864 a = float32_squash_input_denormal(a, status);
2865 b = float32_squash_input_denormal(b, status);
158142c2
FB
2866
2867 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2868 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2869 ) {
ff32e16e 2870 float_raise(float_flag_invalid, status);
158142c2
FB
2871 return 0;
2872 }
b689362d
AJ
2873 av = float32_val(a);
2874 bv = float32_val(b);
2875 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2876}
2877
2878/*----------------------------------------------------------------------------
2879| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2880| or equal to the corresponding value `b', and 0 otherwise. The invalid
2881| exception is raised if either operand is a NaN. The comparison is performed
2882| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2883*----------------------------------------------------------------------------*/
2884
e5a41ffa 2885int float32_le(float32 a, float32 b, float_status *status)
158142c2
FB
2886{
2887 flag aSign, bSign;
bb98fe42 2888 uint32_t av, bv;
ff32e16e
PM
2889 a = float32_squash_input_denormal(a, status);
2890 b = float32_squash_input_denormal(b, status);
158142c2
FB
2891
2892 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2893 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2894 ) {
ff32e16e 2895 float_raise(float_flag_invalid, status);
158142c2
FB
2896 return 0;
2897 }
2898 aSign = extractFloat32Sign( a );
2899 bSign = extractFloat32Sign( b );
f090c9d4
PB
2900 av = float32_val(a);
2901 bv = float32_val(b);
bb98fe42 2902 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2903 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2904
2905}
2906
2907/*----------------------------------------------------------------------------
2908| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2909| the corresponding value `b', and 0 otherwise. The invalid exception is
2910| raised if either operand is a NaN. The comparison is performed according
2911| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2912*----------------------------------------------------------------------------*/
2913
e5a41ffa 2914int float32_lt(float32 a, float32 b, float_status *status)
158142c2
FB
2915{
2916 flag aSign, bSign;
bb98fe42 2917 uint32_t av, bv;
ff32e16e
PM
2918 a = float32_squash_input_denormal(a, status);
2919 b = float32_squash_input_denormal(b, status);
158142c2
FB
2920
2921 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2922 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2923 ) {
ff32e16e 2924 float_raise(float_flag_invalid, status);
158142c2
FB
2925 return 0;
2926 }
2927 aSign = extractFloat32Sign( a );
2928 bSign = extractFloat32Sign( b );
f090c9d4
PB
2929 av = float32_val(a);
2930 bv = float32_val(b);
bb98fe42 2931 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2932 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2933
2934}
2935
67b7861d
AJ
2936/*----------------------------------------------------------------------------
2937| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2938| be compared, and 0 otherwise. The invalid exception is raised if either
2939| operand is a NaN. The comparison is performed according to the IEC/IEEE
2940| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2941*----------------------------------------------------------------------------*/
2942
e5a41ffa 2943int float32_unordered(float32 a, float32 b, float_status *status)
67b7861d 2944{
ff32e16e
PM
2945 a = float32_squash_input_denormal(a, status);
2946 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
2947
2948 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2949 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2950 ) {
ff32e16e 2951 float_raise(float_flag_invalid, status);
67b7861d
AJ
2952 return 1;
2953 }
2954 return 0;
2955}
b689362d 2956
158142c2
FB
2957/*----------------------------------------------------------------------------
2958| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2959| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2960| exception. The comparison is performed according to the IEC/IEEE Standard
2961| for Binary Floating-Point Arithmetic.
158142c2
FB
2962*----------------------------------------------------------------------------*/
2963
e5a41ffa 2964int float32_eq_quiet(float32 a, float32 b, float_status *status)
158142c2 2965{
ff32e16e
PM
2966 a = float32_squash_input_denormal(a, status);
2967 b = float32_squash_input_denormal(b, status);
158142c2
FB
2968
2969 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2970 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2971 ) {
b689362d 2972 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 2973 float_raise(float_flag_invalid, status);
b689362d 2974 }
158142c2
FB
2975 return 0;
2976 }
b689362d
AJ
2977 return ( float32_val(a) == float32_val(b) ) ||
2978 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2979}
2980
2981/*----------------------------------------------------------------------------
2982| Returns 1 if the single-precision floating-point value `a' is less than or
2983| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2984| cause an exception. Otherwise, the comparison is performed according to the
2985| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2986*----------------------------------------------------------------------------*/
2987
e5a41ffa 2988int float32_le_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
2989{
2990 flag aSign, bSign;
bb98fe42 2991 uint32_t av, bv;
ff32e16e
PM
2992 a = float32_squash_input_denormal(a, status);
2993 b = float32_squash_input_denormal(b, status);
158142c2
FB
2994
2995 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2996 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2997 ) {
2998 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 2999 float_raise(float_flag_invalid, status);
158142c2
FB
3000 }
3001 return 0;
3002 }
3003 aSign = extractFloat32Sign( a );
3004 bSign = extractFloat32Sign( b );
f090c9d4
PB
3005 av = float32_val(a);
3006 bv = float32_val(b);
bb98fe42 3007 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3008 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3009
3010}
3011
3012/*----------------------------------------------------------------------------
3013| Returns 1 if the single-precision floating-point value `a' is less than
3014| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3015| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3016| Standard for Binary Floating-Point Arithmetic.
3017*----------------------------------------------------------------------------*/
3018
e5a41ffa 3019int float32_lt_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
3020{
3021 flag aSign, bSign;
bb98fe42 3022 uint32_t av, bv;
ff32e16e
PM
3023 a = float32_squash_input_denormal(a, status);
3024 b = float32_squash_input_denormal(b, status);
158142c2
FB
3025
3026 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3027 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3028 ) {
3029 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 3030 float_raise(float_flag_invalid, status);
158142c2
FB
3031 }
3032 return 0;
3033 }
3034 aSign = extractFloat32Sign( a );
3035 bSign = extractFloat32Sign( b );
f090c9d4
PB
3036 av = float32_val(a);
3037 bv = float32_val(b);
bb98fe42 3038 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3039 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3040
3041}
3042
67b7861d
AJ
3043/*----------------------------------------------------------------------------
3044| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3045| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3046| comparison is performed according to the IEC/IEEE Standard for Binary
3047| Floating-Point Arithmetic.
3048*----------------------------------------------------------------------------*/
3049
e5a41ffa 3050int float32_unordered_quiet(float32 a, float32 b, float_status *status)
67b7861d 3051{
ff32e16e
PM
3052 a = float32_squash_input_denormal(a, status);
3053 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
3054
3055 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3056 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3057 ) {
3058 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 3059 float_raise(float_flag_invalid, status);
67b7861d
AJ
3060 }
3061 return 1;
3062 }
3063 return 0;
3064}
3065
158142c2
FB
3066/*----------------------------------------------------------------------------
3067| Returns the result of converting the double-precision floating-point value
3068| `a' to the 32-bit two's complement integer format. The conversion is
3069| performed according to the IEC/IEEE Standard for Binary Floating-Point
3070| Arithmetic---which means in particular that the conversion is rounded
3071| according to the current rounding mode. If `a' is a NaN, the largest
3072| positive integer is returned. Otherwise, if the conversion overflows, the
3073| largest integer with the same sign as `a' is returned.
3074*----------------------------------------------------------------------------*/
3075
f4014512 3076int32_t float64_to_int32(float64 a, float_status *status)
158142c2
FB
3077{
3078 flag aSign;
94a49d86 3079 int_fast16_t aExp, shiftCount;
bb98fe42 3080 uint64_t aSig;
ff32e16e 3081 a = float64_squash_input_denormal(a, status);
158142c2
FB
3082
3083 aSig = extractFloat64Frac( a );
3084 aExp = extractFloat64Exp( a );
3085 aSign = extractFloat64Sign( a );
3086 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3087 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3088 shiftCount = 0x42C - aExp;
3089 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 3090 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
3091
3092}
3093
3094/*----------------------------------------------------------------------------
3095| Returns the result of converting the double-precision floating-point value
3096| `a' to the 32-bit two's complement integer format. The conversion is
3097| performed according to the IEC/IEEE Standard for Binary Floating-Point
3098| Arithmetic, except that the conversion is always rounded toward zero.
3099| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3100| the conversion overflows, the largest integer with the same sign as `a' is
3101| returned.
3102*----------------------------------------------------------------------------*/
3103
f4014512 3104int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
158142c2
FB
3105{
3106 flag aSign;
94a49d86 3107 int_fast16_t aExp, shiftCount;
bb98fe42 3108 uint64_t aSig, savedASig;
b3a6a2e0 3109 int32_t z;
ff32e16e 3110 a = float64_squash_input_denormal(a, status);
158142c2
FB
3111
3112 aSig = extractFloat64Frac( a );
3113 aExp = extractFloat64Exp( a );
3114 aSign = extractFloat64Sign( a );
3115 if ( 0x41E < aExp ) {
3116 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3117 goto invalid;
3118 }
3119 else if ( aExp < 0x3FF ) {
a2f2d288
PM
3120 if (aExp || aSig) {
3121 status->float_exception_flags |= float_flag_inexact;
3122 }
158142c2
FB
3123 return 0;
3124 }
3125 aSig |= LIT64( 0x0010000000000000 );
3126 shiftCount = 0x433 - aExp;
3127 savedASig = aSig;
3128 aSig >>= shiftCount;
3129 z = aSig;
3130 if ( aSign ) z = - z;
3131 if ( ( z < 0 ) ^ aSign ) {
3132 invalid:
ff32e16e 3133 float_raise(float_flag_invalid, status);
bb98fe42 3134 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
3135 }
3136 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 3137 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
3138 }
3139 return z;
3140
3141}
3142
cbcef455
PM
3143/*----------------------------------------------------------------------------
3144| Returns the result of converting the double-precision floating-point value
3145| `a' to the 16-bit two's complement integer format. The conversion is
3146| performed according to the IEC/IEEE Standard for Binary Floating-Point
3147| Arithmetic, except that the conversion is always rounded toward zero.
3148| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3149| the conversion overflows, the largest integer with the same sign as `a' is
3150| returned.
3151*----------------------------------------------------------------------------*/
3152
e5a41ffa 3153int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
3154{
3155 flag aSign;
94a49d86 3156 int_fast16_t aExp, shiftCount;
bb98fe42 3157 uint64_t aSig, savedASig;
f4014512 3158 int32_t z;
cbcef455
PM
3159
3160 aSig = extractFloat64Frac( a );
3161 aExp = extractFloat64Exp( a );
3162 aSign = extractFloat64Sign( a );
3163 if ( 0x40E < aExp ) {
3164 if ( ( aExp == 0x7FF ) && aSig ) {
3165 aSign = 0;
3166 }
3167 goto invalid;
3168 }
3169 else if ( aExp < 0x3FF ) {
3170 if ( aExp || aSig ) {
a2f2d288 3171 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
3172 }
3173 return 0;
3174 }
3175 aSig |= LIT64( 0x0010000000000000 );
3176 shiftCount = 0x433 - aExp;
3177 savedASig = aSig;
3178 aSig >>= shiftCount;
3179 z = aSig;
3180 if ( aSign ) {
3181 z = - z;
3182 }
3183 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3184 invalid:
ff32e16e 3185 float_raise(float_flag_invalid, status);
bb98fe42 3186 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
3187 }
3188 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 3189 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
3190 }
3191 return z;
3192}
3193
158142c2
FB
3194/*----------------------------------------------------------------------------
3195| Returns the result of converting the double-precision floating-point value
3196| `a' to the 64-bit two's complement integer format. The conversion is
3197| performed according to the IEC/IEEE Standard for Binary Floating-Point
3198| Arithmetic---which means in particular that the conversion is rounded
3199| according to the current rounding mode. If `a' is a NaN, the largest
3200| positive integer is returned. Otherwise, if the conversion overflows, the
3201| largest integer with the same sign as `a' is returned.
3202*----------------------------------------------------------------------------*/
3203
f42c2224 3204int64_t float64_to_int64(float64 a, float_status *status)
158142c2
FB
3205{
3206 flag aSign;
94a49d86 3207 int_fast16_t aExp, shiftCount;
bb98fe42 3208 uint64_t aSig, aSigExtra;
ff32e16e 3209 a = float64_squash_input_denormal(a, status);
158142c2
FB
3210
3211 aSig = extractFloat64Frac( a );
3212 aExp = extractFloat64Exp( a );
3213 aSign = extractFloat64Sign( a );
3214 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3215 shiftCount = 0x433 - aExp;
3216 if ( shiftCount <= 0 ) {
3217 if ( 0x43E < aExp ) {
ff32e16e 3218 float_raise(float_flag_invalid, status);
158142c2
FB
3219 if ( ! aSign
3220 || ( ( aExp == 0x7FF )
3221 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3222 ) {
3223 return LIT64( 0x7FFFFFFFFFFFFFFF );
3224 }
bb98fe42 3225 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3226 }
3227 aSigExtra = 0;
3228 aSig <<= - shiftCount;
3229 }
3230 else {
3231 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3232 }
ff32e16e 3233 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
3234
3235}
3236
3237/*----------------------------------------------------------------------------
3238| Returns the result of converting the double-precision floating-point value
3239| `a' to the 64-bit two's complement integer format. The conversion is
3240| performed according to the IEC/IEEE Standard for Binary Floating-Point
3241| Arithmetic, except that the conversion is always rounded toward zero.
3242| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3243| the conversion overflows, the largest integer with the same sign as `a' is
3244| returned.
3245*----------------------------------------------------------------------------*/
3246
f42c2224 3247int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
158142c2
FB
3248{
3249 flag aSign;
94a49d86 3250 int_fast16_t aExp, shiftCount;
bb98fe42 3251 uint64_t aSig;
f42c2224 3252 int64_t z;
ff32e16e 3253 a = float64_squash_input_denormal(a, status);
158142c2
FB
3254
3255 aSig = extractFloat64Frac( a );
3256 aExp = extractFloat64Exp( a );
3257 aSign = extractFloat64Sign( a );
3258 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3259 shiftCount = aExp - 0x433;
3260 if ( 0 <= shiftCount ) {
3261 if ( 0x43E <= aExp ) {
f090c9d4 3262 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
ff32e16e 3263 float_raise(float_flag_invalid, status);
158142c2
FB
3264 if ( ! aSign
3265 || ( ( aExp == 0x7FF )
3266 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3267 ) {
3268 return LIT64( 0x7FFFFFFFFFFFFFFF );
3269 }
3270 }
bb98fe42 3271 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3272 }
3273 z = aSig<<shiftCount;
3274 }
3275 else {
3276 if ( aExp < 0x3FE ) {
a2f2d288
PM
3277 if (aExp | aSig) {
3278 status->float_exception_flags |= float_flag_inexact;
3279 }
158142c2
FB
3280 return 0;
3281 }
3282 z = aSig>>( - shiftCount );
bb98fe42 3283 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 3284 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
3285 }
3286 }
3287 if ( aSign ) z = - z;
3288 return z;
3289
3290}
3291
3292/*----------------------------------------------------------------------------
3293| Returns the result of converting the double-precision floating-point value
3294| `a' to the single-precision floating-point format. The conversion is
3295| performed according to the IEC/IEEE Standard for Binary Floating-Point
3296| Arithmetic.
3297*----------------------------------------------------------------------------*/
3298
e5a41ffa 3299float32 float64_to_float32(float64 a, float_status *status)
158142c2
FB
3300{
3301 flag aSign;
94a49d86 3302 int_fast16_t aExp;
bb98fe42
AF
3303 uint64_t aSig;
3304 uint32_t zSig;
ff32e16e 3305 a = float64_squash_input_denormal(a, status);
158142c2
FB
3306
3307 aSig = extractFloat64Frac( a );
3308 aExp = extractFloat64Exp( a );
3309 aSign = extractFloat64Sign( a );
3310 if ( aExp == 0x7FF ) {
ff32e16e
PM
3311 if (aSig) {
3312 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3313 }
158142c2
FB
3314 return packFloat32( aSign, 0xFF, 0 );
3315 }
3316 shift64RightJamming( aSig, 22, &aSig );
3317 zSig = aSig;
3318 if ( aExp || zSig ) {
3319 zSig |= 0x40000000;
3320 aExp -= 0x381;
3321 }
ff32e16e 3322 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
3323
3324}
3325
60011498
PB
3326
3327/*----------------------------------------------------------------------------
3328| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3329| half-precision floating-point value, returning the result. After being
3330| shifted into the proper positions, the three fields are simply added
3331| together to form the result. This means that any integer portion of `zSig'
3332| will be added into the exponent. Since a properly normalized significand
3333| will have an integer portion equal to 1, the `zExp' input should be 1 less
3334| than the desired result exponent whenever `zSig' is a complete, normalized
3335| significand.
3336*----------------------------------------------------------------------------*/
94a49d86 3337static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
60011498 3338{
bb4d4bb3 3339 return make_float16(
bb98fe42 3340 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3341}
3342
c4a1c5e7
PM
3343/*----------------------------------------------------------------------------
3344| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3345| and significand `zSig', and returns the proper half-precision floating-
3346| point value corresponding to the abstract input. Ordinarily, the abstract
3347| value is simply rounded and packed into the half-precision format, with
3348| the inexact exception raised if the abstract input cannot be represented
3349| exactly. However, if the abstract value is too large, the overflow and
3350| inexact exceptions are raised and an infinity or maximal finite value is
3351| returned. If the abstract value is too small, the input value is rounded to
3352| a subnormal number, and the underflow and inexact exceptions are raised if
3353| the abstract input cannot be represented exactly as a subnormal half-
3354| precision floating-point number.
3355| The `ieee' flag indicates whether to use IEEE standard half precision, or
3356| ARM-style "alternative representation", which omits the NaN and Inf
3357| encodings in order to raise the maximum representable exponent by one.
3358| The input significand `zSig' has its binary point between bits 22
3359| and 23, which is 13 bits to the left of the usual location. This shifted
3360| significand must be normalized or smaller. If `zSig' is not normalized,
3361| `zExp' must be 0; in that case, the result returned is a subnormal number,
3362| and it must not require rounding. In the usual case that `zSig' is
3363| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3364| Note the slightly odd position of the binary point in zSig compared with the
3365| other roundAndPackFloat functions. This should probably be fixed if we
3366| need to implement more float16 routines than just conversion.
3367| The handling of underflow and overflow follows the IEC/IEEE Standard for
3368| Binary Floating-Point Arithmetic.
3369*----------------------------------------------------------------------------*/
3370
7ceac86f 3371static float16 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
e5a41ffa
PM
3372 uint32_t zSig, flag ieee,
3373 float_status *status)
c4a1c5e7
PM
3374{
3375 int maxexp = ieee ? 29 : 30;
3376 uint32_t mask;
3377 uint32_t increment;
c4a1c5e7
PM
3378 bool rounding_bumps_exp;
3379 bool is_tiny = false;
3380
3381 /* Calculate the mask of bits of the mantissa which are not
3382 * representable in half-precision and will be lost.
3383 */
3384 if (zExp < 1) {
3385 /* Will be denormal in halfprec */
3386 mask = 0x00ffffff;
3387 if (zExp >= -11) {
3388 mask >>= 11 + zExp;
3389 }
3390 } else {
3391 /* Normal number in halfprec */
3392 mask = 0x00001fff;
3393 }
3394
a2f2d288 3395 switch (status->float_rounding_mode) {
c4a1c5e7
PM
3396 case float_round_nearest_even:
3397 increment = (mask + 1) >> 1;
3398 if ((zSig & mask) == increment) {
3399 increment = zSig & (increment << 1);
3400 }
3401 break;
f9288a76
PM
3402 case float_round_ties_away:
3403 increment = (mask + 1) >> 1;
3404 break;
c4a1c5e7
PM
3405 case float_round_up:
3406 increment = zSign ? 0 : mask;
3407 break;
3408 case float_round_down:
3409 increment = zSign ? mask : 0;
3410 break;
3411 default: /* round_to_zero */
3412 increment = 0;
3413 break;
3414 }
3415
3416 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3417
3418 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3419 if (ieee) {
ff32e16e 3420 float_raise(float_flag_overflow | float_flag_inexact, status);
c4a1c5e7
PM
3421 return packFloat16(zSign, 0x1f, 0);
3422 } else {
ff32e16e 3423 float_raise(float_flag_invalid, status);
c4a1c5e7
PM
3424 return packFloat16(zSign, 0x1f, 0x3ff);
3425 }
3426 }
3427
3428 if (zExp < 0) {
3429 /* Note that flush-to-zero does not affect half-precision results */
3430 is_tiny =
a2f2d288 3431 (status->float_detect_tininess == float_tininess_before_rounding)
c4a1c5e7
PM
3432 || (zExp < -1)
3433 || (!rounding_bumps_exp);
3434 }
3435 if (zSig & mask) {
ff32e16e 3436 float_raise(float_flag_inexact, status);
c4a1c5e7 3437 if (is_tiny) {
ff32e16e 3438 float_raise(float_flag_underflow, status);
c4a1c5e7
PM
3439 }
3440 }
3441
3442 zSig += increment;
3443 if (rounding_bumps_exp) {
3444 zSig >>= 1;
3445 zExp++;
3446 }
3447
3448 if (zExp < -10) {
3449 return packFloat16(zSign, 0, 0);
3450 }
3451 if (zExp < 0) {
3452 zSig >>= -zExp;
3453 zExp = 0;
3454 }
3455 return packFloat16(zSign, zExp, zSig >> 13);
3456}
3457
/*
 * Shifts `aSig' left by (countLeadingZeros32(aSig) - 21) bits, stores the
 * shifted value through `zSigPtr' and stores 1 minus that shift count
 * through `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3465
60011498
PB
3466/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3467 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3 3468
e5a41ffa 3469float32 float16_to_float32(float16 a, flag ieee, float_status *status)
60011498
PB
3470{
3471 flag aSign;
94a49d86 3472 int_fast16_t aExp;
bb98fe42 3473 uint32_t aSig;
60011498 3474
bb4d4bb3
PM
3475 aSign = extractFloat16Sign(a);
3476 aExp = extractFloat16Exp(a);
3477 aSig = extractFloat16Frac(a);
60011498
PB
3478
3479 if (aExp == 0x1f && ieee) {
3480 if (aSig) {
ff32e16e 3481 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
60011498 3482 }
4be8eeac 3483 return packFloat32(aSign, 0xff, 0);
60011498
PB
3484 }
3485 if (aExp == 0) {
60011498
PB
3486 if (aSig == 0) {
3487 return packFloat32(aSign, 0, 0);
3488 }
3489
c4a1c5e7
PM
3490 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3491 aExp--;
60011498
PB
3492 }
3493 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3494}
3495
e5a41ffa 3496float16 float32_to_float16(float32 a, flag ieee, float_status *status)
60011498
PB
3497{
3498 flag aSign;
94a49d86 3499 int_fast16_t aExp;
bb98fe42 3500 uint32_t aSig;
38970efa 3501
ff32e16e 3502 a = float32_squash_input_denormal(a, status);
60011498
PB
3503
3504 aSig = extractFloat32Frac( a );
3505 aExp = extractFloat32Exp( a );
3506 aSign = extractFloat32Sign( a );
3507 if ( aExp == 0xFF ) {
3508 if (aSig) {
600e30d2 3509 /* Input is a NaN */
600e30d2 3510 if (!ieee) {
ff32e16e 3511 float_raise(float_flag_invalid, status);
600e30d2
PM
3512 return packFloat16(aSign, 0, 0);
3513 }
38970efa 3514 return commonNaNToFloat16(
ff32e16e 3515 float32ToCommonNaN(a, status), status);
60011498 3516 }
600e30d2
PM
3517 /* Infinity */
3518 if (!ieee) {
ff32e16e 3519 float_raise(float_flag_invalid, status);
600e30d2
PM
3520 return packFloat16(aSign, 0x1f, 0x3ff);
3521 }
3522 return packFloat16(aSign, 0x1f, 0);
60011498 3523 }
600e30d2 3524 if (aExp == 0 && aSig == 0) {
60011498
PB
3525 return packFloat16(aSign, 0, 0);
3526 }
38970efa
PM
3527 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3528 * even if the input is denormal; however this is harmless because
3529 * the largest possible single-precision denormal is still smaller
3530 * than the smallest representable half-precision denormal, and so we
3531 * will end up ignoring aSig and returning via the "always return zero"
3532 * codepath.
3533 */
60011498 3534 aSig |= 0x00800000;
c4a1c5e7 3535 aExp -= 0x71;
60011498 3536
ff32e16e 3537 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
60011498
PB
3538}
3539
e5a41ffa 3540float64 float16_to_float64(float16 a, flag ieee, float_status *status)
14c9a07e
PM
3541{
3542 flag aSign;
3543 int_fast16_t aExp;
3544 uint32_t aSig;
3545
3546 aSign = extractFloat16Sign(a);
3547 aExp = extractFloat16Exp(a);
3548 aSig = extractFloat16Frac(a);
3549
3550 if (aExp == 0x1f && ieee) {
3551 if (aSig) {
3552 return commonNaNToFloat64(
ff32e16e 3553 float16ToCommonNaN(a, status), status);
14c9a07e
PM
3554 }
3555 return packFloat64(aSign, 0x7ff, 0);
3556 }
3557 if (aExp == 0) {
3558 if (aSig == 0) {
3559 return packFloat64(aSign, 0, 0);
3560 }
3561
3562 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3563 aExp--;
3564 }
3565 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3566}
3567
e5a41ffa 3568float16 float64_to_float16(float64 a, flag ieee, float_status *status)
14c9a07e
PM
3569{
3570 flag aSign;
3571 int_fast16_t aExp;
3572 uint64_t aSig;
3573 uint32_t zSig;
3574
ff32e16e 3575 a = float64_squash_input_denormal(a, status);
14c9a07e
PM
3576
3577 aSig = extractFloat64Frac(a);
3578 aExp = extractFloat64Exp(a);
3579 aSign = extractFloat64Sign(a);
3580 if (aExp == 0x7FF) {
3581 if (aSig) {
3582 /* Input is a NaN */
3583 if (!ieee) {
ff32e16e 3584 float_raise(float_flag_invalid, status);
14c9a07e
PM
3585 return packFloat16(aSign, 0, 0);
3586 }
3587 return commonNaNToFloat16(
ff32e16e 3588 float64ToCommonNaN(a, status), status);
14c9a07e
PM
3589 }
3590 /* Infinity */
3591 if (!ieee) {
ff32e16e 3592 float_raise(float_flag_invalid, status);
14c9a07e
PM
3593 return packFloat16(aSign, 0x1f, 0x3ff);
3594 }
3595 return packFloat16(aSign, 0x1f, 0);
3596 }
3597 shift64RightJamming(aSig, 29, &aSig);
3598 zSig = aSig;
3599 if (aExp == 0 && zSig == 0) {
3600 return packFloat16(aSign, 0, 0);
3601 }
3602 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3603 * even if the input is denormal; however this is harmless because
3604 * the largest possible single-precision denormal is still smaller
3605 * than the smallest representable half-precision denormal, and so we
3606 * will end up ignoring aSig and returning via the "always return zero"
3607 * codepath.
3608 */
3609 zSig |= 0x00800000;
3610 aExp -= 0x3F1;
3611
ff32e16e 3612 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
14c9a07e
PM
3613}
3614
158142c2
FB
3615/*----------------------------------------------------------------------------
3616| Returns the result of converting the double-precision floating-point value
3617| `a' to the extended double-precision floating-point format. The conversion
3618| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3619| Arithmetic.
3620*----------------------------------------------------------------------------*/
3621
e5a41ffa 3622floatx80 float64_to_floatx80(float64 a, float_status *status)
158142c2
FB
3623{
3624 flag aSign;
94a49d86 3625 int_fast16_t aExp;
bb98fe42 3626 uint64_t aSig;
158142c2 3627
ff32e16e 3628 a = float64_squash_input_denormal(a, status);
158142c2
FB
3629 aSig = extractFloat64Frac( a );
3630 aExp = extractFloat64Exp( a );
3631 aSign = extractFloat64Sign( a );
3632 if ( aExp == 0x7FF ) {
ff32e16e
PM
3633 if (aSig) {
3634 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3635 }
158142c2
FB
3636 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3637 }
3638 if ( aExp == 0 ) {
3639 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3640 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3641 }
3642 return
3643 packFloatx80(
3644 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3645
3646}
3647
158142c2
FB
3648/*----------------------------------------------------------------------------
3649| Returns the result of converting the double-precision floating-point value
3650| `a' to the quadruple-precision floating-point format. The conversion is
3651| performed according to the IEC/IEEE Standard for Binary Floating-Point
3652| Arithmetic.
3653*----------------------------------------------------------------------------*/
3654
e5a41ffa 3655float128 float64_to_float128(float64 a, float_status *status)
158142c2
FB
3656{
3657 flag aSign;
94a49d86 3658 int_fast16_t aExp;
bb98fe42 3659 uint64_t aSig, zSig0, zSig1;
158142c2 3660
ff32e16e 3661 a = float64_squash_input_denormal(a, status);
158142c2
FB
3662 aSig = extractFloat64Frac( a );
3663 aExp = extractFloat64Exp( a );
3664 aSign = extractFloat64Sign( a );
3665 if ( aExp == 0x7FF ) {
ff32e16e
PM
3666 if (aSig) {
3667 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3668 }
158142c2
FB
3669 return packFloat128( aSign, 0x7FFF, 0, 0 );
3670 }
3671 if ( aExp == 0 ) {
3672 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3673 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3674 --aExp;
3675 }
3676 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3677 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3678
3679}
3680
158142c2
FB
3681/*----------------------------------------------------------------------------
3682| Rounds the double-precision floating-point value `a' to an integer, and
3683| returns the result as a double-precision floating-point value. The
3684| operation is performed according to the IEC/IEEE Standard for Binary
3685| Floating-Point Arithmetic.
3686*----------------------------------------------------------------------------*/
3687
e5a41ffa 3688float64 float64_round_to_int(float64 a, float_status *status)
158142c2
FB
3689{
3690 flag aSign;
94a49d86 3691 int_fast16_t aExp;
bb98fe42 3692 uint64_t lastBitMask, roundBitsMask;
bb98fe42 3693 uint64_t z;
ff32e16e 3694 a = float64_squash_input_denormal(a, status);
158142c2
FB
3695
3696 aExp = extractFloat64Exp( a );
3697 if ( 0x433 <= aExp ) {
3698 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
ff32e16e 3699 return propagateFloat64NaN(a, a, status);
158142c2
FB
3700 }
3701 return a;
3702 }
3703 if ( aExp < 0x3FF ) {
bb98fe42 3704 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
a2f2d288 3705 status->float_exception_flags |= float_flag_inexact;
158142c2 3706 aSign = extractFloat64Sign( a );
a2f2d288 3707 switch (status->float_rounding_mode) {
158142c2
FB
3708 case float_round_nearest_even:
3709 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3710 return packFloat64( aSign, 0x3FF, 0 );
3711 }
3712 break;
f9288a76
PM
3713 case float_round_ties_away:
3714 if (aExp == 0x3FE) {
3715 return packFloat64(aSign, 0x3ff, 0);
3716 }
3717 break;
158142c2 3718 case float_round_down:
f090c9d4 3719 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3720 case float_round_up:
f090c9d4
PB
3721 return make_float64(
3722 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3723 }
3724 return packFloat64( aSign, 0, 0 );
3725 }
3726 lastBitMask = 1;
3727 lastBitMask <<= 0x433 - aExp;
3728 roundBitsMask = lastBitMask - 1;
f090c9d4 3729 z = float64_val(a);
a2f2d288 3730 switch (status->float_rounding_mode) {
dc355b76
PM
3731 case float_round_nearest_even:
3732 z += lastBitMask >> 1;
3733 if ((z & roundBitsMask) == 0) {
3734 z &= ~lastBitMask;
3735 }
3736 break;
f9288a76
PM
3737 case float_round_ties_away:
3738 z += lastBitMask >> 1;
3739 break;
dc355b76
PM
3740 case float_round_to_zero:
3741 break;
3742 case float_round_up:
3743 if (!extractFloat64Sign(make_float64(z))) {
3744 z += roundBitsMask;
3745 }
3746 break;
3747 case float_round_down:
3748 if (extractFloat64Sign(make_float64(z))) {
158142c2
FB
3749 z += roundBitsMask;
3750 }
dc355b76
PM
3751 break;
3752 default:
3753 abort();
158142c2
FB
3754 }
3755 z &= ~ roundBitsMask;
a2f2d288
PM
3756 if (z != float64_val(a)) {
3757 status->float_exception_flags |= float_flag_inexact;
3758 }
f090c9d4 3759 return make_float64(z);
158142c2
FB
3760
3761}
3762
e5a41ffa 3763float64 float64_trunc_to_int(float64 a, float_status *status)
e6e5906b
PB
3764{
3765 int oldmode;
3766 float64 res;
a2f2d288
PM
3767 oldmode = status->float_rounding_mode;
3768 status->float_rounding_mode = float_round_to_zero;
ff32e16e 3769 res = float64_round_to_int(a, status);
a2f2d288 3770 status->float_rounding_mode = oldmode;
e6e5906b
PB
3771 return res;
3772}
3773
158142c2
FB
3774/*----------------------------------------------------------------------------
3775| Returns the result of adding the absolute values of the double-precision
3776| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3777| before being returned. `zSign' is ignored if the result is a NaN.
3778| The addition is performed according to the IEC/IEEE Standard for Binary
3779| Floating-Point Arithmetic.
3780*----------------------------------------------------------------------------*/
3781
e5a41ffa
PM
3782static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3783 float_status *status)
158142c2 3784{
94a49d86 3785 int_fast16_t aExp, bExp, zExp;
bb98fe42 3786 uint64_t aSig, bSig, zSig;
94a49d86 3787 int_fast16_t expDiff;
158142c2
FB
3788
3789 aSig = extractFloat64Frac( a );
3790 aExp = extractFloat64Exp( a );
3791 bSig = extractFloat64Frac( b );
3792 bExp = extractFloat64Exp( b );
3793 expDiff = aExp - bExp;
3794 aSig <<= 9;
3795 bSig <<= 9;
3796 if ( 0 < expDiff ) {
3797 if ( aExp == 0x7FF ) {
ff32e16e
PM
3798 if (aSig) {
3799 return propagateFloat64NaN(a, b, status);
3800 }
158142c2
FB
3801 return a;
3802 }
3803 if ( bExp == 0 ) {
3804 --expDiff;
3805 }
3806 else {
3807 bSig |= LIT64( 0x2000000000000000 );
3808 }
3809 shift64RightJamming( bSig, expDiff, &bSig );
3810 zExp = aExp;
3811 }
3812 else if ( expDiff < 0 ) {
3813 if ( bExp == 0x7FF ) {
ff32e16e
PM
3814 if (bSig) {
3815 return propagateFloat64NaN(a, b, status);
3816 }
158142c2
FB
3817 return packFloat64( zSign, 0x7FF, 0 );
3818 }
3819 if ( aExp == 0 ) {
3820 ++expDiff;
3821 }
3822 else {
3823 aSig |= LIT64( 0x2000000000000000 );
3824 }
3825 shift64RightJamming( aSig, - expDiff, &aSig );
3826 zExp = bExp;
3827 }
3828 else {
3829 if ( aExp == 0x7FF ) {
ff32e16e
PM
3830 if (aSig | bSig) {
3831 return propagateFloat64NaN(a, b, status);
3832 }
158142c2
FB
3833 return a;
3834 }
fe76d976 3835 if ( aExp == 0 ) {
a2f2d288 3836 if (status->flush_to_zero) {
e6afc87f 3837 if (aSig | bSig) {
ff32e16e 3838 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
3839 }
3840 return packFloat64(zSign, 0, 0);
3841 }
fe76d976
PB
3842 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3843 }
158142c2
FB
3844 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3845 zExp = aExp;
3846 goto roundAndPack;
3847 }
3848 aSig |= LIT64( 0x2000000000000000 );
3849 zSig = ( aSig + bSig )<<1;
3850 --zExp;
bb98fe42 3851 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3852 zSig = aSig + bSig;
3853 ++zExp;
3854 }
3855 roundAndPack:
ff32e16e 3856 return roundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
3857
3858}
3859
3860/*----------------------------------------------------------------------------
3861| Returns the result of subtracting the absolute values of the double-
3862| precision floating-point values `a' and `b'. If `zSign' is 1, the
3863| difference is negated before being returned. `zSign' is ignored if the
3864| result is a NaN. The subtraction is performed according to the IEC/IEEE
3865| Standard for Binary Floating-Point Arithmetic.
3866*----------------------------------------------------------------------------*/
3867
e5a41ffa
PM
3868static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3869 float_status *status)
158142c2 3870{
94a49d86 3871 int_fast16_t aExp, bExp, zExp;
bb98fe42 3872 uint64_t aSig, bSig, zSig;
94a49d86 3873 int_fast16_t expDiff;
158142c2
FB
3874
3875 aSig = extractFloat64Frac( a );
3876 aExp = extractFloat64Exp( a );
3877 bSig = extractFloat64Frac( b );
3878 bExp = extractFloat64Exp( b );
3879 expDiff = aExp - bExp;
3880 aSig <<= 10;
3881 bSig <<= 10;
3882 if ( 0 < expDiff ) goto aExpBigger;
3883 if ( expDiff < 0 ) goto bExpBigger;
3884 if ( aExp == 0x7FF ) {
ff32e16e
PM
3885 if (aSig | bSig) {
3886 return propagateFloat64NaN(a, b, status);
3887 }
3888 float_raise(float_flag_invalid, status);
158142c2
FB
3889 return float64_default_nan;
3890 }
3891 if ( aExp == 0 ) {
3892 aExp = 1;
3893 bExp = 1;
3894 }
3895 if ( bSig < aSig ) goto aBigger;
3896 if ( aSig < bSig ) goto bBigger;
a2f2d288 3897 return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
3898 bExpBigger:
3899 if ( bExp == 0x7FF ) {
ff32e16e
PM
3900 if (bSig) {
3901 return propagateFloat64NaN(a, b, status);
3902 }
158142c2
FB
3903 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3904 }
3905 if ( aExp == 0 ) {
3906 ++expDiff;
3907 }
3908 else {
3909 aSig |= LIT64( 0x4000000000000000 );
3910 }
3911 shift64RightJamming( aSig, - expDiff, &aSig );
3912 bSig |= LIT64( 0x4000000000000000 );
3913 bBigger:
3914 zSig = bSig - aSig;
3915 zExp = bExp;
3916 zSign ^= 1;
3917 goto normalizeRoundAndPack;
3918 aExpBigger:
3919 if ( aExp == 0x7FF ) {
ff32e16e
PM
3920 if (aSig) {
3921 return propagateFloat64NaN(a, b, status);
3922 }
158142c2
FB
3923 return a;
3924 }
3925 if ( bExp == 0 ) {
3926 --expDiff;
3927 }
3928 else {
3929 bSig |= LIT64( 0x4000000000000000 );
3930 }
3931 shift64RightJamming( bSig, expDiff, &bSig );
3932 aSig |= LIT64( 0x4000000000000000 );
3933 aBigger:
3934 zSig = aSig - bSig;
3935 zExp = aExp;
3936 normalizeRoundAndPack:
3937 --zExp;
ff32e16e 3938 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
3939
3940}
3941
3942/*----------------------------------------------------------------------------
3943| Returns the result of adding the double-precision floating-point values `a'
3944| and `b'. The operation is performed according to the IEC/IEEE Standard for
3945| Binary Floating-Point Arithmetic.
3946*----------------------------------------------------------------------------*/
3947
e5a41ffa 3948float64 float64_add(float64 a, float64 b, float_status *status)
158142c2
FB
3949{
3950 flag aSign, bSign;
ff32e16e
PM
3951 a = float64_squash_input_denormal(a, status);
3952 b = float64_squash_input_denormal(b, status);
158142c2
FB
3953
3954 aSign = extractFloat64Sign( a );
3955 bSign = extractFloat64Sign( b );
3956 if ( aSign == bSign ) {
ff32e16e 3957 return addFloat64Sigs(a, b, aSign, status);
158142c2
FB
3958 }
3959 else {
ff32e16e 3960 return subFloat64Sigs(a, b, aSign, status);
158142c2
FB
3961 }
3962
3963}
3964
3965/*----------------------------------------------------------------------------
3966| Returns the result of subtracting the double-precision floating-point values
3967| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3968| for Binary Floating-Point Arithmetic.
3969*----------------------------------------------------------------------------*/
3970
e5a41ffa 3971float64 float64_sub(float64 a, float64 b, float_status *status)
158142c2
FB
3972{
3973 flag aSign, bSign;
ff32e16e
PM
3974 a = float64_squash_input_denormal(a, status);
3975 b = float64_squash_input_denormal(b, status);
158142c2
FB
3976
3977 aSign = extractFloat64Sign( a );
3978 bSign = extractFloat64Sign( b );
3979 if ( aSign == bSign ) {
ff32e16e 3980 return subFloat64Sigs(a, b, aSign, status);
158142c2
FB
3981 }
3982 else {
ff32e16e 3983 return addFloat64Sigs(a, b, aSign, status);
158142c2
FB
3984 }
3985
3986}
3987
3988/*----------------------------------------------------------------------------
3989| Returns the result of multiplying the double-precision floating-point values
3990| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3991| for Binary Floating-Point Arithmetic.
3992*----------------------------------------------------------------------------*/
3993
e5a41ffa 3994float64 float64_mul(float64 a, float64 b, float_status *status)
158142c2
FB
3995{
3996 flag aSign, bSign, zSign;
94a49d86 3997 int_fast16_t aExp, bExp, zExp;
bb98fe42 3998 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3999
ff32e16e
PM
4000 a = float64_squash_input_denormal(a, status);
4001 b = float64_squash_input_denormal(b, status);
37d18660 4002
158142c2
FB
4003 aSig = extractFloat64Frac( a );
4004 aExp = extractFloat64Exp( a );
4005 aSign = extractFloat64Sign( a );
4006 bSig = extractFloat64Frac( b );
4007 bExp = extractFloat64Exp( b );
4008 bSign = extractFloat64Sign( b );
4009 zSign = aSign ^ bSign;
4010 if ( aExp == 0x7FF ) {
4011 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 4012 return propagateFloat64NaN(a, b, status);
158142c2
FB
4013 }
4014 if ( ( bExp | bSig ) == 0 ) {
ff32e16e 4015 float_raise(float_flag_invalid, status);
158142c2
FB
4016 return float64_default_nan;
4017 }
4018 return packFloat64( zSign, 0x7FF, 0 );
4019 }
4020 if ( bExp == 0x7FF ) {
ff32e16e
PM
4021 if (bSig) {
4022 return propagateFloat64NaN(a, b, status);
4023 }
158142c2 4024 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 4025 float_raise(float_flag_invalid, status);
158142c2
FB
4026 return float64_default_nan;
4027 }
4028 return packFloat64( zSign, 0x7FF, 0 );
4029 }
4030 if ( aExp == 0 ) {
4031 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4032 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4033 }
4034 if ( bExp == 0 ) {
4035 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4036 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4037 }
4038 zExp = aExp + bExp - 0x3FF;
4039 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4040 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4041 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4042 zSig0 |= ( zSig1 != 0 );
bb98fe42 4043 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
4044 zSig0 <<= 1;
4045 --zExp;
4046 }
ff32e16e 4047 return roundAndPackFloat64(zSign, zExp, zSig0, status);
158142c2
FB
4048
4049}
4050
4051/*----------------------------------------------------------------------------
4052| Returns the result of dividing the double-precision floating-point value `a'
4053| by the corresponding value `b'. The operation is performed according to
4054| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4055*----------------------------------------------------------------------------*/
4056
e5a41ffa 4057float64 float64_div(float64 a, float64 b, float_status *status)
158142c2
FB
4058{
4059 flag aSign, bSign, zSign;
94a49d86 4060 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
4061 uint64_t aSig, bSig, zSig;
4062 uint64_t rem0, rem1;
4063 uint64_t term0, term1;
ff32e16e
PM
4064 a = float64_squash_input_denormal(a, status);
4065 b = float64_squash_input_denormal(b, status);
158142c2
FB
4066
4067 aSig = extractFloat64Frac( a );
4068 aExp = extractFloat64Exp( a );
4069 aSign = extractFloat64Sign( a );
4070 bSig = extractFloat64Frac( b );
4071 bExp = extractFloat64Exp( b );
4072 bSign = extractFloat64Sign( b );
4073 zSign = aSign ^ bSign;
4074 if ( aExp == 0x7FF ) {
ff32e16e
PM
4075 if (aSig) {
4076 return propagateFloat64NaN(a, b, status);
4077 }
158142c2 4078 if ( bExp == 0x7FF ) {
ff32e16e
PM
4079 if (bSig) {
4080 return propagateFloat64NaN(a, b, status);
4081 }
4082 float_raise(float_flag_invalid, status);
158142c2
FB
4083 return float64_default_nan;
4084 }
4085 return packFloat64( zSign, 0x7FF, 0 );
4086 }
4087 if ( bExp == 0x7FF ) {
ff32e16e
PM
4088 if (bSig) {
4089 return propagateFloat64NaN(a, b, status);
4090 }
158142c2
FB
4091 return packFloat64( zSign, 0, 0 );
4092 }
4093 if ( bExp == 0 ) {
4094 if ( bSig == 0 ) {
4095 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 4096 float_raise(float_flag_invalid, status);
158142c2
FB
4097 return float64_default_nan;
4098 }
ff32e16e 4099 float_raise(float_flag_divbyzero, status);
158142c2
FB
4100 return packFloat64( zSign, 0x7FF, 0 );
4101 }
4102 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4103 }
4104 if ( aExp == 0 ) {
4105 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4106 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4107 }
4108 zExp = aExp - bExp + 0x3FD;
4109 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4110 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4111 if ( bSig <= ( aSig + aSig ) ) {
4112 aSig >>= 1;
4113 ++zExp;
4114 }
4115 zSig = estimateDiv128To64( aSig, 0, bSig );
4116 if ( ( zSig & 0x1FF ) <= 2 ) {
4117 mul64To128( bSig, zSig, &term0, &term1 );
4118 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4119 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4120 --zSig;
4121 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4122 }
4123 zSig |= ( rem1 != 0 );
4124 }
ff32e16e 4125 return roundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
4126
4127}
4128
4129/*----------------------------------------------------------------------------
4130| Returns the remainder of the double-precision floating-point value `a'
4131| with respect to the corresponding value `b'. The operation is performed
4132| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4133*----------------------------------------------------------------------------*/
4134
e5a41ffa 4135float64 float64_rem(float64 a, float64 b, float_status *status)
158142c2 4136{
ed086f3d 4137 flag aSign, zSign;
94a49d86 4138 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
4139 uint64_t aSig, bSig;
4140 uint64_t q, alternateASig;
4141 int64_t sigMean;
158142c2 4142
ff32e16e
PM
4143 a = float64_squash_input_denormal(a, status);
4144 b = float64_squash_input_denormal(b, status);
158142c2
FB
4145 aSig = extractFloat64Frac( a );
4146 aExp = extractFloat64Exp( a );
4147 aSign = extractFloat64Sign( a );
4148 bSig = extractFloat64Frac( b );
4149 bExp = extractFloat64Exp( b );
158142c2
FB
4150 if ( aExp == 0x7FF ) {
4151 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 4152 return propagateFloat64NaN(a, b, status);
158142c2 4153 }
ff32e16e 4154 float_raise(float_flag_invalid, status);
158142c2
FB
4155 return float64_default_nan;
4156 }
4157 if ( bExp == 0x7FF ) {
ff32e16e
PM
4158 if (bSig) {
4159 return propagateFloat64NaN(a, b, status);
4160 }
158142c2
FB
4161 return a;
4162 }
4163 if ( bExp == 0 ) {
4164 if ( bSig == 0 ) {
ff32e16e 4165 float_raise(float_flag_invalid, status);
158142c2
FB
4166 return float64_default_nan;
4167 }
4168 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4169 }
4170 if ( aExp == 0 ) {
4171 if ( aSig == 0 ) return a;
4172 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4173 }
4174 expDiff = aExp - bExp;
4175 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4176 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4177 if ( expDiff < 0 ) {
4178 if ( expDiff < -1 ) return a;
4179 aSig >>= 1;
4180 }
4181 q = ( bSig <= aSig );
4182 if ( q ) aSig -= bSig;
4183 expDiff -= 64;
4184 while ( 0 < expDiff ) {
4185 q = estimateDiv128To64( aSig, 0, bSig );
4186 q = ( 2 < q ) ? q - 2 : 0;
4187 aSig = - ( ( bSig>>2 ) * q );
4188 expDiff -= 62;
4189 }
4190 expDiff += 64;
4191 if ( 0 < expDiff ) {
4192 q = estimateDiv128To64( aSig, 0, bSig );
4193 q = ( 2 < q ) ? q - 2 : 0;
4194 q >>= 64 - expDiff;
4195 bSig >>= 2;
4196 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4197 }
4198 else {
4199 aSig >>= 2;
4200 bSig >>= 2;
4201 }
4202 do {
4203 alternateASig = aSig;
4204 ++q;
4205 aSig -= bSig;
bb98fe42 4206 } while ( 0 <= (int64_t) aSig );
158142c2
FB
4207 sigMean = aSig + alternateASig;
4208 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4209 aSig = alternateASig;
4210 }
bb98fe42 4211 zSign = ( (int64_t) aSig < 0 );
158142c2 4212 if ( zSign ) aSig = - aSig;
ff32e16e 4213 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
4214
4215}
4216
369be8f6
PM
4217/*----------------------------------------------------------------------------
4218| Returns the result of multiplying the double-precision floating-point values
4219| `a' and `b' then adding 'c', with no intermediate rounding step after the
4220| multiplication. The operation is performed according to the IEC/IEEE
4221| Standard for Binary Floating-Point Arithmetic 754-2008.
4222| The flags argument allows the caller to select negation of the
4223| addend, the intermediate product, or the final result. (The difference
4224| between this and having the caller do a separate negation is that negating
4225| externally will flip the sign bit on NaNs.)
4226*----------------------------------------------------------------------------*/
4227
e5a41ffa
PM
4228float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4229 float_status *status)
369be8f6
PM
4230{
4231 flag aSign, bSign, cSign, zSign;
94a49d86 4232 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
4233 uint64_t aSig, bSig, cSig;
4234 flag pInf, pZero, pSign;
4235 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4236 int shiftcount;
4237 flag signflip, infzero;
4238
ff32e16e
PM
4239 a = float64_squash_input_denormal(a, status);
4240 b = float64_squash_input_denormal(b, status);
4241 c = float64_squash_input_denormal(c, status);
369be8f6
PM
4242 aSig = extractFloat64Frac(a);
4243 aExp = extractFloat64Exp(a);
4244 aSign = extractFloat64Sign(a);
4245 bSig = extractFloat64Frac(b);
4246 bExp = extractFloat64Exp(b);
4247 bSign = extractFloat64Sign(b);
4248 cSig = extractFloat64Frac(c);
4249 cExp = extractFloat64Exp(c);
4250 cSign = extractFloat64Sign(c);
4251
4252 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4253 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4254
4255 /* It is implementation-defined whether the cases of (0,inf,qnan)
4256 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4257 * they return if they do), so we have to hand this information
4258 * off to the target-specific pick-a-NaN routine.
4259 */
4260 if (((aExp == 0x7ff) && aSig) ||
4261 ((bExp == 0x7ff) && bSig) ||
4262 ((cExp == 0x7ff) && cSig)) {
ff32e16e 4263 return propagateFloat64MulAddNaN(a, b, c, infzero, status);
369be8f6
PM
4264 }
4265
4266 if (infzero) {
ff32e16e 4267 float_raise(float_flag_invalid, status);
369be8f6
PM
4268 return float64_default_nan;
4269 }
4270
4271 if (flags & float_muladd_negate_c) {
4272 cSign ^= 1;
4273 }
4274
4275 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4276
4277 /* Work out the sign and type of the product */
4278 pSign = aSign ^ bSign;
4279 if (flags & float_muladd_negate_product) {
4280 pSign ^= 1;
4281 }
4282 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4283 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4284
4285 if (cExp == 0x7ff) {
4286 if (pInf && (pSign ^ cSign)) {
4287 /* addition of opposite-signed infinities => InvalidOperation */
ff32e16e 4288 float_raise(float_flag_invalid, status);
369be8f6
PM
4289 return float64_default_nan;
4290 }
4291 /* Otherwise generate an infinity of the same sign */
4292 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4293 }
4294
4295 if (pInf) {
4296 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4297 }
4298
4299 if (pZero) {
4300 if (cExp == 0) {
4301 if (cSig == 0) {
4302 /* Adding two exact zeroes */
4303 if (pSign == cSign) {
4304 zSign = pSign;
a2f2d288 4305 } else if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
4306 zSign = 1;
4307 } else {
4308 zSign = 0;
4309 }
4310 return packFloat64(zSign ^ signflip, 0, 0);
4311 }
4312 /* Exact zero plus a denorm */
a2f2d288 4313 if (status->flush_to_zero) {
ff32e16e 4314 float_raise(float_flag_output_denormal, status);
369be8f6
PM
4315 return packFloat64(cSign ^ signflip, 0, 0);
4316 }
4317 }
4318 /* Zero plus something non-zero : just return the something */
67d43538
PM
4319 if (flags & float_muladd_halve_result) {
4320 if (cExp == 0) {
4321 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4322 }
4323 /* Subtract one to halve, and one again because roundAndPackFloat64
4324 * wants one less than the true exponent.
4325 */
4326 cExp -= 2;
4327 cSig = (cSig | 0x0010000000000000ULL) << 10;
ff32e16e 4328 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
67d43538 4329 }
a6e7c184 4330 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
4331 }
4332
4333 if (aExp == 0) {
4334 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4335 }
4336 if (bExp == 0) {
4337 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4338 }
4339
4340 /* Calculate the actual result a * b + c */
4341
4342 /* Multiply first; this is easy. */
4343 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4344 * because we want the true exponent, not the "one-less-than"
4345 * flavour that roundAndPackFloat64() takes.
4346 */
4347 pExp = aExp + bExp - 0x3fe;
4348 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4349 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4350 mul64To128(aSig, bSig, &pSig0, &pSig1);
4351 if ((int64_t)(pSig0 << 1) >= 0) {
4352 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4353 pExp--;
4354 }
4355
4356 zSign = pSign ^ signflip;
4357
4358 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4359 * bit in position 126.
4360 */
4361 if (cExp == 0) {
4362 if (!cSig) {
4363 /* Throw out the special case of c being an exact zero now */
4364 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
67d43538
PM
4365 if (flags & float_muladd_halve_result) {
4366 pExp--;
4367 }
369be8f6 4368 return roundAndPackFloat64(zSign, pExp - 1,
ff32e16e 4369 pSig1, status);
369be8f6
PM
4370 }
4371 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4372 }
4373
4374 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4375 * significand of the addend, with the explicit bit in position 126.
4376 */
4377 cSig0 = cSig << (126 - 64 - 52);
4378 cSig1 = 0;
4379 cSig0 |= LIT64(0x4000000000000000);
4380 expDiff = pExp - cExp;
4381
4382 if (pSign == cSign) {
4383 /* Addition */
4384 if (expDiff > 0) {
4385 /* scale c to match p */
4386 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4387 zExp = pExp;
4388 } else if (expDiff < 0) {
4389 /* scale p to match c */
4390 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4391 zExp = cExp;
4392 } else {
4393 /* no scaling needed */
4394 zExp = cExp;
4395 }
4396 /* Add significands and make sure explicit bit ends up in posn 126 */
4397 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4398 if ((int64_t)zSig0 < 0) {
4399 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4400 } else {
4401 zExp--;
4402 }
4403 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
67d43538
PM
4404 if (flags & float_muladd_halve_result) {
4405 zExp--;
4406 }
ff32e16e 4407 return roundAndPackFloat64(zSign, zExp, zSig1, status);
369be8f6
PM
4408 } else {
4409 /* Subtraction */
4410 if (expDiff > 0) {
4411 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4412 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4413 zExp = pExp;
4414 } else if (expDiff < 0) {
4415 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4416 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4417 zExp = cExp;
4418 zSign ^= 1;
4419 } else {
4420 zExp = pExp;
4421 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4422 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4423 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4424 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4425 zSign ^= 1;
4426 } else {
4427 /* Exact zero */
4428 zSign = signflip;
a2f2d288 4429 if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
4430 zSign ^= 1;
4431 }
4432 return packFloat64(zSign, 0, 0);
4433 }
4434 }
4435 --zExp;
4436 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4437 * starting with the significand in a pair of uint64_t.
4438 */
4439 if (zSig0) {
4440 shiftcount = countLeadingZeros64(zSig0) - 1;
4441 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4442 if (zSig1) {
4443 zSig0 |= 1;
4444 }
4445 zExp -= shiftcount;
4446 } else {
e3d142d0
PM
4447 shiftcount = countLeadingZeros64(zSig1);
4448 if (shiftcount == 0) {
4449 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4450 zExp -= 63;
4451 } else {
4452 shiftcount--;
4453 zSig0 = zSig1 << shiftcount;
4454 zExp -= (shiftcount + 64);
4455 }
369be8f6 4456 }
67d43538
PM
4457 if (flags & float_muladd_halve_result) {
4458 zExp--;
4459 }
ff32e16e 4460 return roundAndPackFloat64(zSign, zExp, zSig0, status);
369be8f6
PM
4461 }
4462}
4463
158142c2
FB
4464/*----------------------------------------------------------------------------
4465| Returns the square root of the double-precision floating-point value `a'.
4466| The operation is performed according to the IEC/IEEE Standard for Binary
4467| Floating-Point Arithmetic.
4468*----------------------------------------------------------------------------*/
4469
e5a41ffa 4470float64 float64_sqrt(float64 a, float_status *status)
158142c2
FB
4471{
4472 flag aSign;
94a49d86 4473 int_fast16_t aExp, zExp;
bb98fe42
AF
4474 uint64_t aSig, zSig, doubleZSig;
4475 uint64_t rem0, rem1, term0, term1;
ff32e16e 4476 a = float64_squash_input_denormal(a, status);
158142c2
FB
4477
4478 aSig = extractFloat64Frac( a );
4479 aExp = extractFloat64Exp( a );
4480 aSign = extractFloat64Sign( a );
4481 if ( aExp == 0x7FF ) {
ff32e16e
PM
4482 if (aSig) {
4483 return propagateFloat64NaN(a, a, status);
4484 }
158142c2 4485 if ( ! aSign ) return a;
ff32e16e 4486 float_raise(float_flag_invalid, status);
158142c2
FB
4487 return float64_default_nan;
4488 }
4489 if ( aSign ) {
4490 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 4491 float_raise(float_flag_invalid, status);
158142c2
FB
4492 return float64_default_nan;
4493 }
4494 if ( aExp == 0 ) {
f090c9d4 4495 if ( aSig == 0 ) return float64_zero;
158142c2
FB
4496 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4497 }
4498 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4499 aSig |= LIT64( 0x0010000000000000 );
4500 zSig = estimateSqrt32( aExp, aSig>>21 );
4501 aSig <<= 9 - ( aExp & 1 );
4502 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4503 if ( ( zSig & 0x1FF ) <= 5 ) {
4504 doubleZSig = zSig<<1;
4505 mul64To128( zSig, zSig, &term0, &term1 );
4506 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4507 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4508 --zSig;
4509 doubleZSig -= 2;
4510 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4511 }
4512 zSig |= ( ( rem0 | rem1 ) != 0 );
4513 }
ff32e16e 4514 return roundAndPackFloat64(0, zExp, zSig, status);
158142c2
FB
4515
4516}
4517
374dfc33
AJ
4518/*----------------------------------------------------------------------------
4519| Returns the binary log of the double-precision floating-point value `a'.
4520| The operation is performed according to the IEC/IEEE Standard for Binary
4521| Floating-Point Arithmetic.
4522*----------------------------------------------------------------------------*/
e5a41ffa 4523float64 float64_log2(float64 a, float_status *status)
374dfc33
AJ
4524{
4525 flag aSign, zSign;
94a49d86 4526 int_fast16_t aExp;
bb98fe42 4527 uint64_t aSig, aSig0, aSig1, zSig, i;
ff32e16e 4528 a = float64_squash_input_denormal(a, status);
374dfc33
AJ
4529
4530 aSig = extractFloat64Frac( a );
4531 aExp = extractFloat64Exp( a );
4532 aSign = extractFloat64Sign( a );
4533
4534 if ( aExp == 0 ) {
4535 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4536 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4537 }
4538 if ( aSign ) {
ff32e16e 4539 float_raise(float_flag_invalid, status);
374dfc33
AJ
4540 return float64_default_nan;
4541 }
4542 if ( aExp == 0x7FF ) {
ff32e16e
PM
4543 if (aSig) {
4544 return propagateFloat64NaN(a, float64_zero, status);
4545 }
374dfc33
AJ
4546 return a;
4547 }
4548
4549 aExp -= 0x3FF;
4550 aSig |= LIT64( 0x0010000000000000 );
4551 zSign = aExp < 0;
bb98fe42 4552 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4553 for (i = 1LL << 51; i > 0; i >>= 1) {
4554 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4555 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4556 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4557 aSig >>= 1;
4558 zSig |= i;
4559 }
4560 }
4561
4562 if ( zSign )
4563 zSig = -zSig;
ff32e16e 4564 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
374dfc33
AJ
4565}
4566
158142c2
FB
4567/*----------------------------------------------------------------------------
4568| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4569| corresponding value `b', and 0 otherwise. The invalid exception is raised
4570| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4571| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4572*----------------------------------------------------------------------------*/
4573
e5a41ffa 4574int float64_eq(float64 a, float64 b, float_status *status)
158142c2 4575{
bb98fe42 4576 uint64_t av, bv;
ff32e16e
PM
4577 a = float64_squash_input_denormal(a, status);
4578 b = float64_squash_input_denormal(b, status);
158142c2
FB
4579
4580 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4581 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4582 ) {
ff32e16e 4583 float_raise(float_flag_invalid, status);
158142c2
FB
4584 return 0;
4585 }
f090c9d4 4586 av = float64_val(a);
a1b91bb4 4587 bv = float64_val(b);
bb98fe42 4588 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4589
4590}
4591
4592/*----------------------------------------------------------------------------
4593| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4594| equal to the corresponding value `b', and 0 otherwise. The invalid
4595| exception is raised if either operand is a NaN. The comparison is performed
4596| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4597*----------------------------------------------------------------------------*/
4598
e5a41ffa 4599int float64_le(float64 a, float64 b, float_status *status)
158142c2
FB
4600{
4601 flag aSign, bSign;
bb98fe42 4602 uint64_t av, bv;
ff32e16e
PM
4603 a = float64_squash_input_denormal(a, status);
4604 b = float64_squash_input_denormal(b, status);
158142c2
FB
4605
4606 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4607 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4608 ) {
ff32e16e 4609 float_raise(float_flag_invalid, status);
158142c2
FB
4610 return 0;
4611 }
4612 aSign = extractFloat64Sign( a );
4613 bSign = extractFloat64Sign( b );
f090c9d4 4614 av = float64_val(a);
a1b91bb4 4615 bv = float64_val(b);
bb98fe42 4616 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4617 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4618
4619}
4620
4621/*----------------------------------------------------------------------------
4622| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4623| the corresponding value `b', and 0 otherwise. The invalid exception is
4624| raised if either operand is a NaN. The comparison is performed according
4625| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4626*----------------------------------------------------------------------------*/
4627
e5a41ffa 4628int float64_lt(float64 a, float64 b, float_status *status)
158142c2
FB
4629{
4630 flag aSign, bSign;
bb98fe42 4631 uint64_t av, bv;
158142c2 4632
ff32e16e
PM
4633 a = float64_squash_input_denormal(a, status);
4634 b = float64_squash_input_denormal(b, status);
158142c2
FB
4635 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4636 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4637 ) {
ff32e16e 4638 float_raise(float_flag_invalid, status);
158142c2
FB
4639 return 0;
4640 }
4641 aSign = extractFloat64Sign( a );
4642 bSign = extractFloat64Sign( b );
f090c9d4 4643 av = float64_val(a);
a1b91bb4 4644 bv = float64_val(b);
bb98fe42 4645 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4646 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4647
4648}
4649
67b7861d
AJ
4650/*----------------------------------------------------------------------------
4651| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4652| be compared, and 0 otherwise. The invalid exception is raised if either
4653| operand is a NaN. The comparison is performed according to the IEC/IEEE
4654| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4655*----------------------------------------------------------------------------*/
4656
e5a41ffa 4657int float64_unordered(float64 a, float64 b, float_status *status)
67b7861d 4658{
ff32e16e
PM
4659 a = float64_squash_input_denormal(a, status);
4660 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4661
4662 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4663 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4664 ) {
ff32e16e 4665 float_raise(float_flag_invalid, status);
67b7861d
AJ
4666 return 1;
4667 }
4668 return 0;
4669}
4670
158142c2
FB
4671/*----------------------------------------------------------------------------
4672| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4673| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4674| exception.The comparison is performed according to the IEC/IEEE Standard
4675| for Binary Floating-Point Arithmetic.
158142c2
FB
4676*----------------------------------------------------------------------------*/
4677
e5a41ffa 4678int float64_eq_quiet(float64 a, float64 b, float_status *status)
158142c2 4679{
bb98fe42 4680 uint64_t av, bv;
ff32e16e
PM
4681 a = float64_squash_input_denormal(a, status);
4682 b = float64_squash_input_denormal(b, status);
158142c2
FB
4683
4684 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4685 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4686 ) {
b689362d 4687 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4688 float_raise(float_flag_invalid, status);
b689362d 4689 }
158142c2
FB
4690 return 0;
4691 }
f090c9d4 4692 av = float64_val(a);
a1b91bb4 4693 bv = float64_val(b);
bb98fe42 4694 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4695
4696}
4697
4698/*----------------------------------------------------------------------------
4699| Returns 1 if the double-precision floating-point value `a' is less than or
4700| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4701| cause an exception. Otherwise, the comparison is performed according to the
4702| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4703*----------------------------------------------------------------------------*/
4704
e5a41ffa 4705int float64_le_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4706{
4707 flag aSign, bSign;
bb98fe42 4708 uint64_t av, bv;
ff32e16e
PM
4709 a = float64_squash_input_denormal(a, status);
4710 b = float64_squash_input_denormal(b, status);
158142c2
FB
4711
4712 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4713 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4714 ) {
4715 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4716 float_raise(float_flag_invalid, status);
158142c2
FB
4717 }
4718 return 0;
4719 }
4720 aSign = extractFloat64Sign( a );
4721 bSign = extractFloat64Sign( b );
f090c9d4 4722 av = float64_val(a);
a1b91bb4 4723 bv = float64_val(b);
bb98fe42 4724 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4725 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4726
4727}
4728
4729/*----------------------------------------------------------------------------
4730| Returns 1 if the double-precision floating-point value `a' is less than
4731| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4732| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4733| Standard for Binary Floating-Point Arithmetic.
4734*----------------------------------------------------------------------------*/
4735
e5a41ffa 4736int float64_lt_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4737{
4738 flag aSign, bSign;
bb98fe42 4739 uint64_t av, bv;
ff32e16e
PM
4740 a = float64_squash_input_denormal(a, status);
4741 b = float64_squash_input_denormal(b, status);
158142c2
FB
4742
4743 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4744 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4745 ) {
4746 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4747 float_raise(float_flag_invalid, status);
158142c2
FB
4748 }
4749 return 0;
4750 }
4751 aSign = extractFloat64Sign( a );
4752 bSign = extractFloat64Sign( b );
f090c9d4 4753 av = float64_val(a);
a1b91bb4 4754 bv = float64_val(b);
bb98fe42 4755 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4756 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4757
4758}
4759
67b7861d
AJ
4760/*----------------------------------------------------------------------------
4761| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4762| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4763| comparison is performed according to the IEC/IEEE Standard for Binary
4764| Floating-Point Arithmetic.
4765*----------------------------------------------------------------------------*/
4766
e5a41ffa 4767int float64_unordered_quiet(float64 a, float64 b, float_status *status)
67b7861d 4768{
ff32e16e
PM
4769 a = float64_squash_input_denormal(a, status);
4770 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4771
4772 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4773 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4774 ) {
4775 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4776 float_raise(float_flag_invalid, status);
67b7861d
AJ
4777 }
4778 return 1;
4779 }
4780 return 0;
4781}
4782
158142c2
FB
4783/*----------------------------------------------------------------------------
4784| Returns the result of converting the extended double-precision floating-
4785| point value `a' to the 32-bit two's complement integer format. The
4786| conversion is performed according to the IEC/IEEE Standard for Binary
4787| Floating-Point Arithmetic---which means in particular that the conversion
4788| is rounded according to the current rounding mode. If `a' is a NaN, the
4789| largest positive integer is returned. Otherwise, if the conversion
4790| overflows, the largest integer with the same sign as `a' is returned.
4791*----------------------------------------------------------------------------*/
4792
f4014512 4793int32_t floatx80_to_int32(floatx80 a, float_status *status)
158142c2
FB
4794{
4795 flag aSign;
f4014512 4796 int32_t aExp, shiftCount;
bb98fe42 4797 uint64_t aSig;
158142c2
FB
4798
4799 aSig = extractFloatx80Frac( a );
4800 aExp = extractFloatx80Exp( a );
4801 aSign = extractFloatx80Sign( a );
bb98fe42 4802 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4803 shiftCount = 0x4037 - aExp;
4804 if ( shiftCount <= 0 ) shiftCount = 1;
4805 shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 4806 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
4807
4808}
4809
4810/*----------------------------------------------------------------------------
4811| Returns the result of converting the extended double-precision floating-
4812| point value `a' to the 32-bit two's complement integer format. The
4813| conversion is performed according to the IEC/IEEE Standard for Binary
4814| Floating-Point Arithmetic, except that the conversion is always rounded
4815| toward zero. If `a' is a NaN, the largest positive integer is returned.
4816| Otherwise, if the conversion overflows, the largest integer with the same
4817| sign as `a' is returned.
4818*----------------------------------------------------------------------------*/
4819
f4014512 4820int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4821{
4822 flag aSign;
f4014512 4823 int32_t aExp, shiftCount;
bb98fe42 4824 uint64_t aSig, savedASig;
b3a6a2e0 4825 int32_t z;
158142c2
FB
4826
4827 aSig = extractFloatx80Frac( a );
4828 aExp = extractFloatx80Exp( a );
4829 aSign = extractFloatx80Sign( a );
4830 if ( 0x401E < aExp ) {
bb98fe42 4831 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4832 goto invalid;
4833 }
4834 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4835 if (aExp || aSig) {
4836 status->float_exception_flags |= float_flag_inexact;
4837 }
158142c2
FB
4838 return 0;
4839 }
4840 shiftCount = 0x403E - aExp;
4841 savedASig = aSig;
4842 aSig >>= shiftCount;
4843 z = aSig;
4844 if ( aSign ) z = - z;
4845 if ( ( z < 0 ) ^ aSign ) {
4846 invalid:
ff32e16e 4847 float_raise(float_flag_invalid, status);
bb98fe42 4848 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4849 }
4850 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 4851 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4852 }
4853 return z;
4854
4855}
4856
4857/*----------------------------------------------------------------------------
4858| Returns the result of converting the extended double-precision floating-
4859| point value `a' to the 64-bit two's complement integer format. The
4860| conversion is performed according to the IEC/IEEE Standard for Binary
4861| Floating-Point Arithmetic---which means in particular that the conversion
4862| is rounded according to the current rounding mode. If `a' is a NaN,
4863| the largest positive integer is returned. Otherwise, if the conversion
4864| overflows, the largest integer with the same sign as `a' is returned.
4865*----------------------------------------------------------------------------*/
4866
f42c2224 4867int64_t floatx80_to_int64(floatx80 a, float_status *status)
158142c2
FB
4868{
4869 flag aSign;
f4014512 4870 int32_t aExp, shiftCount;
bb98fe42 4871 uint64_t aSig, aSigExtra;
158142c2
FB
4872
4873 aSig = extractFloatx80Frac( a );
4874 aExp = extractFloatx80Exp( a );
4875 aSign = extractFloatx80Sign( a );
4876 shiftCount = 0x403E - aExp;
4877 if ( shiftCount <= 0 ) {
4878 if ( shiftCount ) {
ff32e16e 4879 float_raise(float_flag_invalid, status);
158142c2
FB
4880 if ( ! aSign
4881 || ( ( aExp == 0x7FFF )
4882 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4883 ) {
4884 return LIT64( 0x7FFFFFFFFFFFFFFF );
4885 }
bb98fe42 4886 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4887 }
4888 aSigExtra = 0;
4889 }
4890 else {
4891 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4892 }
ff32e16e 4893 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
4894
4895}
4896
4897/*----------------------------------------------------------------------------
4898| Returns the result of converting the extended double-precision floating-
4899| point value `a' to the 64-bit two's complement integer format. The
4900| conversion is performed according to the IEC/IEEE Standard for Binary
4901| Floating-Point Arithmetic, except that the conversion is always rounded
4902| toward zero. If `a' is a NaN, the largest positive integer is returned.
4903| Otherwise, if the conversion overflows, the largest integer with the same
4904| sign as `a' is returned.
4905*----------------------------------------------------------------------------*/
4906
f42c2224 4907int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4908{
4909 flag aSign;
f4014512 4910 int32_t aExp, shiftCount;
bb98fe42 4911 uint64_t aSig;
f42c2224 4912 int64_t z;
158142c2
FB
4913
4914 aSig = extractFloatx80Frac( a );
4915 aExp = extractFloatx80Exp( a );
4916 aSign = extractFloatx80Sign( a );
4917 shiftCount = aExp - 0x403E;
4918 if ( 0 <= shiftCount ) {
4919 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4920 if ( ( a.high != 0xC03E ) || aSig ) {
ff32e16e 4921 float_raise(float_flag_invalid, status);
158142c2
FB
4922 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4923 return LIT64( 0x7FFFFFFFFFFFFFFF );
4924 }
4925 }
bb98fe42 4926 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4927 }
4928 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4929 if (aExp | aSig) {
4930 status->float_exception_flags |= float_flag_inexact;
4931 }
158142c2
FB
4932 return 0;
4933 }
4934 z = aSig>>( - shiftCount );
bb98fe42 4935 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 4936 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4937 }
4938 if ( aSign ) z = - z;
4939 return z;
4940
4941}
4942
4943/*----------------------------------------------------------------------------
4944| Returns the result of converting the extended double-precision floating-
4945| point value `a' to the single-precision floating-point format. The
4946| conversion is performed according to the IEC/IEEE Standard for Binary
4947| Floating-Point Arithmetic.
4948*----------------------------------------------------------------------------*/
4949
e5a41ffa 4950float32 floatx80_to_float32(floatx80 a, float_status *status)
158142c2
FB
4951{
4952 flag aSign;
f4014512 4953 int32_t aExp;
bb98fe42 4954 uint64_t aSig;
158142c2
FB
4955
4956 aSig = extractFloatx80Frac( a );
4957 aExp = extractFloatx80Exp( a );
4958 aSign = extractFloatx80Sign( a );
4959 if ( aExp == 0x7FFF ) {
bb98fe42 4960 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4961 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4962 }
4963 return packFloat32( aSign, 0xFF, 0 );
4964 }
4965 shift64RightJamming( aSig, 33, &aSig );
4966 if ( aExp || aSig ) aExp -= 0x3F81;
ff32e16e 4967 return roundAndPackFloat32(aSign, aExp, aSig, status);
158142c2
FB
4968
4969}
4970
4971/*----------------------------------------------------------------------------
4972| Returns the result of converting the extended double-precision floating-
4973| point value `a' to the double-precision floating-point format. The
4974| conversion is performed according to the IEC/IEEE Standard for Binary
4975| Floating-Point Arithmetic.
4976*----------------------------------------------------------------------------*/
4977
e5a41ffa 4978float64 floatx80_to_float64(floatx80 a, float_status *status)
158142c2
FB
4979{
4980 flag aSign;
f4014512 4981 int32_t aExp;
bb98fe42 4982 uint64_t aSig, zSig;
158142c2
FB
4983
4984 aSig = extractFloatx80Frac( a );
4985 aExp = extractFloatx80Exp( a );
4986 aSign = extractFloatx80Sign( a );
4987 if ( aExp == 0x7FFF ) {
bb98fe42 4988 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4989 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4990 }
4991 return packFloat64( aSign, 0x7FF, 0 );
4992 }
4993 shift64RightJamming( aSig, 1, &zSig );
4994 if ( aExp || aSig ) aExp -= 0x3C01;
ff32e16e 4995 return roundAndPackFloat64(aSign, aExp, zSig, status);
158142c2
FB
4996
4997}
4998
158142c2
FB
4999/*----------------------------------------------------------------------------
5000| Returns the result of converting the extended double-precision floating-
5001| point value `a' to the quadruple-precision floating-point format. The
5002| conversion is performed according to the IEC/IEEE Standard for Binary
5003| Floating-Point Arithmetic.
5004*----------------------------------------------------------------------------*/
5005
e5a41ffa 5006float128 floatx80_to_float128(floatx80 a, float_status *status)
158142c2
FB
5007{
5008 flag aSign;
94a49d86 5009 int_fast16_t aExp;
bb98fe42 5010 uint64_t aSig, zSig0, zSig1;
158142c2
FB
5011
5012 aSig = extractFloatx80Frac( a );
5013 aExp = extractFloatx80Exp( a );
5014 aSign = extractFloatx80Sign( a );
bb98fe42 5015 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
ff32e16e 5016 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
158142c2
FB
5017 }
5018 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5019 return packFloat128( aSign, aExp, zSig0, zSig1 );
5020
5021}
5022
158142c2
FB
5023/*----------------------------------------------------------------------------
5024| Rounds the extended double-precision floating-point value `a' to an integer,
5025| and returns the result as an extended quadruple-precision floating-point
5026| value. The operation is performed according to the IEC/IEEE Standard for
5027| Binary Floating-Point Arithmetic.
5028*----------------------------------------------------------------------------*/
5029
e5a41ffa 5030floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
158142c2
FB
5031{
5032 flag aSign;
f4014512 5033 int32_t aExp;
bb98fe42 5034 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5035 floatx80 z;
5036
5037 aExp = extractFloatx80Exp( a );
5038 if ( 0x403E <= aExp ) {
bb98fe42 5039 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
ff32e16e 5040 return propagateFloatx80NaN(a, a, status);
158142c2
FB
5041 }
5042 return a;
5043 }
5044 if ( aExp < 0x3FFF ) {
5045 if ( ( aExp == 0 )
bb98fe42 5046 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
5047 return a;
5048 }
a2f2d288 5049 status->float_exception_flags |= float_flag_inexact;
158142c2 5050 aSign = extractFloatx80Sign( a );
a2f2d288 5051 switch (status->float_rounding_mode) {
158142c2 5052 case float_round_nearest_even:
bb98fe42 5053 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
5054 ) {
5055 return
5056 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5057 }
5058 break;
f9288a76
PM
5059 case float_round_ties_away:
5060 if (aExp == 0x3FFE) {
5061 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5062 }
5063 break;
158142c2
FB
5064 case float_round_down:
5065 return
5066 aSign ?
5067 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5068 : packFloatx80( 0, 0, 0 );
5069 case float_round_up:
5070 return
5071 aSign ? packFloatx80( 1, 0, 0 )
5072 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5073 }
5074 return packFloatx80( aSign, 0, 0 );
5075 }
5076 lastBitMask = 1;
5077 lastBitMask <<= 0x403E - aExp;
5078 roundBitsMask = lastBitMask - 1;
5079 z = a;
a2f2d288 5080 switch (status->float_rounding_mode) {
dc355b76 5081 case float_round_nearest_even:
158142c2 5082 z.low += lastBitMask>>1;
dc355b76
PM
5083 if ((z.low & roundBitsMask) == 0) {
5084 z.low &= ~lastBitMask;
5085 }
5086 break;
f9288a76
PM
5087 case float_round_ties_away:
5088 z.low += lastBitMask >> 1;
5089 break;
dc355b76
PM
5090 case float_round_to_zero:
5091 break;
5092 case float_round_up:
5093 if (!extractFloatx80Sign(z)) {
5094 z.low += roundBitsMask;
5095 }
5096 break;
5097 case float_round_down:
5098 if (extractFloatx80Sign(z)) {
158142c2
FB
5099 z.low += roundBitsMask;
5100 }
dc355b76
PM
5101 break;
5102 default:
5103 abort();
158142c2
FB
5104 }
5105 z.low &= ~ roundBitsMask;
5106 if ( z.low == 0 ) {
5107 ++z.high;
5108 z.low = LIT64( 0x8000000000000000 );
5109 }
a2f2d288
PM
5110 if (z.low != a.low) {
5111 status->float_exception_flags |= float_flag_inexact;
5112 }
158142c2
FB
5113 return z;
5114
5115}
5116
5117/*----------------------------------------------------------------------------
5118| Returns the result of adding the absolute values of the extended double-
5119| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5120| negated before being returned. `zSign' is ignored if the result is a NaN.
5121| The addition is performed according to the IEC/IEEE Standard for Binary
5122| Floating-Point Arithmetic.
5123*----------------------------------------------------------------------------*/
5124
e5a41ffa
PM
5125static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5126 float_status *status)
158142c2 5127{
f4014512 5128 int32_t aExp, bExp, zExp;
bb98fe42 5129 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 5130 int32_t expDiff;
158142c2
FB
5131
5132 aSig = extractFloatx80Frac( a );
5133 aExp = extractFloatx80Exp( a );
5134 bSig = extractFloatx80Frac( b );
5135 bExp = extractFloatx80Exp( b );
5136 expDiff = aExp - bExp;
5137 if ( 0 < expDiff ) {
5138 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5139 if ((uint64_t)(aSig << 1)) {
5140 return propagateFloatx80NaN(a, b, status);
5141 }
158142c2
FB
5142 return a;
5143 }
5144 if ( bExp == 0 ) --expDiff;
5145 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5146 zExp = aExp;
5147 }
5148 else if ( expDiff < 0 ) {
5149 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5150 if ((uint64_t)(bSig << 1)) {
5151 return propagateFloatx80NaN(a, b, status);
5152 }
158142c2
FB
5153 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5154 }
5155 if ( aExp == 0 ) ++expDiff;
5156 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5157 zExp = bExp;
5158 }
5159 else {
5160 if ( aExp == 0x7FFF ) {
bb98fe42 5161 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5162 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5163 }
5164 return a;
5165 }
5166 zSig1 = 0;
5167 zSig0 = aSig + bSig;
5168 if ( aExp == 0 ) {
5169 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5170 goto roundAndPack;
5171 }
5172 zExp = aExp;
5173 goto shiftRight1;
5174 }
5175 zSig0 = aSig + bSig;
bb98fe42 5176 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
5177 shiftRight1:
5178 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5179 zSig0 |= LIT64( 0x8000000000000000 );
5180 ++zExp;
5181 roundAndPack:
a2f2d288 5182 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5183 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5184}
5185
5186/*----------------------------------------------------------------------------
5187| Returns the result of subtracting the absolute values of the extended
5188| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5189| difference is negated before being returned. `zSign' is ignored if the
5190| result is a NaN. The subtraction is performed according to the IEC/IEEE
5191| Standard for Binary Floating-Point Arithmetic.
5192*----------------------------------------------------------------------------*/
5193
e5a41ffa
PM
5194static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5195 float_status *status)
158142c2 5196{
f4014512 5197 int32_t aExp, bExp, zExp;
bb98fe42 5198 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 5199 int32_t expDiff;
158142c2
FB
5200 floatx80 z;
5201
5202 aSig = extractFloatx80Frac( a );
5203 aExp = extractFloatx80Exp( a );
5204 bSig = extractFloatx80Frac( b );
5205 bExp = extractFloatx80Exp( b );
5206 expDiff = aExp - bExp;
5207 if ( 0 < expDiff ) goto aExpBigger;
5208 if ( expDiff < 0 ) goto bExpBigger;
5209 if ( aExp == 0x7FFF ) {
bb98fe42 5210 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5211 return propagateFloatx80NaN(a, b, status);
158142c2 5212 }
ff32e16e 5213 float_raise(float_flag_invalid, status);
158142c2
FB
5214 z.low = floatx80_default_nan_low;
5215 z.high = floatx80_default_nan_high;
5216 return z;
5217 }
5218 if ( aExp == 0 ) {
5219 aExp = 1;
5220 bExp = 1;
5221 }
5222 zSig1 = 0;
5223 if ( bSig < aSig ) goto aBigger;
5224 if ( aSig < bSig ) goto bBigger;
a2f2d288 5225 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
5226 bExpBigger:
5227 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5228 if ((uint64_t)(bSig << 1)) {
5229 return propagateFloatx80NaN(a, b, status);
5230 }
158142c2
FB
5231 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5232 }
5233 if ( aExp == 0 ) ++expDiff;
5234 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5235 bBigger:
5236 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5237 zExp = bExp;
5238 zSign ^= 1;
5239 goto normalizeRoundAndPack;
5240 aExpBigger:
5241 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5242 if ((uint64_t)(aSig << 1)) {
5243 return propagateFloatx80NaN(a, b, status);
5244 }
158142c2
FB
5245 return a;
5246 }
5247 if ( bExp == 0 ) --expDiff;
5248 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5249 aBigger:
5250 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5251 zExp = aExp;
5252 normalizeRoundAndPack:
a2f2d288 5253 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5254 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5255}
5256
5257/*----------------------------------------------------------------------------
5258| Returns the result of adding the extended double-precision floating-point
5259| values `a' and `b'. The operation is performed according to the IEC/IEEE
5260| Standard for Binary Floating-Point Arithmetic.
5261*----------------------------------------------------------------------------*/
5262
e5a41ffa 5263floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5264{
5265 flag aSign, bSign;
5266
5267 aSign = extractFloatx80Sign( a );
5268 bSign = extractFloatx80Sign( b );
5269 if ( aSign == bSign ) {
ff32e16e 5270 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5271 }
5272 else {
ff32e16e 5273 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5274 }
5275
5276}
5277
5278/*----------------------------------------------------------------------------
5279| Returns the result of subtracting the extended double-precision floating-
5280| point values `a' and `b'. The operation is performed according to the
5281| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5282*----------------------------------------------------------------------------*/
5283
e5a41ffa 5284floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5285{
5286 flag aSign, bSign;
5287
5288 aSign = extractFloatx80Sign( a );
5289 bSign = extractFloatx80Sign( b );
5290 if ( aSign == bSign ) {
ff32e16e 5291 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5292 }
5293 else {
ff32e16e 5294 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5295 }
5296
5297}
5298
5299/*----------------------------------------------------------------------------
5300| Returns the result of multiplying the extended double-precision floating-
5301| point values `a' and `b'. The operation is performed according to the
5302| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5303*----------------------------------------------------------------------------*/
5304
e5a41ffa 5305floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5306{
5307 flag aSign, bSign, zSign;
f4014512 5308 int32_t aExp, bExp, zExp;
bb98fe42 5309 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
5310 floatx80 z;
5311
5312 aSig = extractFloatx80Frac( a );
5313 aExp = extractFloatx80Exp( a );
5314 aSign = extractFloatx80Sign( a );
5315 bSig = extractFloatx80Frac( b );
5316 bExp = extractFloatx80Exp( b );
5317 bSign = extractFloatx80Sign( b );
5318 zSign = aSign ^ bSign;
5319 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5320 if ( (uint64_t) ( aSig<<1 )
5321 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5322 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5323 }
5324 if ( ( bExp | bSig ) == 0 ) goto invalid;
5325 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5326 }
5327 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5328 if ((uint64_t)(bSig << 1)) {
5329 return propagateFloatx80NaN(a, b, status);
5330 }
158142c2
FB
5331 if ( ( aExp | aSig ) == 0 ) {
5332 invalid:
ff32e16e 5333 float_raise(float_flag_invalid, status);
158142c2
FB
5334 z.low = floatx80_default_nan_low;
5335 z.high = floatx80_default_nan_high;
5336 return z;
5337 }
5338 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5339 }
5340 if ( aExp == 0 ) {
5341 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5342 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5343 }
5344 if ( bExp == 0 ) {
5345 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5346 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5347 }
5348 zExp = aExp + bExp - 0x3FFE;
5349 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 5350 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
5351 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5352 --zExp;
5353 }
a2f2d288 5354 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5355 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5356}
5357
5358/*----------------------------------------------------------------------------
5359| Returns the result of dividing the extended double-precision floating-point
5360| value `a' by the corresponding value `b'. The operation is performed
5361| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5362*----------------------------------------------------------------------------*/
5363
e5a41ffa 5364floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5365{
5366 flag aSign, bSign, zSign;
f4014512 5367 int32_t aExp, bExp, zExp;
bb98fe42
AF
5368 uint64_t aSig, bSig, zSig0, zSig1;
5369 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
5370 floatx80 z;
5371
5372 aSig = extractFloatx80Frac( a );
5373 aExp = extractFloatx80Exp( a );
5374 aSign = extractFloatx80Sign( a );
5375 bSig = extractFloatx80Frac( b );
5376 bExp = extractFloatx80Exp( b );
5377 bSign = extractFloatx80Sign( b );
5378 zSign = aSign ^ bSign;
5379 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5380 if ((uint64_t)(aSig << 1)) {
5381 return propagateFloatx80NaN(a, b, status);
5382 }
158142c2 5383 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5384 if ((uint64_t)(bSig << 1)) {
5385 return propagateFloatx80NaN(a, b, status);
5386 }
158142c2
FB
5387 goto invalid;
5388 }
5389 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5390 }
5391 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5392 if ((uint64_t)(bSig << 1)) {
5393 return propagateFloatx80NaN(a, b, status);
5394 }
158142c2
FB
5395 return packFloatx80( zSign, 0, 0 );
5396 }
5397 if ( bExp == 0 ) {
5398 if ( bSig == 0 ) {
5399 if ( ( aExp | aSig ) == 0 ) {
5400 invalid:
ff32e16e 5401 float_raise(float_flag_invalid, status);
158142c2
FB
5402 z.low = floatx80_default_nan_low;
5403 z.high = floatx80_default_nan_high;
5404 return z;
5405 }
ff32e16e 5406 float_raise(float_flag_divbyzero, status);
158142c2
FB
5407 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5408 }
5409 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5410 }
5411 if ( aExp == 0 ) {
5412 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5413 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5414 }
5415 zExp = aExp - bExp + 0x3FFE;
5416 rem1 = 0;
5417 if ( bSig <= aSig ) {
5418 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5419 ++zExp;
5420 }
5421 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5422 mul64To128( bSig, zSig0, &term0, &term1 );
5423 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 5424 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5425 --zSig0;
5426 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5427 }
5428 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 5429 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
5430 mul64To128( bSig, zSig1, &term1, &term2 );
5431 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 5432 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5433 --zSig1;
5434 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5435 }
5436 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5437 }
a2f2d288 5438 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5439 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5440}
5441
5442/*----------------------------------------------------------------------------
5443| Returns the remainder of the extended double-precision floating-point value
5444| `a' with respect to the corresponding value `b'. The operation is performed
5445| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5446*----------------------------------------------------------------------------*/
5447
e5a41ffa 5448floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
158142c2 5449{
ed086f3d 5450 flag aSign, zSign;
f4014512 5451 int32_t aExp, bExp, expDiff;
bb98fe42
AF
5452 uint64_t aSig0, aSig1, bSig;
5453 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
5454 floatx80 z;
5455
5456 aSig0 = extractFloatx80Frac( a );
5457 aExp = extractFloatx80Exp( a );
5458 aSign = extractFloatx80Sign( a );
5459 bSig = extractFloatx80Frac( b );
5460 bExp = extractFloatx80Exp( b );
158142c2 5461 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5462 if ( (uint64_t) ( aSig0<<1 )
5463 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5464 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5465 }
5466 goto invalid;
5467 }
5468 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5469 if ((uint64_t)(bSig << 1)) {
5470 return propagateFloatx80NaN(a, b, status);
5471 }
158142c2
FB
5472 return a;
5473 }
5474 if ( bExp == 0 ) {
5475 if ( bSig == 0 ) {
5476 invalid:
ff32e16e 5477 float_raise(float_flag_invalid, status);
158142c2
FB
5478 z.low = floatx80_default_nan_low;
5479 z.high = floatx80_default_nan_high;
5480 return z;
5481 }
5482 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5483 }
5484 if ( aExp == 0 ) {
bb98fe42 5485 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5486 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5487 }
5488 bSig |= LIT64( 0x8000000000000000 );
5489 zSign = aSign;
5490 expDiff = aExp - bExp;
5491 aSig1 = 0;
5492 if ( expDiff < 0 ) {
5493 if ( expDiff < -1 ) return a;
5494 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5495 expDiff = 0;
5496 }
5497 q = ( bSig <= aSig0 );
5498 if ( q ) aSig0 -= bSig;
5499 expDiff -= 64;
5500 while ( 0 < expDiff ) {
5501 q = estimateDiv128To64( aSig0, aSig1, bSig );
5502 q = ( 2 < q ) ? q - 2 : 0;
5503 mul64To128( bSig, q, &term0, &term1 );
5504 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5505 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5506 expDiff -= 62;
5507 }
5508 expDiff += 64;
5509 if ( 0 < expDiff ) {
5510 q = estimateDiv128To64( aSig0, aSig1, bSig );
5511 q = ( 2 < q ) ? q - 2 : 0;
5512 q >>= 64 - expDiff;
5513 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5514 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5515 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5516 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5517 ++q;
5518 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5519 }
5520 }
5521 else {
5522 term1 = 0;
5523 term0 = bSig;
5524 }
5525 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5526 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5527 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5528 && ( q & 1 ) )
5529 ) {
5530 aSig0 = alternateASig0;
5531 aSig1 = alternateASig1;
5532 zSign = ! zSign;
5533 }
5534 return
5535 normalizeRoundAndPackFloatx80(
ff32e16e 5536 80, zSign, bExp + expDiff, aSig0, aSig1, status);
158142c2
FB
5537
5538}
5539
5540/*----------------------------------------------------------------------------
5541| Returns the square root of the extended double-precision floating-point
5542| value `a'. The operation is performed according to the IEC/IEEE Standard
5543| for Binary Floating-Point Arithmetic.
5544*----------------------------------------------------------------------------*/
5545
e5a41ffa 5546floatx80 floatx80_sqrt(floatx80 a, float_status *status)
158142c2
FB
5547{
5548 flag aSign;
f4014512 5549 int32_t aExp, zExp;
bb98fe42
AF
5550 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5551 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5552 floatx80 z;
5553
5554 aSig0 = extractFloatx80Frac( a );
5555 aExp = extractFloatx80Exp( a );
5556 aSign = extractFloatx80Sign( a );
5557 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5558 if ((uint64_t)(aSig0 << 1)) {
5559 return propagateFloatx80NaN(a, a, status);
5560 }
158142c2
FB
5561 if ( ! aSign ) return a;
5562 goto invalid;
5563 }
5564 if ( aSign ) {
5565 if ( ( aExp | aSig0 ) == 0 ) return a;
5566 invalid:
ff32e16e 5567 float_raise(float_flag_invalid, status);
158142c2
FB
5568 z.low = floatx80_default_nan_low;
5569 z.high = floatx80_default_nan_high;
5570 return z;
5571 }
5572 if ( aExp == 0 ) {
5573 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5574 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5575 }
5576 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5577 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5578 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5579 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5580 doubleZSig0 = zSig0<<1;
5581 mul64To128( zSig0, zSig0, &term0, &term1 );
5582 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5583 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5584 --zSig0;
5585 doubleZSig0 -= 2;
5586 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5587 }
5588 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5589 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5590 if ( zSig1 == 0 ) zSig1 = 1;
5591 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5592 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5593 mul64To128( zSig1, zSig1, &term2, &term3 );
5594 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5595 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5596 --zSig1;
5597 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5598 term3 |= 1;
5599 term2 |= doubleZSig0;
5600 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5601 }
5602 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5603 }
5604 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5605 zSig0 |= doubleZSig0;
a2f2d288
PM
5606 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5607 0, zExp, zSig0, zSig1, status);
158142c2
FB
5608}
5609
5610/*----------------------------------------------------------------------------
b689362d
AJ
5611| Returns 1 if the extended double-precision floating-point value `a' is equal
5612| to the corresponding value `b', and 0 otherwise. The invalid exception is
5613| raised if either operand is a NaN. Otherwise, the comparison is performed
5614| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5615*----------------------------------------------------------------------------*/
5616
e5a41ffa 5617int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5618{
5619
5620 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5621 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5622 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5623 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5624 ) {
ff32e16e 5625 float_raise(float_flag_invalid, status);
158142c2
FB
5626 return 0;
5627 }
5628 return
5629 ( a.low == b.low )
5630 && ( ( a.high == b.high )
5631 || ( ( a.low == 0 )
bb98fe42 5632 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5633 );
5634
5635}
5636
5637/*----------------------------------------------------------------------------
5638| Returns 1 if the extended double-precision floating-point value `a' is
5639| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5640| invalid exception is raised if either operand is a NaN. The comparison is
5641| performed according to the IEC/IEEE Standard for Binary Floating-Point
5642| Arithmetic.
158142c2
FB
5643*----------------------------------------------------------------------------*/
5644
e5a41ffa 5645int floatx80_le(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5646{
5647 flag aSign, bSign;
5648
5649 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5650 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5651 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5652 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5653 ) {
ff32e16e 5654 float_raise(float_flag_invalid, status);
158142c2
FB
5655 return 0;
5656 }
5657 aSign = extractFloatx80Sign( a );
5658 bSign = extractFloatx80Sign( b );
5659 if ( aSign != bSign ) {
5660 return
5661 aSign
bb98fe42 5662 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5663 == 0 );
5664 }
5665 return
5666 aSign ? le128( b.high, b.low, a.high, a.low )
5667 : le128( a.high, a.low, b.high, b.low );
5668
5669}
5670
5671/*----------------------------------------------------------------------------
5672| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5673| less than the corresponding value `b', and 0 otherwise. The invalid
5674| exception is raised if either operand is a NaN. The comparison is performed
5675| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5676*----------------------------------------------------------------------------*/
5677
e5a41ffa 5678int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5679{
5680 flag aSign, bSign;
5681
5682 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5683 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5684 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5685 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5686 ) {
ff32e16e 5687 float_raise(float_flag_invalid, status);
158142c2
FB
5688 return 0;
5689 }
5690 aSign = extractFloatx80Sign( a );
5691 bSign = extractFloatx80Sign( b );
5692 if ( aSign != bSign ) {
5693 return
5694 aSign
bb98fe42 5695 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5696 != 0 );
5697 }
5698 return
5699 aSign ? lt128( b.high, b.low, a.high, a.low )
5700 : lt128( a.high, a.low, b.high, b.low );
5701
5702}
5703
67b7861d
AJ
5704/*----------------------------------------------------------------------------
5705| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5706| cannot be compared, and 0 otherwise. The invalid exception is raised if
5707| either operand is a NaN. The comparison is performed according to the
5708| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d 5709*----------------------------------------------------------------------------*/
e5a41ffa 5710int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
67b7861d
AJ
5711{
5712 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5713 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5714 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5715 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5716 ) {
ff32e16e 5717 float_raise(float_flag_invalid, status);
67b7861d
AJ
5718 return 1;
5719 }
5720 return 0;
5721}
5722
158142c2 5723/*----------------------------------------------------------------------------
b689362d 5724| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5725| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5726| cause an exception. The comparison is performed according to the IEC/IEEE
5727| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5728*----------------------------------------------------------------------------*/
5729
e5a41ffa 5730int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5731{
5732
5733 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5734 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5735 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5736 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5737 ) {
b689362d
AJ
5738 if ( floatx80_is_signaling_nan( a )
5739 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5740 float_raise(float_flag_invalid, status);
b689362d 5741 }
158142c2
FB
5742 return 0;
5743 }
5744 return
5745 ( a.low == b.low )
5746 && ( ( a.high == b.high )
5747 || ( ( a.low == 0 )
bb98fe42 5748 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5749 );
5750
5751}
5752
5753/*----------------------------------------------------------------------------
5754| Returns 1 if the extended double-precision floating-point value `a' is less
5755| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5756| do not cause an exception. Otherwise, the comparison is performed according
5757| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5758*----------------------------------------------------------------------------*/
5759
e5a41ffa 5760int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5761{
5762 flag aSign, bSign;
5763
5764 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5765 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5766 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5767 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5768 ) {
5769 if ( floatx80_is_signaling_nan( a )
5770 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5771 float_raise(float_flag_invalid, status);
158142c2
FB
5772 }
5773 return 0;
5774 }
5775 aSign = extractFloatx80Sign( a );
5776 bSign = extractFloatx80Sign( b );
5777 if ( aSign != bSign ) {
5778 return
5779 aSign
bb98fe42 5780 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5781 == 0 );
5782 }
5783 return
5784 aSign ? le128( b.high, b.low, a.high, a.low )
5785 : le128( a.high, a.low, b.high, b.low );
5786
5787}
5788
5789/*----------------------------------------------------------------------------
5790| Returns 1 if the extended double-precision floating-point value `a' is less
5791| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5792| an exception. Otherwise, the comparison is performed according to the
5793| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5794*----------------------------------------------------------------------------*/
5795
e5a41ffa 5796int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5797{
5798 flag aSign, bSign;
5799
5800 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5801 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5802 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5803 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5804 ) {
5805 if ( floatx80_is_signaling_nan( a )
5806 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5807 float_raise(float_flag_invalid, status);
158142c2
FB
5808 }
5809 return 0;
5810 }
5811 aSign = extractFloatx80Sign( a );
5812 bSign = extractFloatx80Sign( b );
5813 if ( aSign != bSign ) {
5814 return
5815 aSign
bb98fe42 5816 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5817 != 0 );
5818 }
5819 return
5820 aSign ? lt128( b.high, b.low, a.high, a.low )
5821 : lt128( a.high, a.low, b.high, b.low );
5822
5823}
5824
67b7861d
AJ
5825/*----------------------------------------------------------------------------
5826| Returns 1 if the extended double-precision floating-point values `a' and `b'
5827| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5828| The comparison is performed according to the IEC/IEEE Standard for Binary
5829| Floating-Point Arithmetic.
5830*----------------------------------------------------------------------------*/
e5a41ffa 5831int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
67b7861d
AJ
5832{
5833 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5834 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5835 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5836 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5837 ) {
5838 if ( floatx80_is_signaling_nan( a )
5839 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5840 float_raise(float_flag_invalid, status);
67b7861d
AJ
5841 }
5842 return 1;
5843 }
5844 return 0;
5845}
5846
158142c2
FB
5847/*----------------------------------------------------------------------------
5848| Returns the result of converting the quadruple-precision floating-point
5849| value `a' to the 32-bit two's complement integer format. The conversion
5850| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5851| Arithmetic---which means in particular that the conversion is rounded
5852| according to the current rounding mode. If `a' is a NaN, the largest
5853| positive integer is returned. Otherwise, if the conversion overflows, the
5854| largest integer with the same sign as `a' is returned.
5855*----------------------------------------------------------------------------*/
5856
f4014512 5857int32_t float128_to_int32(float128 a, float_status *status)
158142c2
FB
5858{
5859 flag aSign;
f4014512 5860 int32_t aExp, shiftCount;
bb98fe42 5861 uint64_t aSig0, aSig1;
158142c2
FB
5862
5863 aSig1 = extractFloat128Frac1( a );
5864 aSig0 = extractFloat128Frac0( a );
5865 aExp = extractFloat128Exp( a );
5866 aSign = extractFloat128Sign( a );
5867 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5868 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5869 aSig0 |= ( aSig1 != 0 );
5870 shiftCount = 0x4028 - aExp;
5871 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
ff32e16e 5872 return roundAndPackInt32(aSign, aSig0, status);
158142c2
FB
5873
5874}
5875
5876/*----------------------------------------------------------------------------
5877| Returns the result of converting the quadruple-precision floating-point
5878| value `a' to the 32-bit two's complement integer format. The conversion
5879| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5880| Arithmetic, except that the conversion is always rounded toward zero. If
5881| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5882| conversion overflows, the largest integer with the same sign as `a' is
5883| returned.
5884*----------------------------------------------------------------------------*/
5885
f4014512 5886int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
158142c2
FB
5887{
5888 flag aSign;
f4014512 5889 int32_t aExp, shiftCount;
bb98fe42 5890 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5891 int32_t z;
158142c2
FB
5892
5893 aSig1 = extractFloat128Frac1( a );
5894 aSig0 = extractFloat128Frac0( a );
5895 aExp = extractFloat128Exp( a );
5896 aSign = extractFloat128Sign( a );
5897 aSig0 |= ( aSig1 != 0 );
5898 if ( 0x401E < aExp ) {
5899 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5900 goto invalid;
5901 }
5902 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
5903 if (aExp || aSig0) {
5904 status->float_exception_flags |= float_flag_inexact;
5905 }
158142c2
FB
5906 return 0;
5907 }
5908 aSig0 |= LIT64( 0x0001000000000000 );
5909 shiftCount = 0x402F - aExp;
5910 savedASig = aSig0;
5911 aSig0 >>= shiftCount;
5912 z = aSig0;
5913 if ( aSign ) z = - z;
5914 if ( ( z < 0 ) ^ aSign ) {
5915 invalid:
ff32e16e 5916 float_raise(float_flag_invalid, status);
bb98fe42 5917 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5918 }
5919 if ( ( aSig0<<shiftCount ) != savedASig ) {
a2f2d288 5920 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5921 }
5922 return z;
5923
5924}
5925
5926/*----------------------------------------------------------------------------
5927| Returns the result of converting the quadruple-precision floating-point
5928| value `a' to the 64-bit two's complement integer format. The conversion
5929| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5930| Arithmetic---which means in particular that the conversion is rounded
5931| according to the current rounding mode. If `a' is a NaN, the largest
5932| positive integer is returned. Otherwise, if the conversion overflows, the
5933| largest integer with the same sign as `a' is returned.
5934*----------------------------------------------------------------------------*/
5935
f42c2224 5936int64_t float128_to_int64(float128 a, float_status *status)
158142c2
FB
5937{
5938 flag aSign;
f4014512 5939 int32_t aExp, shiftCount;
bb98fe42 5940 uint64_t aSig0, aSig1;
158142c2
FB
5941
5942 aSig1 = extractFloat128Frac1( a );
5943 aSig0 = extractFloat128Frac0( a );
5944 aExp = extractFloat128Exp( a );
5945 aSign = extractFloat128Sign( a );
5946 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5947 shiftCount = 0x402F - aExp;
5948 if ( shiftCount <= 0 ) {
5949 if ( 0x403E < aExp ) {
ff32e16e 5950 float_raise(float_flag_invalid, status);
158142c2
FB
5951 if ( ! aSign
5952 || ( ( aExp == 0x7FFF )
5953 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5954 )
5955 ) {
5956 return LIT64( 0x7FFFFFFFFFFFFFFF );
5957 }
bb98fe42 5958 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5959 }
5960 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5961 }
5962 else {
5963 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5964 }
ff32e16e 5965 return roundAndPackInt64(aSign, aSig0, aSig1, status);
158142c2
FB
5966
5967}
5968
5969/*----------------------------------------------------------------------------
5970| Returns the result of converting the quadruple-precision floating-point
5971| value `a' to the 64-bit two's complement integer format. The conversion
5972| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5973| Arithmetic, except that the conversion is always rounded toward zero.
5974| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5975| the conversion overflows, the largest integer with the same sign as `a' is
5976| returned.
5977*----------------------------------------------------------------------------*/
5978
f42c2224 5979int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
158142c2
FB
5980{
5981 flag aSign;
f4014512 5982 int32_t aExp, shiftCount;
bb98fe42 5983 uint64_t aSig0, aSig1;
f42c2224 5984 int64_t z;
158142c2
FB
5985
5986 aSig1 = extractFloat128Frac1( a );
5987 aSig0 = extractFloat128Frac0( a );
5988 aExp = extractFloat128Exp( a );
5989 aSign = extractFloat128Sign( a );
5990 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5991 shiftCount = aExp - 0x402F;
5992 if ( 0 < shiftCount ) {
5993 if ( 0x403E <= aExp ) {
5994 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5995 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5996 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
a2f2d288
PM
5997 if (aSig1) {
5998 status->float_exception_flags |= float_flag_inexact;
5999 }
158142c2
FB
6000 }
6001 else {
ff32e16e 6002 float_raise(float_flag_invalid, status);
158142c2
FB
6003 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6004 return LIT64( 0x7FFFFFFFFFFFFFFF );
6005 }
6006 }
bb98fe42 6007 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
6008 }
6009 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 6010 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
a2f2d288 6011 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6012 }
6013 }
6014 else {
6015 if ( aExp < 0x3FFF ) {
6016 if ( aExp | aSig0 | aSig1 ) {
a2f2d288 6017 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6018 }
6019 return 0;
6020 }
6021 z = aSig0>>( - shiftCount );
6022 if ( aSig1
bb98fe42 6023 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
a2f2d288 6024 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6025 }
6026 }
6027 if ( aSign ) z = - z;
6028 return z;
6029
6030}
6031
6032/*----------------------------------------------------------------------------
6033| Returns the result of converting the quadruple-precision floating-point
6034| value `a' to the single-precision floating-point format. The conversion
6035| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6036| Arithmetic.
6037*----------------------------------------------------------------------------*/
6038
e5a41ffa 6039float32 float128_to_float32(float128 a, float_status *status)
158142c2
FB
6040{
6041 flag aSign;
f4014512 6042 int32_t aExp;
bb98fe42
AF
6043 uint64_t aSig0, aSig1;
6044 uint32_t zSig;
158142c2
FB
6045
6046 aSig1 = extractFloat128Frac1( a );
6047 aSig0 = extractFloat128Frac0( a );
6048 aExp = extractFloat128Exp( a );
6049 aSign = extractFloat128Sign( a );
6050 if ( aExp == 0x7FFF ) {
6051 if ( aSig0 | aSig1 ) {
ff32e16e 6052 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
158142c2
FB
6053 }
6054 return packFloat32( aSign, 0xFF, 0 );
6055 }
6056 aSig0 |= ( aSig1 != 0 );
6057 shift64RightJamming( aSig0, 18, &aSig0 );
6058 zSig = aSig0;
6059 if ( aExp || zSig ) {
6060 zSig |= 0x40000000;
6061 aExp -= 0x3F81;
6062 }
ff32e16e 6063 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
6064
6065}
6066
6067/*----------------------------------------------------------------------------
6068| Returns the result of converting the quadruple-precision floating-point
6069| value `a' to the double-precision floating-point format. The conversion
6070| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6071| Arithmetic.
6072*----------------------------------------------------------------------------*/
6073
e5a41ffa 6074float64 float128_to_float64(float128 a, float_status *status)
158142c2
FB
6075{
6076 flag aSign;
f4014512 6077 int32_t aExp;
bb98fe42 6078 uint64_t aSig0, aSig1;
158142c2
FB
6079
6080 aSig1 = extractFloat128Frac1( a );
6081 aSig0 = extractFloat128Frac0( a );
6082 aExp = extractFloat128Exp( a );
6083 aSign = extractFloat128Sign( a );
6084 if ( aExp == 0x7FFF ) {
6085 if ( aSig0 | aSig1 ) {
ff32e16e 6086 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
158142c2
FB
6087 }
6088 return packFloat64( aSign, 0x7FF, 0 );
6089 }
6090 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6091 aSig0 |= ( aSig1 != 0 );
6092 if ( aExp || aSig0 ) {
6093 aSig0 |= LIT64( 0x4000000000000000 );
6094 aExp -= 0x3C01;
6095 }
ff32e16e 6096 return roundAndPackFloat64(aSign, aExp, aSig0, status);
158142c2
FB
6097
6098}
6099
158142c2
FB
6100/*----------------------------------------------------------------------------
6101| Returns the result of converting the quadruple-precision floating-point
6102| value `a' to the extended double-precision floating-point format. The
6103| conversion is performed according to the IEC/IEEE Standard for Binary
6104| Floating-Point Arithmetic.
6105*----------------------------------------------------------------------------*/
6106
e5a41ffa 6107floatx80 float128_to_floatx80(float128 a, float_status *status)
158142c2
FB
6108{
6109 flag aSign;
f4014512 6110 int32_t aExp;
bb98fe42 6111 uint64_t aSig0, aSig1;
158142c2
FB
6112
6113 aSig1 = extractFloat128Frac1( a );
6114 aSig0 = extractFloat128Frac0( a );
6115 aExp = extractFloat128Exp( a );
6116 aSign = extractFloat128Sign( a );
6117 if ( aExp == 0x7FFF ) {
6118 if ( aSig0 | aSig1 ) {
ff32e16e 6119 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
158142c2
FB
6120 }
6121 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6122 }
6123 if ( aExp == 0 ) {
6124 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6125 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6126 }
6127 else {
6128 aSig0 |= LIT64( 0x0001000000000000 );
6129 }
6130 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
ff32e16e 6131 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
158142c2
FB
6132
6133}
6134
158142c2
FB
6135/*----------------------------------------------------------------------------
6136| Rounds the quadruple-precision floating-point value `a' to an integer, and
6137| returns the result as a quadruple-precision floating-point value. The
6138| operation is performed according to the IEC/IEEE Standard for Binary
6139| Floating-Point Arithmetic.
6140*----------------------------------------------------------------------------*/
6141
e5a41ffa 6142float128 float128_round_to_int(float128 a, float_status *status)
158142c2
FB
6143{
6144 flag aSign;
f4014512 6145 int32_t aExp;
bb98fe42 6146 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
6147 float128 z;
6148
6149 aExp = extractFloat128Exp( a );
6150 if ( 0x402F <= aExp ) {
6151 if ( 0x406F <= aExp ) {
6152 if ( ( aExp == 0x7FFF )
6153 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6154 ) {
ff32e16e 6155 return propagateFloat128NaN(a, a, status);
158142c2
FB
6156 }
6157 return a;
6158 }
6159 lastBitMask = 1;
6160 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6161 roundBitsMask = lastBitMask - 1;
6162 z = a;
a2f2d288 6163 switch (status->float_rounding_mode) {
dc355b76 6164 case float_round_nearest_even:
158142c2
FB
6165 if ( lastBitMask ) {
6166 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6167 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6168 }
6169 else {
bb98fe42 6170 if ( (int64_t) z.low < 0 ) {
158142c2 6171 ++z.high;
bb98fe42 6172 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
6173 }
6174 }
dc355b76 6175 break;
f9288a76
PM
6176 case float_round_ties_away:
6177 if (lastBitMask) {
6178 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6179 } else {
6180 if ((int64_t) z.low < 0) {
6181 ++z.high;
6182 }
6183 }
6184 break;
dc355b76
PM
6185 case float_round_to_zero:
6186 break;
6187 case float_round_up:
6188 if (!extractFloat128Sign(z)) {
6189 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6190 }
6191 break;
6192 case float_round_down:
6193 if (extractFloat128Sign(z)) {
6194 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 6195 }
dc355b76
PM
6196 break;
6197 default:
6198 abort();
158142c2
FB
6199 }
6200 z.low &= ~ roundBitsMask;
6201 }
6202 else {
6203 if ( aExp < 0x3FFF ) {
bb98fe42 6204 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
a2f2d288 6205 status->float_exception_flags |= float_flag_inexact;
158142c2 6206 aSign = extractFloat128Sign( a );
a2f2d288 6207 switch (status->float_rounding_mode) {
158142c2
FB
6208 case float_round_nearest_even:
6209 if ( ( aExp == 0x3FFE )
6210 && ( extractFloat128Frac0( a )
6211 | extractFloat128Frac1( a ) )
6212 ) {
6213 return packFloat128( aSign, 0x3FFF, 0, 0 );
6214 }
6215 break;
f9288a76
PM
6216 case float_round_ties_away:
6217 if (aExp == 0x3FFE) {
6218 return packFloat128(aSign, 0x3FFF, 0, 0);
6219 }
6220 break;
158142c2
FB
6221 case float_round_down:
6222 return
6223 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6224 : packFloat128( 0, 0, 0, 0 );
6225 case float_round_up:
6226 return
6227 aSign ? packFloat128( 1, 0, 0, 0 )
6228 : packFloat128( 0, 0x3FFF, 0, 0 );
6229 }
6230 return packFloat128( aSign, 0, 0, 0 );
6231 }
6232 lastBitMask = 1;
6233 lastBitMask <<= 0x402F - aExp;
6234 roundBitsMask = lastBitMask - 1;
6235 z.low = 0;
6236 z.high = a.high;
a2f2d288 6237 switch (status->float_rounding_mode) {
dc355b76 6238 case float_round_nearest_even:
158142c2
FB
6239 z.high += lastBitMask>>1;
6240 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6241 z.high &= ~ lastBitMask;
6242 }
dc355b76 6243 break;
f9288a76
PM
6244 case float_round_ties_away:
6245 z.high += lastBitMask>>1;
6246 break;
dc355b76
PM
6247 case float_round_to_zero:
6248 break;
6249 case float_round_up:
6250 if (!extractFloat128Sign(z)) {
158142c2
FB
6251 z.high |= ( a.low != 0 );
6252 z.high += roundBitsMask;
6253 }
dc355b76
PM
6254 break;
6255 case float_round_down:
6256 if (extractFloat128Sign(z)) {
6257 z.high |= (a.low != 0);
6258 z.high += roundBitsMask;
6259 }
6260 break;
6261 default:
6262 abort();
158142c2
FB
6263 }
6264 z.high &= ~ roundBitsMask;
6265 }
6266 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
a2f2d288 6267 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6268 }
6269 return z;
6270
6271}
6272
6273/*----------------------------------------------------------------------------
6274| Returns the result of adding the absolute values of the quadruple-precision
6275| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6276| before being returned. `zSign' is ignored if the result is a NaN.
6277| The addition is performed according to the IEC/IEEE Standard for Binary
6278| Floating-Point Arithmetic.
6279*----------------------------------------------------------------------------*/
6280
e5a41ffa
PM
6281static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6282 float_status *status)
158142c2 6283{
f4014512 6284 int32_t aExp, bExp, zExp;
bb98fe42 6285 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
f4014512 6286 int32_t expDiff;
158142c2
FB
6287
6288 aSig1 = extractFloat128Frac1( a );
6289 aSig0 = extractFloat128Frac0( a );
6290 aExp = extractFloat128Exp( a );
6291 bSig1 = extractFloat128Frac1( b );
6292 bSig0 = extractFloat128Frac0( b );
6293 bExp = extractFloat128Exp( b );
6294 expDiff = aExp - bExp;
6295 if ( 0 < expDiff ) {
6296 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6297 if (aSig0 | aSig1) {
6298 return propagateFloat128NaN(a, b, status);
6299 }
158142c2
FB
6300 return a;
6301 }
6302 if ( bExp == 0 ) {
6303 --expDiff;
6304 }
6305 else {
6306 bSig0 |= LIT64( 0x0001000000000000 );
6307 }
6308 shift128ExtraRightJamming(
6309 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6310 zExp = aExp;
6311 }
6312 else if ( expDiff < 0 ) {
6313 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6314 if (bSig0 | bSig1) {
6315 return propagateFloat128NaN(a, b, status);
6316 }
158142c2
FB
6317 return packFloat128( zSign, 0x7FFF, 0, 0 );
6318 }
6319 if ( aExp == 0 ) {
6320 ++expDiff;
6321 }
6322 else {
6323 aSig0 |= LIT64( 0x0001000000000000 );
6324 }
6325 shift128ExtraRightJamming(
6326 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6327 zExp = bExp;
6328 }
6329 else {
6330 if ( aExp == 0x7FFF ) {
6331 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6332 return propagateFloat128NaN(a, b, status);
158142c2
FB
6333 }
6334 return a;
6335 }
6336 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 6337 if ( aExp == 0 ) {
a2f2d288 6338 if (status->flush_to_zero) {
e6afc87f 6339 if (zSig0 | zSig1) {
ff32e16e 6340 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
6341 }
6342 return packFloat128(zSign, 0, 0, 0);
6343 }
fe76d976
PB
6344 return packFloat128( zSign, 0, zSig0, zSig1 );
6345 }
158142c2
FB
6346 zSig2 = 0;
6347 zSig0 |= LIT64( 0x0002000000000000 );
6348 zExp = aExp;
6349 goto shiftRight1;
6350 }
6351 aSig0 |= LIT64( 0x0001000000000000 );
6352 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6353 --zExp;
6354 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6355 ++zExp;
6356 shiftRight1:
6357 shift128ExtraRightJamming(
6358 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6359 roundAndPack:
ff32e16e 6360 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6361
6362}
6363
6364/*----------------------------------------------------------------------------
6365| Returns the result of subtracting the absolute values of the quadruple-
6366| precision floating-point values `a' and `b'. If `zSign' is 1, the
6367| difference is negated before being returned. `zSign' is ignored if the
6368| result is a NaN. The subtraction is performed according to the IEC/IEEE
6369| Standard for Binary Floating-Point Arithmetic.
6370*----------------------------------------------------------------------------*/
6371
e5a41ffa
PM
6372static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6373 float_status *status)
158142c2 6374{
f4014512 6375 int32_t aExp, bExp, zExp;
bb98fe42 6376 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
f4014512 6377 int32_t expDiff;
158142c2
FB
6378 float128 z;
6379
6380 aSig1 = extractFloat128Frac1( a );
6381 aSig0 = extractFloat128Frac0( a );
6382 aExp = extractFloat128Exp( a );
6383 bSig1 = extractFloat128Frac1( b );
6384 bSig0 = extractFloat128Frac0( b );
6385 bExp = extractFloat128Exp( b );
6386 expDiff = aExp - bExp;
6387 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6388 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6389 if ( 0 < expDiff ) goto aExpBigger;
6390 if ( expDiff < 0 ) goto bExpBigger;
6391 if ( aExp == 0x7FFF ) {
6392 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6393 return propagateFloat128NaN(a, b, status);
158142c2 6394 }
ff32e16e 6395 float_raise(float_flag_invalid, status);
158142c2
FB
6396 z.low = float128_default_nan_low;
6397 z.high = float128_default_nan_high;
6398 return z;
6399 }
6400 if ( aExp == 0 ) {
6401 aExp = 1;
6402 bExp = 1;
6403 }
6404 if ( bSig0 < aSig0 ) goto aBigger;
6405 if ( aSig0 < bSig0 ) goto bBigger;
6406 if ( bSig1 < aSig1 ) goto aBigger;
6407 if ( aSig1 < bSig1 ) goto bBigger;
a2f2d288
PM
6408 return packFloat128(status->float_rounding_mode == float_round_down,
6409 0, 0, 0);
158142c2
FB
6410 bExpBigger:
6411 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6412 if (bSig0 | bSig1) {
6413 return propagateFloat128NaN(a, b, status);
6414 }
158142c2
FB
6415 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6416 }
6417 if ( aExp == 0 ) {
6418 ++expDiff;
6419 }
6420 else {
6421 aSig0 |= LIT64( 0x4000000000000000 );
6422 }
6423 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6424 bSig0 |= LIT64( 0x4000000000000000 );
6425 bBigger:
6426 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6427 zExp = bExp;
6428 zSign ^= 1;
6429 goto normalizeRoundAndPack;
6430 aExpBigger:
6431 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6432 if (aSig0 | aSig1) {
6433 return propagateFloat128NaN(a, b, status);
6434 }
158142c2
FB
6435 return a;
6436 }
6437 if ( bExp == 0 ) {
6438 --expDiff;
6439 }
6440 else {
6441 bSig0 |= LIT64( 0x4000000000000000 );
6442 }
6443 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6444 aSig0 |= LIT64( 0x4000000000000000 );
6445 aBigger:
6446 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6447 zExp = aExp;
6448 normalizeRoundAndPack:
6449 --zExp;
ff32e16e
PM
6450 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6451 status);
158142c2
FB
6452
6453}
6454
6455/*----------------------------------------------------------------------------
6456| Returns the result of adding the quadruple-precision floating-point values
6457| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6458| for Binary Floating-Point Arithmetic.
6459*----------------------------------------------------------------------------*/
6460
e5a41ffa 6461float128 float128_add(float128 a, float128 b, float_status *status)
158142c2
FB
6462{
6463 flag aSign, bSign;
6464
6465 aSign = extractFloat128Sign( a );
6466 bSign = extractFloat128Sign( b );
6467 if ( aSign == bSign ) {
ff32e16e 6468 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6469 }
6470 else {
ff32e16e 6471 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6472 }
6473
6474}
6475
6476/*----------------------------------------------------------------------------
6477| Returns the result of subtracting the quadruple-precision floating-point
6478| values `a' and `b'. The operation is performed according to the IEC/IEEE
6479| Standard for Binary Floating-Point Arithmetic.
6480*----------------------------------------------------------------------------*/
6481
e5a41ffa 6482float128 float128_sub(float128 a, float128 b, float_status *status)
158142c2
FB
6483{
6484 flag aSign, bSign;
6485
6486 aSign = extractFloat128Sign( a );
6487 bSign = extractFloat128Sign( b );
6488 if ( aSign == bSign ) {
ff32e16e 6489 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6490 }
6491 else {
ff32e16e 6492 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6493 }
6494
6495}
6496
6497/*----------------------------------------------------------------------------
6498| Returns the result of multiplying the quadruple-precision floating-point
6499| values `a' and `b'. The operation is performed according to the IEC/IEEE
6500| Standard for Binary Floating-Point Arithmetic.
6501*----------------------------------------------------------------------------*/
6502
e5a41ffa 6503float128 float128_mul(float128 a, float128 b, float_status *status)
158142c2
FB
6504{
6505 flag aSign, bSign, zSign;
f4014512 6506 int32_t aExp, bExp, zExp;
bb98fe42 6507 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6508 float128 z;
6509
6510 aSig1 = extractFloat128Frac1( a );
6511 aSig0 = extractFloat128Frac0( a );
6512 aExp = extractFloat128Exp( a );
6513 aSign = extractFloat128Sign( a );
6514 bSig1 = extractFloat128Frac1( b );
6515 bSig0 = extractFloat128Frac0( b );
6516 bExp = extractFloat128Exp( b );
6517 bSign = extractFloat128Sign( b );
6518 zSign = aSign ^ bSign;
6519 if ( aExp == 0x7FFF ) {
6520 if ( ( aSig0 | aSig1 )
6521 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6522 return propagateFloat128NaN(a, b, status);
158142c2
FB
6523 }
6524 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6525 return packFloat128( zSign, 0x7FFF, 0, 0 );
6526 }
6527 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6528 if (bSig0 | bSig1) {
6529 return propagateFloat128NaN(a, b, status);
6530 }
158142c2
FB
6531 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6532 invalid:
ff32e16e 6533 float_raise(float_flag_invalid, status);
158142c2
FB
6534 z.low = float128_default_nan_low;
6535 z.high = float128_default_nan_high;
6536 return z;
6537 }
6538 return packFloat128( zSign, 0x7FFF, 0, 0 );
6539 }
6540 if ( aExp == 0 ) {
6541 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6542 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6543 }
6544 if ( bExp == 0 ) {
6545 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6546 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6547 }
6548 zExp = aExp + bExp - 0x4000;
6549 aSig0 |= LIT64( 0x0001000000000000 );
6550 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6551 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6552 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6553 zSig2 |= ( zSig3 != 0 );
6554 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6555 shift128ExtraRightJamming(
6556 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6557 ++zExp;
6558 }
ff32e16e 6559 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6560
6561}
6562
6563/*----------------------------------------------------------------------------
6564| Returns the result of dividing the quadruple-precision floating-point value
6565| `a' by the corresponding value `b'. The operation is performed according to
6566| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6567*----------------------------------------------------------------------------*/
6568
e5a41ffa 6569float128 float128_div(float128 a, float128 b, float_status *status)
158142c2
FB
6570{
6571 flag aSign, bSign, zSign;
f4014512 6572 int32_t aExp, bExp, zExp;
bb98fe42
AF
6573 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6574 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6575 float128 z;
6576
6577 aSig1 = extractFloat128Frac1( a );
6578 aSig0 = extractFloat128Frac0( a );
6579 aExp = extractFloat128Exp( a );
6580 aSign = extractFloat128Sign( a );
6581 bSig1 = extractFloat128Frac1( b );
6582 bSig0 = extractFloat128Frac0( b );
6583 bExp = extractFloat128Exp( b );
6584 bSign = extractFloat128Sign( b );
6585 zSign = aSign ^ bSign;
6586 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6587 if (aSig0 | aSig1) {
6588 return propagateFloat128NaN(a, b, status);
6589 }
158142c2 6590 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6591 if (bSig0 | bSig1) {
6592 return propagateFloat128NaN(a, b, status);
6593 }
158142c2
FB
6594 goto invalid;
6595 }
6596 return packFloat128( zSign, 0x7FFF, 0, 0 );
6597 }
6598 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6599 if (bSig0 | bSig1) {
6600 return propagateFloat128NaN(a, b, status);
6601 }
158142c2
FB
6602 return packFloat128( zSign, 0, 0, 0 );
6603 }
6604 if ( bExp == 0 ) {
6605 if ( ( bSig0 | bSig1 ) == 0 ) {
6606 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6607 invalid:
ff32e16e 6608 float_raise(float_flag_invalid, status);
158142c2
FB
6609 z.low = float128_default_nan_low;
6610 z.high = float128_default_nan_high;
6611 return z;
6612 }
ff32e16e 6613 float_raise(float_flag_divbyzero, status);
158142c2
FB
6614 return packFloat128( zSign, 0x7FFF, 0, 0 );
6615 }
6616 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6617 }
6618 if ( aExp == 0 ) {
6619 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6620 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6621 }
6622 zExp = aExp - bExp + 0x3FFD;
6623 shortShift128Left(
6624 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6625 shortShift128Left(
6626 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6627 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6628 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6629 ++zExp;
6630 }
6631 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6632 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6633 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6634 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6635 --zSig0;
6636 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6637 }
6638 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6639 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6640 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6641 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6642 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6643 --zSig1;
6644 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6645 }
6646 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6647 }
6648 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
ff32e16e 6649 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6650
6651}
6652
6653/*----------------------------------------------------------------------------
6654| Returns the remainder of the quadruple-precision floating-point value `a'
6655| with respect to the corresponding value `b'. The operation is performed
6656| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6657*----------------------------------------------------------------------------*/
6658
/*
 * float128_rem: special operands are dispatched first (NaN propagation,
 * invalid-operation cases and trivial returns of `a'); otherwise the
 * significand of `a' is reduced by multiples of the significand of `b' and
 * the result is packed with an exponent derived from bExp.
 */
e5a41ffa 6659float128 float128_rem(float128 a, float128 b, float_status *status)
158142c2 6660{
ed086f3d 6661 flag aSign, zSign;
f4014512 6662 int32_t aExp, bExp, expDiff;
bb98fe42
AF
6663 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6664 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6665 int64_t sigMean0;
158142c2
FB
6666 float128 z;
6667
6668 aSig1 = extractFloat128Frac1( a );
6669 aSig0 = extractFloat128Frac0( a );
6670 aExp = extractFloat128Exp( a );
6671 aSign = extractFloat128Sign( a );
6672 bSig1 = extractFloat128Frac1( b );
6673 bSig0 = extractFloat128Frac0( b );
6674 bExp = extractFloat128Exp( b );
158142c2
FB
/* `a' has the maximum exponent: propagate a NaN when `a' has a non-zero
 * fraction or `b' is itself max-exponent with a non-zero fraction;
 * otherwise take the invalid-operation path below. */
6675 if ( aExp == 0x7FFF ) {
6676 if ( ( aSig0 | aSig1 )
6677 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6678 return propagateFloat128NaN(a, b, status);
158142c2
FB
6679 }
6680 goto invalid;
6681 }
/* `b' has the maximum exponent: propagate when its fraction is non-zero,
 * otherwise `a' is returned unchanged. */
6682 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6683 if (bSig0 | bSig1) {
6684 return propagateFloat128NaN(a, b, status);
6685 }
158142c2
FB
6686 return a;
6687 }
/* `b' with zero exponent and zero fraction: raise invalid and return the
 * default NaN.  A non-zero fraction is normalized instead. */
6688 if ( bExp == 0 ) {
6689 if ( ( bSig0 | bSig1 ) == 0 ) {
6690 invalid:
ff32e16e 6691 float_raise(float_flag_invalid, status);
158142c2
FB
6692 z.low = float128_default_nan_low;
6693 z.high = float128_default_nan_high;
6694 return z;
6695 }
6696 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6697 }
/* `a' with zero exponent and zero fraction is returned as is. */
6698 if ( aExp == 0 ) {
6699 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6700 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6701 }
6702 expDiff = aExp - bExp;
/* When the exponent of `a' is more than one below that of `b', `a' is
 * returned unchanged. */
6703 if ( expDiff < -1 ) return a;
/* Set the implicit integer bit on both significands and left-align them;
 * `a' is shifted one bit less when expDiff is negative. */
6704 shortShift128Left(
6705 aSig0 | LIT64( 0x0001000000000000 ),
6706 aSig1,
6707 15 - ( expDiff < 0 ),
6708 &aSig0,
6709 &aSig1
6710 );
6711 shortShift128Left(
6712 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* q starts as 1 when bSig <= aSig, in which case bSig is subtracted once. */
6713 q = le128( bSig0, bSig1, aSig0, aSig1 );
6714 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6715 expDiff -= 64;
/* Each pass subtracts an estimated multiple of bSig from aSig and consumes
 * 61 bits of the exponent difference.  The estimate is lowered by 4 and
 * clamped at 0 -- presumably so it never exceeds the true quotient
 * (NOTE(review): confirm against estimateDiv128To64's error bound). */
6716 while ( 0 < expDiff ) {
6717 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6718 q = ( 4 < q ) ? q - 4 : 0;
6719 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6720 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6721 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6722 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6723 expDiff -= 61;
6724 }
/* Final partial step: the quotient estimate is scaled down by the remaining
 * (negative) exponent difference before the last multiply-subtract.  When
 * no partial step is needed both significands are just shifted right by 12. */
6725 if ( -64 < expDiff ) {
6726 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6727 q = ( 4 < q ) ? q - 4 : 0;
6728 q >>= - expDiff;
6729 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6730 expDiff += 52;
6731 if ( expDiff < 0 ) {
6732 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6733 }
6734 else {
6735 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6736 }
6737 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6738 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6739 }
6740 else {
6741 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6742 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6743 }
/* Keep subtracting bSig (counting in q) until the remainder turns negative;
 * the value before the last subtraction is kept as the alternate candidate. */
6744 do {
6745 alternateASig0 = aSig0;
6746 alternateASig1 = aSig1;
6747 ++q;
6748 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6749 } while ( 0 <= (int64_t) aSig0 );
/* The sum of both candidates decides which one is kept: a negative sum, or
 * a zero sum with odd q, selects the alternate (pre-subtraction) value. */
158142c2 6750 add128(
bb98fe42 6751 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6752 if ( ( sigMean0 < 0 )
6753 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6754 aSig0 = alternateASig0;
6755 aSig1 = alternateASig1;
6756 }
/* A negative remainder is negated and its sign is XORed into the sign of
 * `a' for the packed result. */
bb98fe42 6757 zSign = ( (int64_t) aSig0 < 0 );
158142c2 6758 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
ff32e16e
PM
6759 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6760 status);
158142c2
FB
6761}
6762
6763/*----------------------------------------------------------------------------
6764| Returns the square root of the quadruple-precision floating-point value `a'.
6765| The operation is performed according to the IEC/IEEE Standard for Binary
6766| Floating-Point Arithmetic.
6767*----------------------------------------------------------------------------*/
6768
/*
 * float128_sqrt: special operands are handled first; otherwise a 64-bit
 * root estimate is built and corrected, a second 64-bit word is estimated
 * from the remainder, and the result is rounded and packed with sign 0.
 */
e5a41ffa 6769float128 float128_sqrt(float128 a, float_status *status)
158142c2
FB
6770{
6771 flag aSign;
f4014512 6772 int32_t aExp, zExp;
bb98fe42
AF
6773 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6774 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6775 float128 z;
6776
6777 aSig1 = extractFloat128Frac1( a );
6778 aSig0 = extractFloat128Frac0( a );
6779 aExp = extractFloat128Exp( a );
6780 aSign = extractFloat128Sign( a );
/* Maximum exponent: a non-zero fraction propagates a NaN; with a zero
 * fraction a clear sign returns `a' and a set sign is invalid. */
6781 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6782 if (aSig0 | aSig1) {
6783 return propagateFloat128NaN(a, a, status);
6784 }
158142c2
FB
6785 if ( ! aSign ) return a;
6786 goto invalid;
6787 }
/* Sign set: an all-zero exponent and fraction returns `a' unchanged; any
 * other value raises invalid and yields the default NaN. */
6788 if ( aSign ) {
6789 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6790 invalid:
ff32e16e 6791 float_raise(float_flag_invalid, status);
158142c2
FB
6792 z.low = float128_default_nan_low;
6793 z.high = float128_default_nan_high;
6794 return z;
6795 }
/* Zero exponent: a zero fraction returns an all-zero packed value, a
 * non-zero fraction is normalized first. */
6796 if ( aExp == 0 ) {
6797 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6798 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6799 }
/* Result exponent: half of (aExp - 0x3FFF), re-offset by 0x3FFE. */
6800 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6801 aSig0 |= LIT64( 0x0001000000000000 );
/* First estimate from estimateSqrt32, then widened to 64 bits with one
 * estimateDiv128To64 step. */
6802 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6803 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6804 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6805 doubleZSig0 = zSig0<<1;
6806 mul64To128( zSig0, zSig0, &term0, &term1 );
6807 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
/* While the remainder is negative, step zSig0 down by one and add the
 * matching correction back into the remainder. */
bb98fe42 6808 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6809 --zSig0;
6810 doubleZSig0 -= 2;
6811 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6812 }
6813 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
/* Only when the low 13 bits of zSig1 are <= 5 is the remainder computed
 * exactly, zSig1 corrected, and any non-zero remainder ORed into bit 0. */
6814 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6815 if ( zSig1 == 0 ) zSig1 = 1;
6816 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6817 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6818 mul64To128( zSig1, zSig1, &term2, &term3 );
6819 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6820 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6821 --zSig1;
6822 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6823 term3 |= 1;
6824 term2 |= doubleZSig0;
6825 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6826 }
6827 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6828 }
/* Shift right by 14 with jamming into zSig2, then round and pack. */
6829 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
ff32e16e 6830 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6831
6832}
6833
6834/*----------------------------------------------------------------------------
6835| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6836| the corresponding value `b', and 0 otherwise. The invalid exception is
6837| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6838| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6839*----------------------------------------------------------------------------*/
6840
e5a41ffa 6841int float128_eq(float128 a, float128 b, float_status *status)
158142c2
FB
6842{
6843
6844 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6845 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6846 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6847 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6848 ) {
ff32e16e 6849 float_raise(float_flag_invalid, status);
158142c2
FB
6850 return 0;
6851 }
6852 return
6853 ( a.low == b.low )
6854 && ( ( a.high == b.high )
6855 || ( ( a.low == 0 )
bb98fe42 6856 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6857 );
6858
6859}
6860
6861/*----------------------------------------------------------------------------
6862| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6863| or equal to the corresponding value `b', and 0 otherwise. The invalid
6864| exception is raised if either operand is a NaN. The comparison is performed
6865| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6866*----------------------------------------------------------------------------*/
6867
e5a41ffa 6868int float128_le(float128 a, float128 b, float_status *status)
158142c2
FB
6869{
6870 flag aSign, bSign;
6871
6872 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6873 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6874 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6875 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6876 ) {
ff32e16e 6877 float_raise(float_flag_invalid, status);
158142c2
FB
6878 return 0;
6879 }
6880 aSign = extractFloat128Sign( a );
6881 bSign = extractFloat128Sign( b );
6882 if ( aSign != bSign ) {
6883 return
6884 aSign
bb98fe42 6885 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6886 == 0 );
6887 }
6888 return
6889 aSign ? le128( b.high, b.low, a.high, a.low )
6890 : le128( a.high, a.low, b.high, b.low );
6891
6892}
6893
6894/*----------------------------------------------------------------------------
6895| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6896| the corresponding value `b', and 0 otherwise. The invalid exception is
6897| raised if either operand is a NaN. The comparison is performed according
6898| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6899*----------------------------------------------------------------------------*/
6900
e5a41ffa 6901int float128_lt(float128 a, float128 b, float_status *status)
158142c2
FB
6902{
6903 flag aSign, bSign;
6904
6905 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6906 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6907 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6908 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6909 ) {
ff32e16e 6910 float_raise(float_flag_invalid, status);
158142c2
FB
6911 return 0;
6912 }
6913 aSign = extractFloat128Sign( a );
6914 bSign = extractFloat128Sign( b );
6915 if ( aSign != bSign ) {
6916 return
6917 aSign
bb98fe42 6918 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6919 != 0 );
6920 }
6921 return
6922 aSign ? lt128( b.high, b.low, a.high, a.low )
6923 : lt128( a.high, a.low, b.high, b.low );
6924
6925}
6926
67b7861d
AJ
6927/*----------------------------------------------------------------------------
6928| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6929| be compared, and 0 otherwise. The invalid exception is raised if either
6930| operand is a NaN. The comparison is performed according to the IEC/IEEE
6931| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6932*----------------------------------------------------------------------------*/
6933
e5a41ffa 6934int float128_unordered(float128 a, float128 b, float_status *status)
67b7861d
AJ
6935{
6936 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6937 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6938 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6939 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6940 ) {
ff32e16e 6941 float_raise(float_flag_invalid, status);
67b7861d
AJ
6942 return 1;
6943 }
6944 return 0;
6945}
6946
158142c2
FB
6947/*----------------------------------------------------------------------------
6948| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6949| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6950| exception. The comparison is performed according to the IEC/IEEE Standard
6951| for Binary Floating-Point Arithmetic.
158142c2
FB
6952*----------------------------------------------------------------------------*/
6953
e5a41ffa 6954int float128_eq_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6955{
6956
6957 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6958 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6959 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6960 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6961 ) {
b689362d
AJ
6962 if ( float128_is_signaling_nan( a )
6963 || float128_is_signaling_nan( b ) ) {
ff32e16e 6964 float_raise(float_flag_invalid, status);
b689362d 6965 }
158142c2
FB
6966 return 0;
6967 }
6968 return
6969 ( a.low == b.low )
6970 && ( ( a.high == b.high )
6971 || ( ( a.low == 0 )
bb98fe42 6972 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6973 );
6974
6975}
6976
6977/*----------------------------------------------------------------------------
6978| Returns 1 if the quadruple-precision floating-point value `a' is less than
6979| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6980| cause an exception. Otherwise, the comparison is performed according to the
6981| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6982*----------------------------------------------------------------------------*/
6983
e5a41ffa 6984int float128_le_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6985{
6986 flag aSign, bSign;
6987
6988 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6989 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6990 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6991 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6992 ) {
6993 if ( float128_is_signaling_nan( a )
6994 || float128_is_signaling_nan( b ) ) {
ff32e16e 6995 float_raise(float_flag_invalid, status);
158142c2
FB
6996 }
6997 return 0;
6998 }
6999 aSign = extractFloat128Sign( a );
7000 bSign = extractFloat128Sign( b );
7001 if ( aSign != bSign ) {
7002 return
7003 aSign
bb98fe42 7004 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7005 == 0 );
7006 }
7007 return
7008 aSign ? le128( b.high, b.low, a.high, a.low )
7009 : le128( a.high, a.low, b.high, b.low );
7010
7011}
7012
7013/*----------------------------------------------------------------------------
7014| Returns 1 if the quadruple-precision floating-point value `a' is less than
7015| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7016| exception. Otherwise, the comparison is performed according to the IEC/IEEE
7017| Standard for Binary Floating-Point Arithmetic.
7018*----------------------------------------------------------------------------*/
7019
e5a41ffa 7020int float128_lt_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
7021{
7022 flag aSign, bSign;
7023
7024 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7025 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7026 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7027 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7028 ) {
7029 if ( float128_is_signaling_nan( a )
7030 || float128_is_signaling_nan( b ) ) {
ff32e16e 7031 float_raise(float_flag_invalid, status);
158142c2
FB
7032 }
7033 return 0;
7034 }
7035 aSign = extractFloat128Sign( a );
7036 bSign = extractFloat128Sign( b );
7037 if ( aSign != bSign ) {
7038 return
7039 aSign
bb98fe42 7040 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7041 != 0 );
7042 }
7043 return
7044 aSign ? lt128( b.high, b.low, a.high, a.low )
7045 : lt128( a.high, a.low, b.high, b.low );
7046
7047}
7048
67b7861d
AJ
7049/*----------------------------------------------------------------------------
7050| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7051| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7052| comparison is performed according to the IEC/IEEE Standard for Binary
7053| Floating-Point Arithmetic.
7054*----------------------------------------------------------------------------*/
7055
e5a41ffa 7056int float128_unordered_quiet(float128 a, float128 b, float_status *status)
67b7861d
AJ
7057{
7058 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7059 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7060 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7061 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7062 ) {
7063 if ( float128_is_signaling_nan( a )
7064 || float128_is_signaling_nan( b ) ) {
ff32e16e 7065 float_raise(float_flag_invalid, status);
67b7861d
AJ
7066 }
7067 return 1;
7068 }
7069 return 0;
7070}
7071
1d6bda35 7072/* misc functions */
e5a41ffa 7073float32 uint32_to_float32(uint32_t a, float_status *status)
1d6bda35 7074{
ff32e16e 7075 return int64_to_float32(a, status);
1d6bda35
FB
7076}
7077
e5a41ffa 7078float64 uint32_to_float64(uint32_t a, float_status *status)
1d6bda35 7079{
ff32e16e 7080 return int64_to_float64(a, status);
1d6bda35
FB
7081}
7082
3a87d009 7083uint32_t float32_to_uint32(float32 a, float_status *status)
1d6bda35
FB
7084{
7085 int64_t v;
3a87d009 7086 uint32_t res;
34e1c27b 7087 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7088
ff32e16e 7089 v = float32_to_int64(a, status);
1d6bda35
FB
7090 if (v < 0) {
7091 res = 0;
1d6bda35
FB
7092 } else if (v > 0xffffffff) {
7093 res = 0xffffffff;
1d6bda35 7094 } else {
34e1c27b 7095 return v;
1d6bda35 7096 }
34e1c27b 7097 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7098 float_raise(float_flag_invalid, status);
1d6bda35
FB
7099 return res;
7100}
7101
3a87d009 7102uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
1d6bda35
FB
7103{
7104 int64_t v;
3a87d009 7105 uint32_t res;
34e1c27b 7106 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7107
ff32e16e 7108 v = float32_to_int64_round_to_zero(a, status);
1d6bda35
FB
7109 if (v < 0) {
7110 res = 0;
1d6bda35
FB
7111 } else if (v > 0xffffffff) {
7112 res = 0xffffffff;
1d6bda35 7113 } else {
34e1c27b 7114 return v;
1d6bda35 7115 }
34e1c27b 7116 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7117 float_raise(float_flag_invalid, status);
1d6bda35
FB
7118 return res;
7119}
7120
e5a41ffa 7121int_fast16_t float32_to_int16(float32 a, float_status *status)
f581bf54
WN
7122{
7123 int32_t v;
7124 int_fast16_t res;
7125 int old_exc_flags = get_float_exception_flags(status);
7126
ff32e16e 7127 v = float32_to_int32(a, status);
f581bf54
WN
7128 if (v < -0x8000) {
7129 res = -0x8000;
7130 } else if (v > 0x7fff) {
7131 res = 0x7fff;
7132 } else {
7133 return v;
7134 }
7135
7136 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7137 float_raise(float_flag_invalid, status);
f581bf54
WN
7138 return res;
7139}
7140
e5a41ffa 7141uint_fast16_t float32_to_uint16(float32 a, float_status *status)
f581bf54
WN
7142{
7143 int32_t v;
7144 uint_fast16_t res;
7145 int old_exc_flags = get_float_exception_flags(status);
7146
ff32e16e 7147 v = float32_to_int32(a, status);
f581bf54
WN
7148 if (v < 0) {
7149 res = 0;
7150 } else if (v > 0xffff) {
7151 res = 0xffff;
7152 } else {
7153 return v;
7154 }
7155
7156 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7157 float_raise(float_flag_invalid, status);
f581bf54
WN
7158 return res;
7159}
7160
e5a41ffa 7161uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
7162{
7163 int64_t v;
5aea4c58 7164 uint_fast16_t res;
34e1c27b 7165 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7166
ff32e16e 7167 v = float32_to_int64_round_to_zero(a, status);
cbcef455
PM
7168 if (v < 0) {
7169 res = 0;
cbcef455
PM
7170 } else if (v > 0xffff) {
7171 res = 0xffff;
cbcef455 7172 } else {
34e1c27b 7173 return v;
cbcef455 7174 }
34e1c27b 7175 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7176 float_raise(float_flag_invalid, status);
cbcef455
PM
7177 return res;
7178}
7179
3a87d009 7180uint32_t float64_to_uint32(float64 a, float_status *status)
1d6bda35 7181{
5e7f654f 7182 uint64_t v;
3a87d009 7183 uint32_t res;
5e7f654f 7184 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7185
ff32e16e 7186 v = float64_to_uint64(a, status);
5e7f654f 7187 if (v > 0xffffffff) {
1d6bda35 7188 res = 0xffffffff;
1d6bda35 7189 } else {
5e7f654f 7190 return v;
1d6bda35 7191 }
5e7f654f 7192 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7193 float_raise(float_flag_invalid, status);
1d6bda35
FB
7194 return res;
7195}
7196
3a87d009 7197uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
1d6bda35 7198{
fd728f2f 7199 uint64_t v;
3a87d009 7200 uint32_t res;
fd728f2f 7201 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7202
ff32e16e 7203 v = float64_to_uint64_round_to_zero(a, status);
fd728f2f 7204 if (v > 0xffffffff) {
1d6bda35 7205 res = 0xffffffff;
1d6bda35 7206 } else {
fd728f2f 7207 return v;
1d6bda35 7208 }
fd728f2f 7209 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7210 float_raise(float_flag_invalid, status);
1d6bda35
FB
7211 return res;
7212}
7213
e5a41ffa 7214int_fast16_t float64_to_int16(float64 a, float_status *status)
f581bf54
WN
7215{
7216 int64_t v;
7217 int_fast16_t res;
7218 int old_exc_flags = get_float_exception_flags(status);
7219
ff32e16e 7220 v = float64_to_int32(a, status);
f581bf54
WN
7221 if (v < -0x8000) {
7222 res = -0x8000;
7223 } else if (v > 0x7fff) {
7224 res = 0x7fff;
7225 } else {
7226 return v;
7227 }
7228
7229 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7230 float_raise(float_flag_invalid, status);
f581bf54
WN
7231 return res;
7232}
7233
e5a41ffa 7234uint_fast16_t float64_to_uint16(float64 a, float_status *status)
f581bf54
WN
7235{
7236 int64_t v;
7237 uint_fast16_t res;
7238 int old_exc_flags = get_float_exception_flags(status);
7239
ff32e16e 7240 v = float64_to_int32(a, status);
f581bf54
WN
7241 if (v < 0) {
7242 res = 0;
7243 } else if (v > 0xffff) {
7244 res = 0xffff;
7245 } else {
7246 return v;
7247 }
7248
7249 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7250 float_raise(float_flag_invalid, status);
f581bf54
WN
7251 return res;
7252}
7253
e5a41ffa 7254uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
7255{
7256 int64_t v;
5aea4c58 7257 uint_fast16_t res;
34e1c27b 7258 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7259
ff32e16e 7260 v = float64_to_int64_round_to_zero(a, status);
cbcef455
PM
7261 if (v < 0) {
7262 res = 0;
cbcef455
PM
7263 } else if (v > 0xffff) {
7264 res = 0xffff;
cbcef455 7265 } else {
34e1c27b 7266 return v;
cbcef455 7267 }
34e1c27b 7268 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7269 float_raise(float_flag_invalid, status);
cbcef455
PM
7270 return res;
7271}
7272
fb3ea83a
TM
7273/*----------------------------------------------------------------------------
7274| Returns the result of converting the double-precision floating-point value
7275| `a' to the 64-bit unsigned integer format. The conversion is
7276| performed according to the IEC/IEEE Standard for Binary Floating-Point
7277| Arithmetic---which means in particular that the conversion is rounded
7278| according to the current rounding mode. If `a' is a NaN, the largest
7279| positive integer is returned. If the conversion overflows, the
7280| largest unsigned integer is returned. If 'a' is negative, the value is
7281| rounded and zero is returned; negative values that do not round to zero
7282| will raise the inexact exception.
7283*----------------------------------------------------------------------------*/
75d62a58 7284
e5a41ffa 7285uint64_t float64_to_uint64(float64 a, float_status *status)
fb3ea83a
TM
7286{
7287 flag aSign;
7288 int_fast16_t aExp, shiftCount;
7289 uint64_t aSig, aSigExtra;
ff32e16e 7290 a = float64_squash_input_denormal(a, status);
75d62a58 7291
fb3ea83a
TM
7292 aSig = extractFloat64Frac(a);
7293 aExp = extractFloat64Exp(a);
7294 aSign = extractFloat64Sign(a);
7295 if (aSign && (aExp > 1022)) {
ff32e16e 7296 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7297 if (float64_is_any_nan(a)) {
7298 return LIT64(0xFFFFFFFFFFFFFFFF);
7299 } else {
7300 return 0;
7301 }
7302 }
7303 if (aExp) {
7304 aSig |= LIT64(0x0010000000000000);
7305 }
7306 shiftCount = 0x433 - aExp;
7307 if (shiftCount <= 0) {
7308 if (0x43E < aExp) {
ff32e16e 7309 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7310 return LIT64(0xFFFFFFFFFFFFFFFF);
7311 }
7312 aSigExtra = 0;
7313 aSig <<= -shiftCount;
7314 } else {
7315 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7316 }
ff32e16e 7317 return roundAndPackUint64(aSign, aSig, aSigExtra, status);
75d62a58
JM
7318}
7319
e5a41ffa 7320uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
75d62a58 7321{
a2f2d288 7322 signed char current_rounding_mode = status->float_rounding_mode;
ff32e16e
PM
7323 set_float_rounding_mode(float_round_to_zero, status);
7324 int64_t v = float64_to_uint64(a, status);
7325 set_float_rounding_mode(current_rounding_mode, status);
0a87a310 7326 return v;
75d62a58
JM
7327}
7328
1d6bda35 7329#define COMPARE(s, nan_exp) \
e5a41ffa
PM
7330static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7331 int is_quiet, float_status *status) \
1d6bda35
FB
7332{ \
7333 flag aSign, bSign; \
bb98fe42 7334 uint ## s ## _t av, bv; \
ff32e16e
PM
7335 a = float ## s ## _squash_input_denormal(a, status); \
7336 b = float ## s ## _squash_input_denormal(b, status); \
1d6bda35
FB
7337 \
7338 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7339 extractFloat ## s ## Frac( a ) ) || \
7340 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7341 extractFloat ## s ## Frac( b ) )) { \
7342 if (!is_quiet || \
7343 float ## s ## _is_signaling_nan( a ) || \
7344 float ## s ## _is_signaling_nan( b ) ) { \
ff32e16e 7345 float_raise(float_flag_invalid, status); \
1d6bda35
FB
7346 } \
7347 return float_relation_unordered; \
7348 } \
7349 aSign = extractFloat ## s ## Sign( a ); \
7350 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 7351 av = float ## s ## _val(a); \
cd8a2533 7352 bv = float ## s ## _val(b); \
1d6bda35 7353 if ( aSign != bSign ) { \
bb98fe42 7354 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
7355 /* zero case */ \
7356 return float_relation_equal; \
7357 } else { \
7358 return 1 - (2 * aSign); \
7359 } \
7360 } else { \
f090c9d4 7361 if (av == bv) { \
1d6bda35
FB
7362 return float_relation_equal; \
7363 } else { \
f090c9d4 7364 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
7365 } \
7366 } \
7367} \
7368 \
e5a41ffa 7369int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
1d6bda35 7370{ \
ff32e16e 7371 return float ## s ## _compare_internal(a, b, 0, status); \
1d6bda35
FB
7372} \
7373 \
e5a41ffa
PM
7374int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7375 float_status *status) \
1d6bda35 7376{ \
ff32e16e 7377 return float ## s ## _compare_internal(a, b, 1, status); \
1d6bda35
FB
7378}
7379
7380COMPARE(32, 0xff)
7381COMPARE(64, 0x7ff)
9ee6e8bb 7382
e5a41ffa
PM
7383static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7384 int is_quiet, float_status *status)
f6714d36
AJ
7385{
7386 flag aSign, bSign;
7387
7388 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7389 ( extractFloatx80Frac( a )<<1 ) ) ||
7390 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7391 ( extractFloatx80Frac( b )<<1 ) )) {
7392 if (!is_quiet ||
7393 floatx80_is_signaling_nan( a ) ||
7394 floatx80_is_signaling_nan( b ) ) {
ff32e16e 7395 float_raise(float_flag_invalid, status);
f6714d36
AJ
7396 }
7397 return float_relation_unordered;
7398 }
7399 aSign = extractFloatx80Sign( a );
7400 bSign = extractFloatx80Sign( b );
7401 if ( aSign != bSign ) {
7402
7403 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7404 ( ( a.low | b.low ) == 0 ) ) {
7405 /* zero case */
7406 return float_relation_equal;
7407 } else {
7408 return 1 - (2 * aSign);
7409 }
7410 } else {
7411 if (a.low == b.low && a.high == b.high) {
7412 return float_relation_equal;
7413 } else {
7414 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7415 }
7416 }
7417}
7418
e5a41ffa 7419int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
f6714d36 7420{
ff32e16e 7421 return floatx80_compare_internal(a, b, 0, status);
f6714d36
AJ
7422}
7423
e5a41ffa 7424int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
f6714d36 7425{
ff32e16e 7426 return floatx80_compare_internal(a, b, 1, status);
f6714d36
AJ
7427}
7428
e5a41ffa
PM
7429static inline int float128_compare_internal(float128 a, float128 b,
7430 int is_quiet, float_status *status)
1f587329
BS
7431{
7432 flag aSign, bSign;
7433
7434 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7435 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7436 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7437 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7438 if (!is_quiet ||
7439 float128_is_signaling_nan( a ) ||
7440 float128_is_signaling_nan( b ) ) {
ff32e16e 7441 float_raise(float_flag_invalid, status);
1f587329
BS
7442 }
7443 return float_relation_unordered;
7444 }
7445 aSign = extractFloat128Sign( a );
7446 bSign = extractFloat128Sign( b );
7447 if ( aSign != bSign ) {
7448 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7449 /* zero case */
7450 return float_relation_equal;
7451 } else {
7452 return 1 - (2 * aSign);
7453 }
7454 } else {
7455 if (a.low == b.low && a.high == b.high) {
7456 return float_relation_equal;
7457 } else {
7458 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7459 }
7460 }
7461}
7462
e5a41ffa 7463int float128_compare(float128 a, float128 b, float_status *status)
1f587329 7464{
ff32e16e 7465 return float128_compare_internal(a, b, 0, status);
1f587329
BS
7466}
7467
e5a41ffa 7468int float128_compare_quiet(float128 a, float128 b, float_status *status)
1f587329 7469{
ff32e16e 7470 return float128_compare_internal(a, b, 1, status);
1f587329
BS
7471}
7472
274f1b04
PM
7473/* min() and max() functions. These can't be implemented as
7474 * 'compare and pick one input' because that would mishandle
7475 * NaNs and +0 vs -0.
e17ab310
WN
7476 *
7477 * minnum() and maxnum() functions. These are similar to the min()
7478 * and max() functions but if one of the arguments is a QNaN and
7479 * the other is numerical then the numerical argument is returned.
7480 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7481 * and maxNum() operations. min() and max() are the typical min/max
7482 * semantics provided by many CPUs which predate that specification.
2d31e060
LA
7483 *
7484 * minnummag() and maxnummag() functions correspond to minNumMag()
7485 * and maxNumMag() from the IEEE-754 2008.
274f1b04 7486 */
e70614ea 7487#define MINMAX(s) \
a49db98d 7488static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
2d31e060 7489 int ismin, int isieee, \
e5a41ffa
PM
7490 int ismag, \
7491 float_status *status) \
274f1b04
PM
7492{ \
7493 flag aSign, bSign; \
2d31e060 7494 uint ## s ## _t av, bv, aav, abv; \
ff32e16e
PM
7495 a = float ## s ## _squash_input_denormal(a, status); \
7496 b = float ## s ## _squash_input_denormal(b, status); \
274f1b04
PM
7497 if (float ## s ## _is_any_nan(a) || \
7498 float ## s ## _is_any_nan(b)) { \
e17ab310
WN
7499 if (isieee) { \
7500 if (float ## s ## _is_quiet_nan(a) && \
7501 !float ## s ##_is_any_nan(b)) { \
7502 return b; \
7503 } else if (float ## s ## _is_quiet_nan(b) && \
7504 !float ## s ## _is_any_nan(a)) { \
7505 return a; \
7506 } \
7507 } \
ff32e16e 7508 return propagateFloat ## s ## NaN(a, b, status); \
274f1b04
PM
7509 } \
7510 aSign = extractFloat ## s ## Sign(a); \
7511 bSign = extractFloat ## s ## Sign(b); \
7512 av = float ## s ## _val(a); \
7513 bv = float ## s ## _val(b); \
2d31e060
LA
7514 if (ismag) { \
7515 aav = float ## s ## _abs(av); \
7516 abv = float ## s ## _abs(bv); \
7517 if (aav != abv) { \
7518 if (ismin) { \
7519 return (aav < abv) ? a : b; \
7520 } else { \
7521 return (aav < abv) ? b : a; \
7522 } \
7523 } \
7524 } \
274f1b04
PM
7525 if (aSign != bSign) { \
7526 if (ismin) { \
7527 return aSign ? a : b; \
7528 } else { \
7529 return aSign ? b : a; \
7530 } \
7531 } else { \
7532 if (ismin) { \
7533 return (aSign ^ (av < bv)) ? a : b; \
7534 } else { \
7535 return (aSign ^ (av < bv)) ? b : a; \
7536 } \
7537 } \
7538} \
7539 \
e5a41ffa
PM
7540float ## s float ## s ## _min(float ## s a, float ## s b, \
7541 float_status *status) \
274f1b04 7542{ \
ff32e16e 7543 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
274f1b04
PM
7544} \
7545 \
e5a41ffa
PM
7546float ## s float ## s ## _max(float ## s a, float ## s b, \
7547 float_status *status) \
274f1b04 7548{ \
ff32e16e 7549 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
e17ab310
WN
7550} \
7551 \
e5a41ffa
PM
7552float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7553 float_status *status) \
e17ab310 7554{ \
ff32e16e 7555 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
e17ab310
WN
7556} \
7557 \
e5a41ffa
PM
7558float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7559 float_status *status) \
e17ab310 7560{ \
ff32e16e 7561 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
2d31e060
LA
7562} \
7563 \
e5a41ffa
PM
7564float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7565 float_status *status) \
2d31e060 7566{ \
ff32e16e 7567 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
2d31e060
LA
7568} \
7569 \
e5a41ffa
PM
7570float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7571 float_status *status) \
2d31e060 7572{ \
ff32e16e 7573 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
274f1b04
PM
7574}
7575
e70614ea
WN
/*
 * Instantiate the MINMAX() template for the 32-bit and 64-bit formats.
 * Each expansion defines float<s>_min, float<s>_max, float<s>_minnum,
 * float<s>_maxnum, float<s>_minnummag and float<s>_maxnummag, all of which
 * forward to float<s>_minmax() with different (ismin, isieee, ismag) flags.
 */
7576MINMAX(32)
7577MINMAX(64)
274f1b04
PM
7578
7579
9ee6e8bb 7580/* Multiply A by 2 raised to the power N. */
e5a41ffa 7581float32 float32_scalbn(float32 a, int n, float_status *status)
9ee6e8bb
PB
7582{
7583 flag aSign;
326b9e98 7584 int16_t aExp;
bb98fe42 7585 uint32_t aSig;
9ee6e8bb 7586
ff32e16e 7587 a = float32_squash_input_denormal(a, status);
9ee6e8bb
PB
7588 aSig = extractFloat32Frac( a );
7589 aExp = extractFloat32Exp( a );
7590 aSign = extractFloat32Sign( a );
7591
7592 if ( aExp == 0xFF ) {
326b9e98 7593 if ( aSig ) {
ff32e16e 7594 return propagateFloat32NaN(a, a, status);
326b9e98 7595 }
9ee6e8bb
PB
7596 return a;
7597 }
3c85c37f 7598 if (aExp != 0) {
69397542 7599 aSig |= 0x00800000;
3c85c37f 7600 } else if (aSig == 0) {
69397542 7601 return a;
3c85c37f
PM
7602 } else {
7603 aExp++;
7604 }
69397542 7605
326b9e98
AJ
7606 if (n > 0x200) {
7607 n = 0x200;
7608 } else if (n < -0x200) {
7609 n = -0x200;
7610 }
7611
69397542
PB
7612 aExp += n - 1;
7613 aSig <<= 7;
ff32e16e 7614 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
9ee6e8bb
PB
7615}
7616
e5a41ffa 7617float64 float64_scalbn(float64 a, int n, float_status *status)
9ee6e8bb
PB
7618{
7619 flag aSign;
326b9e98 7620 int16_t aExp;
bb98fe42 7621 uint64_t aSig;
9ee6e8bb 7622
ff32e16e 7623 a = float64_squash_input_denormal(a, status);
9ee6e8bb
PB
7624 aSig = extractFloat64Frac( a );
7625 aExp = extractFloat64Exp( a );
7626 aSign = extractFloat64Sign( a );
7627
7628 if ( aExp == 0x7FF ) {
326b9e98 7629 if ( aSig ) {
ff32e16e 7630 return propagateFloat64NaN(a, a, status);
326b9e98 7631 }
9ee6e8bb
PB
7632 return a;
7633 }
3c85c37f 7634 if (aExp != 0) {
69397542 7635 aSig |= LIT64( 0x0010000000000000 );
3c85c37f 7636 } else if (aSig == 0) {
69397542 7637 return a;
3c85c37f
PM
7638 } else {
7639 aExp++;
7640 }
69397542 7641
326b9e98
AJ
7642 if (n > 0x1000) {
7643 n = 0x1000;
7644 } else if (n < -0x1000) {
7645 n = -0x1000;
7646 }
7647
69397542
PB
7648 aExp += n - 1;
7649 aSig <<= 10;
ff32e16e 7650 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
9ee6e8bb
PB
7651}
7652
e5a41ffa 7653floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
9ee6e8bb
PB
7654{
7655 flag aSign;
326b9e98 7656 int32_t aExp;
bb98fe42 7657 uint64_t aSig;
9ee6e8bb
PB
7658
7659 aSig = extractFloatx80Frac( a );
7660 aExp = extractFloatx80Exp( a );
7661 aSign = extractFloatx80Sign( a );
7662
326b9e98
AJ
7663 if ( aExp == 0x7FFF ) {
7664 if ( aSig<<1 ) {
ff32e16e 7665 return propagateFloatx80NaN(a, a, status);
326b9e98 7666 }
9ee6e8bb
PB
7667 return a;
7668 }
326b9e98 7669
3c85c37f
PM
7670 if (aExp == 0) {
7671 if (aSig == 0) {
7672 return a;
7673 }
7674 aExp++;
7675 }
69397542 7676
326b9e98
AJ
7677 if (n > 0x10000) {
7678 n = 0x10000;
7679 } else if (n < -0x10000) {
7680 n = -0x10000;
7681 }
7682
9ee6e8bb 7683 aExp += n;
a2f2d288
PM
7684 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7685 aSign, aExp, aSig, 0, status);
9ee6e8bb 7686}
9ee6e8bb 7687
e5a41ffa 7688float128 float128_scalbn(float128 a, int n, float_status *status)
9ee6e8bb
PB
7689{
7690 flag aSign;
326b9e98 7691 int32_t aExp;
bb98fe42 7692 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7693
7694 aSig1 = extractFloat128Frac1( a );
7695 aSig0 = extractFloat128Frac0( a );
7696 aExp = extractFloat128Exp( a );
7697 aSign = extractFloat128Sign( a );
7698 if ( aExp == 0x7FFF ) {
326b9e98 7699 if ( aSig0 | aSig1 ) {
ff32e16e 7700 return propagateFloat128NaN(a, a, status);
326b9e98 7701 }
9ee6e8bb
PB
7702 return a;
7703 }
3c85c37f 7704 if (aExp != 0) {
69397542 7705 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7706 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7707 return a;
3c85c37f
PM
7708 } else {
7709 aExp++;
7710 }
69397542 7711
326b9e98
AJ
7712 if (n > 0x10000) {
7713 n = 0x10000;
7714 } else if (n < -0x10000) {
7715 n = -0x10000;
7716 }
7717
69397542
PB
7718 aExp += n - 1;
7719 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
ff32e16e 7720 , status);
9ee6e8bb
PB
7721
7722}