]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
softfloat: expand out STATUS_VAR
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
16017c48
PM
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
8d725fac 16 */
158142c2 17
a7d1ac78
PM
18/*
19===============================================================================
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
158142c2
FB
22
23Written by John R. Hauser. This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704. Funding was partially provided by the
26National Science Foundation under grant MIP-9311980. The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek. More information
a7d1ac78 30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
158142c2
FB
31arithmetic/SoftFloat.html'.
32
a7d1ac78
PM
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
158142c2
FB
38
39Derivative works are acceptable, even for commercial purposes, so long as
a7d1ac78
PM
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
158142c2 43
a7d1ac78
PM
44===============================================================================
45*/
158142c2 46
16017c48
PM
47/* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78/* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
2ac8bd03
PM
82/* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85#include "config.h"
86
6b4c305c 87#include "fpu/softfloat.h"
158142c2 88
dc355b76
PM
89/* We only need stdlib for abort() */
90#include <stdlib.h>
91
158142c2
FB
92/*----------------------------------------------------------------------------
93| Primitive arithmetic functions, including multi-word arithmetic, and
94| division and square root approximations. (Can be specialized to target if
95| desired.)
96*----------------------------------------------------------------------------*/
97#include "softfloat-macros.h"
98
99/*----------------------------------------------------------------------------
100| Functions and definitions to determine: (1) whether tininess for underflow
101| is detected before or after rounding by default, (2) what (if anything)
102| happens when exceptions are raised, (3) how signaling NaNs are distinguished
103| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104| are propagated from function inputs to output. These details are target-
105| specific.
106*----------------------------------------------------------------------------*/
107#include "softfloat-specialize.h"
108
bb4d4bb3
PM
109/*----------------------------------------------------------------------------
110| Returns the fraction bits of the half-precision floating-point value `a'.
111*----------------------------------------------------------------------------*/
112
a49db98d 113static inline uint32_t extractFloat16Frac(float16 a)
bb4d4bb3
PM
114{
115 return float16_val(a) & 0x3ff;
116}
117
118/*----------------------------------------------------------------------------
119| Returns the exponent bits of the half-precision floating-point value `a'.
120*----------------------------------------------------------------------------*/
121
a49db98d 122static inline int_fast16_t extractFloat16Exp(float16 a)
bb4d4bb3
PM
123{
124 return (float16_val(a) >> 10) & 0x1f;
125}
126
127/*----------------------------------------------------------------------------
128| Returns the sign bit of the single-precision floating-point value `a'.
129*----------------------------------------------------------------------------*/
130
a49db98d 131static inline flag extractFloat16Sign(float16 a)
bb4d4bb3
PM
132{
133 return float16_val(a)>>15;
134}
135
158142c2
FB
136/*----------------------------------------------------------------------------
137| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
138| and 7, and returns the properly rounded 32-bit integer corresponding to the
139| input. If `zSign' is 1, the input is negated before being converted to an
140| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
141| is simply rounded to an integer, with the inexact exception raised if the
142| input cannot be represented exactly as an integer. However, if the fixed-
143| point input is too large, the invalid exception is raised and the largest
144| positive or negative integer is returned.
145*----------------------------------------------------------------------------*/
146
e5a41ffa 147static int32 roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
158142c2
FB
148{
149 int8 roundingMode;
150 flag roundNearestEven;
151 int8 roundIncrement, roundBits;
760e1416 152 int32_t z;
158142c2
FB
153
154 roundingMode = STATUS(float_rounding_mode);
155 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
156 switch (roundingMode) {
157 case float_round_nearest_even:
f9288a76 158 case float_round_ties_away:
dc355b76
PM
159 roundIncrement = 0x40;
160 break;
161 case float_round_to_zero:
162 roundIncrement = 0;
163 break;
164 case float_round_up:
165 roundIncrement = zSign ? 0 : 0x7f;
166 break;
167 case float_round_down:
168 roundIncrement = zSign ? 0x7f : 0;
169 break;
170 default:
171 abort();
158142c2
FB
172 }
173 roundBits = absZ & 0x7F;
174 absZ = ( absZ + roundIncrement )>>7;
175 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
176 z = absZ;
177 if ( zSign ) z = - z;
178 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
ff32e16e 179 float_raise(float_flag_invalid, status);
bb98fe42 180 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
181 }
182 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
183 return z;
184
185}
186
187/*----------------------------------------------------------------------------
188| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
189| `absZ1', with binary point between bits 63 and 64 (between the input words),
190| and returns the properly rounded 64-bit integer corresponding to the input.
191| If `zSign' is 1, the input is negated before being converted to an integer.
192| Ordinarily, the fixed-point input is simply rounded to an integer, with
193| the inexact exception raised if the input cannot be represented exactly as
194| an integer. However, if the fixed-point input is too large, the invalid
195| exception is raised and the largest positive or negative integer is
196| returned.
197*----------------------------------------------------------------------------*/
198
e5a41ffa
PM
199static int64 roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
200 float_status *status)
158142c2
FB
201{
202 int8 roundingMode;
203 flag roundNearestEven, increment;
760e1416 204 int64_t z;
158142c2
FB
205
206 roundingMode = STATUS(float_rounding_mode);
207 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
208 switch (roundingMode) {
209 case float_round_nearest_even:
f9288a76 210 case float_round_ties_away:
dc355b76
PM
211 increment = ((int64_t) absZ1 < 0);
212 break;
213 case float_round_to_zero:
214 increment = 0;
215 break;
216 case float_round_up:
217 increment = !zSign && absZ1;
218 break;
219 case float_round_down:
220 increment = zSign && absZ1;
221 break;
222 default:
223 abort();
158142c2
FB
224 }
225 if ( increment ) {
226 ++absZ0;
227 if ( absZ0 == 0 ) goto overflow;
bb98fe42 228 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
229 }
230 z = absZ0;
231 if ( zSign ) z = - z;
232 if ( z && ( ( z < 0 ) ^ zSign ) ) {
233 overflow:
ff32e16e 234 float_raise(float_flag_invalid, status);
158142c2 235 return
bb98fe42 236 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
237 : LIT64( 0x7FFFFFFFFFFFFFFF );
238 }
239 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
240 return z;
241
242}
243
fb3ea83a
TM
244/*----------------------------------------------------------------------------
245| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
246| `absZ1', with binary point between bits 63 and 64 (between the input words),
247| and returns the properly rounded 64-bit unsigned integer corresponding to the
248| input. Ordinarily, the fixed-point input is simply rounded to an integer,
249| with the inexact exception raised if the input cannot be represented exactly
250| as an integer. However, if the fixed-point input is too large, the invalid
251| exception is raised and the largest unsigned integer is returned.
252*----------------------------------------------------------------------------*/
253
254static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
e5a41ffa 255 uint64_t absZ1, float_status *status)
fb3ea83a
TM
256{
257 int8 roundingMode;
258 flag roundNearestEven, increment;
259
260 roundingMode = STATUS(float_rounding_mode);
261 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
262 switch (roundingMode) {
263 case float_round_nearest_even:
f9288a76 264 case float_round_ties_away:
dc355b76
PM
265 increment = ((int64_t)absZ1 < 0);
266 break;
267 case float_round_to_zero:
268 increment = 0;
269 break;
270 case float_round_up:
271 increment = !zSign && absZ1;
272 break;
273 case float_round_down:
274 increment = zSign && absZ1;
275 break;
276 default:
277 abort();
fb3ea83a
TM
278 }
279 if (increment) {
280 ++absZ0;
281 if (absZ0 == 0) {
ff32e16e 282 float_raise(float_flag_invalid, status);
fb3ea83a
TM
283 return LIT64(0xFFFFFFFFFFFFFFFF);
284 }
285 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
286 }
287
288 if (zSign && absZ0) {
ff32e16e 289 float_raise(float_flag_invalid, status);
fb3ea83a
TM
290 return 0;
291 }
292
293 if (absZ1) {
294 STATUS(float_exception_flags) |= float_flag_inexact;
295 }
296 return absZ0;
297}
298
158142c2
FB
299/*----------------------------------------------------------------------------
300| Returns the fraction bits of the single-precision floating-point value `a'.
301*----------------------------------------------------------------------------*/
302
a49db98d 303static inline uint32_t extractFloat32Frac( float32 a )
158142c2
FB
304{
305
f090c9d4 306 return float32_val(a) & 0x007FFFFF;
158142c2
FB
307
308}
309
310/*----------------------------------------------------------------------------
311| Returns the exponent bits of the single-precision floating-point value `a'.
312*----------------------------------------------------------------------------*/
313
a49db98d 314static inline int_fast16_t extractFloat32Exp(float32 a)
158142c2
FB
315{
316
f090c9d4 317 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
318
319}
320
321/*----------------------------------------------------------------------------
322| Returns the sign bit of the single-precision floating-point value `a'.
323*----------------------------------------------------------------------------*/
324
a49db98d 325static inline flag extractFloat32Sign( float32 a )
158142c2
FB
326{
327
f090c9d4 328 return float32_val(a)>>31;
158142c2
FB
329
330}
331
37d18660
PM
332/*----------------------------------------------------------------------------
333| If `a' is denormal and we are in flush-to-zero mode then set the
334| input-denormal exception and return zero. Otherwise just return the value.
335*----------------------------------------------------------------------------*/
e5a41ffa 336float32 float32_squash_input_denormal(float32 a, float_status *status)
37d18660
PM
337{
338 if (STATUS(flush_inputs_to_zero)) {
339 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
ff32e16e 340 float_raise(float_flag_input_denormal, status);
37d18660
PM
341 return make_float32(float32_val(a) & 0x80000000);
342 }
343 }
344 return a;
345}
346
158142c2
FB
347/*----------------------------------------------------------------------------
348| Normalizes the subnormal single-precision floating-point value represented
349| by the denormalized significand `aSig'. The normalized exponent and
350| significand are stored at the locations pointed to by `zExpPtr' and
351| `zSigPtr', respectively.
352*----------------------------------------------------------------------------*/
353
354static void
94a49d86 355 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
158142c2
FB
356{
357 int8 shiftCount;
358
359 shiftCount = countLeadingZeros32( aSig ) - 8;
360 *zSigPtr = aSig<<shiftCount;
361 *zExpPtr = 1 - shiftCount;
362
363}
364
365/*----------------------------------------------------------------------------
366| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
367| single-precision floating-point value, returning the result. After being
368| shifted into the proper positions, the three fields are simply added
369| together to form the result. This means that any integer portion of `zSig'
370| will be added into the exponent. Since a properly normalized significand
371| will have an integer portion equal to 1, the `zExp' input should be 1 less
372| than the desired result exponent whenever `zSig' is a complete, normalized
373| significand.
374*----------------------------------------------------------------------------*/
375
a49db98d 376static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
158142c2
FB
377{
378
f090c9d4 379 return make_float32(
bb98fe42 380 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
381
382}
383
384/*----------------------------------------------------------------------------
385| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
386| and significand `zSig', and returns the proper single-precision floating-
387| point value corresponding to the abstract input. Ordinarily, the abstract
388| value is simply rounded and packed into the single-precision format, with
389| the inexact exception raised if the abstract input cannot be represented
390| exactly. However, if the abstract value is too large, the overflow and
391| inexact exceptions are raised and an infinity or maximal finite value is
392| returned. If the abstract value is too small, the input value is rounded to
393| a subnormal number, and the underflow and inexact exceptions are raised if
394| the abstract input cannot be represented exactly as a subnormal single-
395| precision floating-point number.
396| The input significand `zSig' has its binary point between bits 30
397| and 29, which is 7 bits to the left of the usual location. This shifted
398| significand must be normalized or smaller. If `zSig' is not normalized,
399| `zExp' must be 0; in that case, the result returned is a subnormal number,
400| and it must not require rounding. In the usual case that `zSig' is
401| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
402| The handling of underflow and overflow follows the IEC/IEEE Standard for
403| Binary Floating-Point Arithmetic.
404*----------------------------------------------------------------------------*/
405
e5a41ffa
PM
406static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
407 float_status *status)
158142c2
FB
408{
409 int8 roundingMode;
410 flag roundNearestEven;
411 int8 roundIncrement, roundBits;
412 flag isTiny;
413
414 roundingMode = STATUS(float_rounding_mode);
415 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
416 switch (roundingMode) {
417 case float_round_nearest_even:
f9288a76 418 case float_round_ties_away:
dc355b76
PM
419 roundIncrement = 0x40;
420 break;
421 case float_round_to_zero:
422 roundIncrement = 0;
423 break;
424 case float_round_up:
425 roundIncrement = zSign ? 0 : 0x7f;
426 break;
427 case float_round_down:
428 roundIncrement = zSign ? 0x7f : 0;
429 break;
430 default:
431 abort();
432 break;
158142c2
FB
433 }
434 roundBits = zSig & 0x7F;
bb98fe42 435 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
436 if ( ( 0xFD < zExp )
437 || ( ( zExp == 0xFD )
bb98fe42 438 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 439 ) {
ff32e16e 440 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 441 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
442 }
443 if ( zExp < 0 ) {
e6afc87f 444 if (STATUS(flush_to_zero)) {
ff32e16e 445 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
446 return packFloat32(zSign, 0, 0);
447 }
158142c2
FB
448 isTiny =
449 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
450 || ( zExp < -1 )
451 || ( zSig + roundIncrement < 0x80000000 );
452 shift32RightJamming( zSig, - zExp, &zSig );
453 zExp = 0;
454 roundBits = zSig & 0x7F;
ff32e16e
PM
455 if (isTiny && roundBits) {
456 float_raise(float_flag_underflow, status);
457 }
158142c2
FB
458 }
459 }
460 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
461 zSig = ( zSig + roundIncrement )>>7;
462 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
463 if ( zSig == 0 ) zExp = 0;
464 return packFloat32( zSign, zExp, zSig );
465
466}
467
468/*----------------------------------------------------------------------------
469| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
470| and significand `zSig', and returns the proper single-precision floating-
471| point value corresponding to the abstract input. This routine is just like
472| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
473| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
474| floating-point exponent.
475*----------------------------------------------------------------------------*/
476
477static float32
e5a41ffa
PM
478 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
479 float_status *status)
158142c2
FB
480{
481 int8 shiftCount;
482
483 shiftCount = countLeadingZeros32( zSig ) - 1;
ff32e16e
PM
484 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
485 status);
158142c2
FB
486
487}
488
489/*----------------------------------------------------------------------------
490| Returns the fraction bits of the double-precision floating-point value `a'.
491*----------------------------------------------------------------------------*/
492
a49db98d 493static inline uint64_t extractFloat64Frac( float64 a )
158142c2
FB
494{
495
f090c9d4 496 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
497
498}
499
500/*----------------------------------------------------------------------------
501| Returns the exponent bits of the double-precision floating-point value `a'.
502*----------------------------------------------------------------------------*/
503
a49db98d 504static inline int_fast16_t extractFloat64Exp(float64 a)
158142c2
FB
505{
506
f090c9d4 507 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
508
509}
510
511/*----------------------------------------------------------------------------
512| Returns the sign bit of the double-precision floating-point value `a'.
513*----------------------------------------------------------------------------*/
514
a49db98d 515static inline flag extractFloat64Sign( float64 a )
158142c2
FB
516{
517
f090c9d4 518 return float64_val(a)>>63;
158142c2
FB
519
520}
521
37d18660
PM
522/*----------------------------------------------------------------------------
523| If `a' is denormal and we are in flush-to-zero mode then set the
524| input-denormal exception and return zero. Otherwise just return the value.
525*----------------------------------------------------------------------------*/
e5a41ffa 526float64 float64_squash_input_denormal(float64 a, float_status *status)
37d18660
PM
527{
528 if (STATUS(flush_inputs_to_zero)) {
529 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
ff32e16e 530 float_raise(float_flag_input_denormal, status);
37d18660
PM
531 return make_float64(float64_val(a) & (1ULL << 63));
532 }
533 }
534 return a;
535}
536
158142c2
FB
537/*----------------------------------------------------------------------------
538| Normalizes the subnormal double-precision floating-point value represented
539| by the denormalized significand `aSig'. The normalized exponent and
540| significand are stored at the locations pointed to by `zExpPtr' and
541| `zSigPtr', respectively.
542*----------------------------------------------------------------------------*/
543
544static void
94a49d86 545 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
158142c2
FB
546{
547 int8 shiftCount;
548
549 shiftCount = countLeadingZeros64( aSig ) - 11;
550 *zSigPtr = aSig<<shiftCount;
551 *zExpPtr = 1 - shiftCount;
552
553}
554
555/*----------------------------------------------------------------------------
556| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
557| double-precision floating-point value, returning the result. After being
558| shifted into the proper positions, the three fields are simply added
559| together to form the result. This means that any integer portion of `zSig'
560| will be added into the exponent. Since a properly normalized significand
561| will have an integer portion equal to 1, the `zExp' input should be 1 less
562| than the desired result exponent whenever `zSig' is a complete, normalized
563| significand.
564*----------------------------------------------------------------------------*/
565
a49db98d 566static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
158142c2
FB
567{
568
f090c9d4 569 return make_float64(
bb98fe42 570 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
571
572}
573
574/*----------------------------------------------------------------------------
575| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
576| and significand `zSig', and returns the proper double-precision floating-
577| point value corresponding to the abstract input. Ordinarily, the abstract
578| value is simply rounded and packed into the double-precision format, with
579| the inexact exception raised if the abstract input cannot be represented
580| exactly. However, if the abstract value is too large, the overflow and
581| inexact exceptions are raised and an infinity or maximal finite value is
a7d1ac78
PM
582| returned. If the abstract value is too small, the input value is rounded to
583| a subnormal number, and the underflow and inexact exceptions are raised if
584| the abstract input cannot be represented exactly as a subnormal double-
158142c2
FB
585| precision floating-point number.
586| The input significand `zSig' has its binary point between bits 62
587| and 61, which is 10 bits to the left of the usual location. This shifted
588| significand must be normalized or smaller. If `zSig' is not normalized,
589| `zExp' must be 0; in that case, the result returned is a subnormal number,
590| and it must not require rounding. In the usual case that `zSig' is
591| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
592| The handling of underflow and overflow follows the IEC/IEEE Standard for
593| Binary Floating-Point Arithmetic.
594*----------------------------------------------------------------------------*/
595
e5a41ffa
PM
596static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
597 float_status *status)
158142c2
FB
598{
599 int8 roundingMode;
600 flag roundNearestEven;
94a49d86 601 int_fast16_t roundIncrement, roundBits;
158142c2
FB
602 flag isTiny;
603
604 roundingMode = STATUS(float_rounding_mode);
605 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
606 switch (roundingMode) {
607 case float_round_nearest_even:
f9288a76 608 case float_round_ties_away:
dc355b76
PM
609 roundIncrement = 0x200;
610 break;
611 case float_round_to_zero:
612 roundIncrement = 0;
613 break;
614 case float_round_up:
615 roundIncrement = zSign ? 0 : 0x3ff;
616 break;
617 case float_round_down:
618 roundIncrement = zSign ? 0x3ff : 0;
619 break;
620 default:
621 abort();
158142c2
FB
622 }
623 roundBits = zSig & 0x3FF;
bb98fe42 624 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
625 if ( ( 0x7FD < zExp )
626 || ( ( zExp == 0x7FD )
bb98fe42 627 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 628 ) {
ff32e16e 629 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 630 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
631 }
632 if ( zExp < 0 ) {
e6afc87f 633 if (STATUS(flush_to_zero)) {
ff32e16e 634 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
635 return packFloat64(zSign, 0, 0);
636 }
158142c2
FB
637 isTiny =
638 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
639 || ( zExp < -1 )
640 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
641 shift64RightJamming( zSig, - zExp, &zSig );
642 zExp = 0;
643 roundBits = zSig & 0x3FF;
ff32e16e
PM
644 if (isTiny && roundBits) {
645 float_raise(float_flag_underflow, status);
646 }
158142c2
FB
647 }
648 }
649 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
650 zSig = ( zSig + roundIncrement )>>10;
651 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
652 if ( zSig == 0 ) zExp = 0;
653 return packFloat64( zSign, zExp, zSig );
654
655}
656
657/*----------------------------------------------------------------------------
658| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
659| and significand `zSig', and returns the proper double-precision floating-
660| point value corresponding to the abstract input. This routine is just like
661| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
662| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
663| floating-point exponent.
664*----------------------------------------------------------------------------*/
665
666static float64
e5a41ffa
PM
667 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
668 float_status *status)
158142c2
FB
669{
670 int8 shiftCount;
671
672 shiftCount = countLeadingZeros64( zSig ) - 1;
ff32e16e
PM
673 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
674 status);
158142c2
FB
675
676}
677
158142c2
FB
678/*----------------------------------------------------------------------------
679| Returns the fraction bits of the extended double-precision floating-point
680| value `a'.
681*----------------------------------------------------------------------------*/
682
a49db98d 683static inline uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
684{
685
686 return a.low;
687
688}
689
690/*----------------------------------------------------------------------------
691| Returns the exponent bits of the extended double-precision floating-point
692| value `a'.
693*----------------------------------------------------------------------------*/
694
a49db98d 695static inline int32 extractFloatx80Exp( floatx80 a )
158142c2
FB
696{
697
698 return a.high & 0x7FFF;
699
700}
701
702/*----------------------------------------------------------------------------
703| Returns the sign bit of the extended double-precision floating-point value
704| `a'.
705*----------------------------------------------------------------------------*/
706
a49db98d 707static inline flag extractFloatx80Sign( floatx80 a )
158142c2
FB
708{
709
710 return a.high>>15;
711
712}
713
714/*----------------------------------------------------------------------------
715| Normalizes the subnormal extended double-precision floating-point value
716| represented by the denormalized significand `aSig'. The normalized exponent
717| and significand are stored at the locations pointed to by `zExpPtr' and
718| `zSigPtr', respectively.
719*----------------------------------------------------------------------------*/
720
721static void
bb98fe42 722 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
723{
724 int8 shiftCount;
725
726 shiftCount = countLeadingZeros64( aSig );
727 *zSigPtr = aSig<<shiftCount;
728 *zExpPtr = 1 - shiftCount;
729
730}
731
732/*----------------------------------------------------------------------------
733| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
734| extended double-precision floating-point value, returning the result.
735*----------------------------------------------------------------------------*/
736
a49db98d 737static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
158142c2
FB
738{
739 floatx80 z;
740
741 z.low = zSig;
bb98fe42 742 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
743 return z;
744
745}
746
747/*----------------------------------------------------------------------------
748| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
749| and extended significand formed by the concatenation of `zSig0' and `zSig1',
750| and returns the proper extended double-precision floating-point value
751| corresponding to the abstract input. Ordinarily, the abstract value is
752| rounded and packed into the extended double-precision format, with the
753| inexact exception raised if the abstract input cannot be represented
754| exactly. However, if the abstract value is too large, the overflow and
755| inexact exceptions are raised and an infinity or maximal finite value is
756| returned. If the abstract value is too small, the input value is rounded to
757| a subnormal number, and the underflow and inexact exceptions are raised if
758| the abstract input cannot be represented exactly as a subnormal extended
759| double-precision floating-point number.
760| If `roundingPrecision' is 32 or 64, the result is rounded to the same
761| number of bits as single or double precision, respectively. Otherwise, the
762| result is rounded to the full precision of the extended double-precision
763| format.
764| The input significand must be normalized or smaller. If the input
765| significand is not normalized, `zExp' must be 0; in that case, the result
766| returned is a subnormal number, and it must not require rounding. The
767| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
768| Floating-Point Arithmetic.
769*----------------------------------------------------------------------------*/
770
e5a41ffa
PM
771static floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign,
772 int32 zExp, uint64_t zSig0, uint64_t zSig1,
773 float_status *status)
158142c2
FB
774{
775 int8 roundingMode;
776 flag roundNearestEven, increment, isTiny;
777 int64 roundIncrement, roundMask, roundBits;
778
779 roundingMode = STATUS(float_rounding_mode);
780 roundNearestEven = ( roundingMode == float_round_nearest_even );
781 if ( roundingPrecision == 80 ) goto precision80;
782 if ( roundingPrecision == 64 ) {
783 roundIncrement = LIT64( 0x0000000000000400 );
784 roundMask = LIT64( 0x00000000000007FF );
785 }
786 else if ( roundingPrecision == 32 ) {
787 roundIncrement = LIT64( 0x0000008000000000 );
788 roundMask = LIT64( 0x000000FFFFFFFFFF );
789 }
790 else {
791 goto precision80;
792 }
793 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
794 switch (roundingMode) {
795 case float_round_nearest_even:
f9288a76 796 case float_round_ties_away:
dc355b76
PM
797 break;
798 case float_round_to_zero:
799 roundIncrement = 0;
800 break;
801 case float_round_up:
802 roundIncrement = zSign ? 0 : roundMask;
803 break;
804 case float_round_down:
805 roundIncrement = zSign ? roundMask : 0;
806 break;
807 default:
808 abort();
158142c2
FB
809 }
810 roundBits = zSig0 & roundMask;
bb98fe42 811 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
812 if ( ( 0x7FFE < zExp )
813 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
814 ) {
815 goto overflow;
816 }
817 if ( zExp <= 0 ) {
e6afc87f 818 if (STATUS(flush_to_zero)) {
ff32e16e 819 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
820 return packFloatx80(zSign, 0, 0);
821 }
158142c2
FB
822 isTiny =
823 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
824 || ( zExp < 0 )
825 || ( zSig0 <= zSig0 + roundIncrement );
826 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
827 zExp = 0;
828 roundBits = zSig0 & roundMask;
ff32e16e
PM
829 if (isTiny && roundBits) {
830 float_raise(float_flag_underflow, status);
831 }
158142c2
FB
832 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
833 zSig0 += roundIncrement;
bb98fe42 834 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
835 roundIncrement = roundMask + 1;
836 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
837 roundMask |= roundIncrement;
838 }
839 zSig0 &= ~ roundMask;
840 return packFloatx80( zSign, zExp, zSig0 );
841 }
842 }
843 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
844 zSig0 += roundIncrement;
845 if ( zSig0 < roundIncrement ) {
846 ++zExp;
847 zSig0 = LIT64( 0x8000000000000000 );
848 }
849 roundIncrement = roundMask + 1;
850 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
851 roundMask |= roundIncrement;
852 }
853 zSig0 &= ~ roundMask;
854 if ( zSig0 == 0 ) zExp = 0;
855 return packFloatx80( zSign, zExp, zSig0 );
856 precision80:
dc355b76
PM
857 switch (roundingMode) {
858 case float_round_nearest_even:
f9288a76 859 case float_round_ties_away:
dc355b76
PM
860 increment = ((int64_t)zSig1 < 0);
861 break;
862 case float_round_to_zero:
863 increment = 0;
864 break;
865 case float_round_up:
866 increment = !zSign && zSig1;
867 break;
868 case float_round_down:
869 increment = zSign && zSig1;
870 break;
871 default:
872 abort();
158142c2 873 }
bb98fe42 874 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
875 if ( ( 0x7FFE < zExp )
876 || ( ( zExp == 0x7FFE )
877 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
878 && increment
879 )
880 ) {
881 roundMask = 0;
882 overflow:
ff32e16e 883 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
884 if ( ( roundingMode == float_round_to_zero )
885 || ( zSign && ( roundingMode == float_round_up ) )
886 || ( ! zSign && ( roundingMode == float_round_down ) )
887 ) {
888 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
889 }
890 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
891 }
892 if ( zExp <= 0 ) {
893 isTiny =
894 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
895 || ( zExp < 0 )
896 || ! increment
897 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
898 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
899 zExp = 0;
ff32e16e
PM
900 if (isTiny && zSig1) {
901 float_raise(float_flag_underflow, status);
902 }
158142c2 903 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
dc355b76
PM
904 switch (roundingMode) {
905 case float_round_nearest_even:
f9288a76 906 case float_round_ties_away:
dc355b76
PM
907 increment = ((int64_t)zSig1 < 0);
908 break;
909 case float_round_to_zero:
910 increment = 0;
911 break;
912 case float_round_up:
913 increment = !zSign && zSig1;
914 break;
915 case float_round_down:
916 increment = zSign && zSig1;
917 break;
918 default:
919 abort();
158142c2
FB
920 }
921 if ( increment ) {
922 ++zSig0;
923 zSig0 &=
bb98fe42
AF
924 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
925 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
926 }
927 return packFloatx80( zSign, zExp, zSig0 );
928 }
929 }
930 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
931 if ( increment ) {
932 ++zSig0;
933 if ( zSig0 == 0 ) {
934 ++zExp;
935 zSig0 = LIT64( 0x8000000000000000 );
936 }
937 else {
bb98fe42 938 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
939 }
940 }
941 else {
942 if ( zSig0 == 0 ) zExp = 0;
943 }
944 return packFloatx80( zSign, zExp, zSig0 );
945
946}
947
948/*----------------------------------------------------------------------------
949| Takes an abstract floating-point value having sign `zSign', exponent
950| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
951| and returns the proper extended double-precision floating-point value
952| corresponding to the abstract input. This routine is just like
953| `roundAndPackFloatx80' except that the input significand does not have to be
954| normalized.
955*----------------------------------------------------------------------------*/
956
e5a41ffa
PM
957static floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision,
958 flag zSign, int32 zExp,
959 uint64_t zSig0, uint64_t zSig1,
960 float_status *status)
158142c2
FB
961{
962 int8 shiftCount;
963
964 if ( zSig0 == 0 ) {
965 zSig0 = zSig1;
966 zSig1 = 0;
967 zExp -= 64;
968 }
969 shiftCount = countLeadingZeros64( zSig0 );
970 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
971 zExp -= shiftCount;
ff32e16e
PM
972 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
973 zSig0, zSig1, status);
158142c2
FB
974
975}
976
158142c2
FB
977/*----------------------------------------------------------------------------
978| Returns the least-significant 64 fraction bits of the quadruple-precision
979| floating-point value `a'.
980*----------------------------------------------------------------------------*/
981
a49db98d 982static inline uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
983{
984
985 return a.low;
986
987}
988
989/*----------------------------------------------------------------------------
990| Returns the most-significant 48 fraction bits of the quadruple-precision
991| floating-point value `a'.
992*----------------------------------------------------------------------------*/
993
a49db98d 994static inline uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
995{
996
997 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
998
999}
1000
1001/*----------------------------------------------------------------------------
1002| Returns the exponent bits of the quadruple-precision floating-point value
1003| `a'.
1004*----------------------------------------------------------------------------*/
1005
a49db98d 1006static inline int32 extractFloat128Exp( float128 a )
158142c2
FB
1007{
1008
1009 return ( a.high>>48 ) & 0x7FFF;
1010
1011}
1012
1013/*----------------------------------------------------------------------------
1014| Returns the sign bit of the quadruple-precision floating-point value `a'.
1015*----------------------------------------------------------------------------*/
1016
a49db98d 1017static inline flag extractFloat128Sign( float128 a )
158142c2
FB
1018{
1019
1020 return a.high>>63;
1021
1022}
1023
1024/*----------------------------------------------------------------------------
1025| Normalizes the subnormal quadruple-precision floating-point value
1026| represented by the denormalized significand formed by the concatenation of
1027| `aSig0' and `aSig1'. The normalized exponent is stored at the location
1028| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1029| significand are stored at the location pointed to by `zSig0Ptr', and the
1030| least significant 64 bits of the normalized significand are stored at the
1031| location pointed to by `zSig1Ptr'.
1032*----------------------------------------------------------------------------*/
1033
1034static void
1035 normalizeFloat128Subnormal(
bb98fe42
AF
1036 uint64_t aSig0,
1037 uint64_t aSig1,
158142c2 1038 int32 *zExpPtr,
bb98fe42
AF
1039 uint64_t *zSig0Ptr,
1040 uint64_t *zSig1Ptr
158142c2
FB
1041 )
1042{
1043 int8 shiftCount;
1044
1045 if ( aSig0 == 0 ) {
1046 shiftCount = countLeadingZeros64( aSig1 ) - 15;
1047 if ( shiftCount < 0 ) {
1048 *zSig0Ptr = aSig1>>( - shiftCount );
1049 *zSig1Ptr = aSig1<<( shiftCount & 63 );
1050 }
1051 else {
1052 *zSig0Ptr = aSig1<<shiftCount;
1053 *zSig1Ptr = 0;
1054 }
1055 *zExpPtr = - shiftCount - 63;
1056 }
1057 else {
1058 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1059 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1060 *zExpPtr = 1 - shiftCount;
1061 }
1062
1063}
1064
1065/*----------------------------------------------------------------------------
1066| Packs the sign `zSign', the exponent `zExp', and the significand formed
1067| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1068| floating-point value, returning the result. After being shifted into the
1069| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1070| added together to form the most significant 64 bits of the result. This
1071| means that any integer portion of `zSig0' will be added into the exponent.
1072| Since a properly normalized significand will have an integer portion equal
1073| to 1, the `zExp' input should be 1 less than the desired result exponent
1074| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1075| significand.
1076*----------------------------------------------------------------------------*/
1077
a49db98d 1078static inline float128
bb98fe42 1079 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
1080{
1081 float128 z;
1082
1083 z.low = zSig1;
bb98fe42 1084 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
1085 return z;
1086
1087}
1088
1089/*----------------------------------------------------------------------------
1090| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1091| and extended significand formed by the concatenation of `zSig0', `zSig1',
1092| and `zSig2', and returns the proper quadruple-precision floating-point value
1093| corresponding to the abstract input. Ordinarily, the abstract value is
1094| simply rounded and packed into the quadruple-precision format, with the
1095| inexact exception raised if the abstract input cannot be represented
1096| exactly. However, if the abstract value is too large, the overflow and
1097| inexact exceptions are raised and an infinity or maximal finite value is
1098| returned. If the abstract value is too small, the input value is rounded to
1099| a subnormal number, and the underflow and inexact exceptions are raised if
1100| the abstract input cannot be represented exactly as a subnormal quadruple-
1101| precision floating-point number.
1102| The input significand must be normalized or smaller. If the input
1103| significand is not normalized, `zExp' must be 0; in that case, the result
1104| returned is a subnormal number, and it must not require rounding. In the
1105| usual case that the input significand is normalized, `zExp' must be 1 less
1106| than the ``true'' floating-point exponent. The handling of underflow and
1107| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1108*----------------------------------------------------------------------------*/
1109
e5a41ffa
PM
1110static float128 roundAndPackFloat128(flag zSign, int32 zExp,
1111 uint64_t zSig0, uint64_t zSig1,
1112 uint64_t zSig2, float_status *status)
158142c2
FB
1113{
1114 int8 roundingMode;
1115 flag roundNearestEven, increment, isTiny;
1116
1117 roundingMode = STATUS(float_rounding_mode);
1118 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1119 switch (roundingMode) {
1120 case float_round_nearest_even:
f9288a76 1121 case float_round_ties_away:
dc355b76
PM
1122 increment = ((int64_t)zSig2 < 0);
1123 break;
1124 case float_round_to_zero:
1125 increment = 0;
1126 break;
1127 case float_round_up:
1128 increment = !zSign && zSig2;
1129 break;
1130 case float_round_down:
1131 increment = zSign && zSig2;
1132 break;
1133 default:
1134 abort();
158142c2 1135 }
bb98fe42 1136 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1137 if ( ( 0x7FFD < zExp )
1138 || ( ( zExp == 0x7FFD )
1139 && eq128(
1140 LIT64( 0x0001FFFFFFFFFFFF ),
1141 LIT64( 0xFFFFFFFFFFFFFFFF ),
1142 zSig0,
1143 zSig1
1144 )
1145 && increment
1146 )
1147 ) {
ff32e16e 1148 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
1149 if ( ( roundingMode == float_round_to_zero )
1150 || ( zSign && ( roundingMode == float_round_up ) )
1151 || ( ! zSign && ( roundingMode == float_round_down ) )
1152 ) {
1153 return
1154 packFloat128(
1155 zSign,
1156 0x7FFE,
1157 LIT64( 0x0000FFFFFFFFFFFF ),
1158 LIT64( 0xFFFFFFFFFFFFFFFF )
1159 );
1160 }
1161 return packFloat128( zSign, 0x7FFF, 0, 0 );
1162 }
1163 if ( zExp < 0 ) {
e6afc87f 1164 if (STATUS(flush_to_zero)) {
ff32e16e 1165 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1166 return packFloat128(zSign, 0, 0, 0);
1167 }
158142c2
FB
1168 isTiny =
1169 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1170 || ( zExp < -1 )
1171 || ! increment
1172 || lt128(
1173 zSig0,
1174 zSig1,
1175 LIT64( 0x0001FFFFFFFFFFFF ),
1176 LIT64( 0xFFFFFFFFFFFFFFFF )
1177 );
1178 shift128ExtraRightJamming(
1179 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1180 zExp = 0;
ff32e16e
PM
1181 if (isTiny && zSig2) {
1182 float_raise(float_flag_underflow, status);
1183 }
dc355b76
PM
1184 switch (roundingMode) {
1185 case float_round_nearest_even:
f9288a76 1186 case float_round_ties_away:
dc355b76
PM
1187 increment = ((int64_t)zSig2 < 0);
1188 break;
1189 case float_round_to_zero:
1190 increment = 0;
1191 break;
1192 case float_round_up:
1193 increment = !zSign && zSig2;
1194 break;
1195 case float_round_down:
1196 increment = zSign && zSig2;
1197 break;
1198 default:
1199 abort();
158142c2
FB
1200 }
1201 }
1202 }
1203 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1204 if ( increment ) {
1205 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1206 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1207 }
1208 else {
1209 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1210 }
1211 return packFloat128( zSign, zExp, zSig0, zSig1 );
1212
1213}
1214
1215/*----------------------------------------------------------------------------
1216| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1217| and significand formed by the concatenation of `zSig0' and `zSig1', and
1218| returns the proper quadruple-precision floating-point value corresponding
1219| to the abstract input. This routine is just like `roundAndPackFloat128'
1220| except that the input significand has fewer bits and does not have to be
1221| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1222| point exponent.
1223*----------------------------------------------------------------------------*/
1224
e5a41ffa
PM
1225static float128 normalizeRoundAndPackFloat128(flag zSign, int32 zExp,
1226 uint64_t zSig0, uint64_t zSig1,
1227 float_status *status)
158142c2
FB
1228{
1229 int8 shiftCount;
bb98fe42 1230 uint64_t zSig2;
158142c2
FB
1231
1232 if ( zSig0 == 0 ) {
1233 zSig0 = zSig1;
1234 zSig1 = 0;
1235 zExp -= 64;
1236 }
1237 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1238 if ( 0 <= shiftCount ) {
1239 zSig2 = 0;
1240 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1241 }
1242 else {
1243 shift128ExtraRightJamming(
1244 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1245 }
1246 zExp -= shiftCount;
ff32e16e 1247 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
1248
1249}
1250
158142c2
FB
1251/*----------------------------------------------------------------------------
1252| Returns the result of converting the 32-bit two's complement integer `a'
1253| to the single-precision floating-point format. The conversion is performed
1254| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255*----------------------------------------------------------------------------*/
1256
e5a41ffa 1257float32 int32_to_float32(int32_t a, float_status *status)
158142c2
FB
1258{
1259 flag zSign;
1260
f090c9d4 1261 if ( a == 0 ) return float32_zero;
bb98fe42 1262 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2 1263 zSign = ( a < 0 );
ff32e16e 1264 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
158142c2
FB
1265}
1266
1267/*----------------------------------------------------------------------------
1268| Returns the result of converting the 32-bit two's complement integer `a'
1269| to the double-precision floating-point format. The conversion is performed
1270| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1271*----------------------------------------------------------------------------*/
1272
e5a41ffa 1273float64 int32_to_float64(int32_t a, float_status *status)
158142c2
FB
1274{
1275 flag zSign;
1276 uint32 absA;
1277 int8 shiftCount;
bb98fe42 1278 uint64_t zSig;
158142c2 1279
f090c9d4 1280 if ( a == 0 ) return float64_zero;
158142c2
FB
1281 zSign = ( a < 0 );
1282 absA = zSign ? - a : a;
1283 shiftCount = countLeadingZeros32( absA ) + 21;
1284 zSig = absA;
1285 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1286
1287}
1288
158142c2
FB
1289/*----------------------------------------------------------------------------
1290| Returns the result of converting the 32-bit two's complement integer `a'
1291| to the extended double-precision floating-point format. The conversion
1292| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1293| Arithmetic.
1294*----------------------------------------------------------------------------*/
1295
e5a41ffa 1296floatx80 int32_to_floatx80(int32_t a, float_status *status)
158142c2
FB
1297{
1298 flag zSign;
1299 uint32 absA;
1300 int8 shiftCount;
bb98fe42 1301 uint64_t zSig;
158142c2
FB
1302
1303 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1304 zSign = ( a < 0 );
1305 absA = zSign ? - a : a;
1306 shiftCount = countLeadingZeros32( absA ) + 32;
1307 zSig = absA;
1308 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1309
1310}
1311
158142c2
FB
1312/*----------------------------------------------------------------------------
1313| Returns the result of converting the 32-bit two's complement integer `a' to
1314| the quadruple-precision floating-point format. The conversion is performed
1315| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1316*----------------------------------------------------------------------------*/
1317
e5a41ffa 1318float128 int32_to_float128(int32_t a, float_status *status)
158142c2
FB
1319{
1320 flag zSign;
1321 uint32 absA;
1322 int8 shiftCount;
bb98fe42 1323 uint64_t zSig0;
158142c2
FB
1324
1325 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1326 zSign = ( a < 0 );
1327 absA = zSign ? - a : a;
1328 shiftCount = countLeadingZeros32( absA ) + 17;
1329 zSig0 = absA;
1330 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1331
1332}
1333
158142c2
FB
1334/*----------------------------------------------------------------------------
1335| Returns the result of converting the 64-bit two's complement integer `a'
1336| to the single-precision floating-point format. The conversion is performed
1337| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338*----------------------------------------------------------------------------*/
1339
e5a41ffa 1340float32 int64_to_float32(int64_t a, float_status *status)
158142c2
FB
1341{
1342 flag zSign;
1343 uint64 absA;
1344 int8 shiftCount;
1345
f090c9d4 1346 if ( a == 0 ) return float32_zero;
158142c2
FB
1347 zSign = ( a < 0 );
1348 absA = zSign ? - a : a;
1349 shiftCount = countLeadingZeros64( absA ) - 40;
1350 if ( 0 <= shiftCount ) {
1351 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1352 }
1353 else {
1354 shiftCount += 7;
1355 if ( shiftCount < 0 ) {
1356 shift64RightJamming( absA, - shiftCount, &absA );
1357 }
1358 else {
1359 absA <<= shiftCount;
1360 }
ff32e16e 1361 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
158142c2
FB
1362 }
1363
1364}
1365
1366/*----------------------------------------------------------------------------
1367| Returns the result of converting the 64-bit two's complement integer `a'
1368| to the double-precision floating-point format. The conversion is performed
1369| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1370*----------------------------------------------------------------------------*/
1371
e5a41ffa 1372float64 int64_to_float64(int64_t a, float_status *status)
158142c2
FB
1373{
1374 flag zSign;
1375
f090c9d4 1376 if ( a == 0 ) return float64_zero;
bb98fe42 1377 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1378 return packFloat64( 1, 0x43E, 0 );
1379 }
1380 zSign = ( a < 0 );
ff32e16e 1381 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
158142c2
FB
1382}
1383
158142c2
FB
1384/*----------------------------------------------------------------------------
1385| Returns the result of converting the 64-bit two's complement integer `a'
1386| to the extended double-precision floating-point format. The conversion
1387| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1388| Arithmetic.
1389*----------------------------------------------------------------------------*/
1390
e5a41ffa 1391floatx80 int64_to_floatx80(int64_t a, float_status *status)
158142c2
FB
1392{
1393 flag zSign;
1394 uint64 absA;
1395 int8 shiftCount;
1396
1397 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1398 zSign = ( a < 0 );
1399 absA = zSign ? - a : a;
1400 shiftCount = countLeadingZeros64( absA );
1401 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1402
1403}
1404
158142c2
FB
1405/*----------------------------------------------------------------------------
1406| Returns the result of converting the 64-bit two's complement integer `a' to
1407| the quadruple-precision floating-point format. The conversion is performed
1408| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1409*----------------------------------------------------------------------------*/
1410
e5a41ffa 1411float128 int64_to_float128(int64_t a, float_status *status)
158142c2
FB
1412{
1413 flag zSign;
1414 uint64 absA;
1415 int8 shiftCount;
1416 int32 zExp;
bb98fe42 1417 uint64_t zSig0, zSig1;
158142c2
FB
1418
1419 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1420 zSign = ( a < 0 );
1421 absA = zSign ? - a : a;
1422 shiftCount = countLeadingZeros64( absA ) + 49;
1423 zExp = 0x406E - shiftCount;
1424 if ( 64 <= shiftCount ) {
1425 zSig1 = 0;
1426 zSig0 = absA;
1427 shiftCount -= 64;
1428 }
1429 else {
1430 zSig1 = absA;
1431 zSig0 = 0;
1432 }
1433 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1434 return packFloat128( zSign, zExp, zSig0, zSig1 );
1435
1436}
1437
6bb8e0f1
PM
1438/*----------------------------------------------------------------------------
1439| Returns the result of converting the 64-bit unsigned integer `a'
1440| to the single-precision floating-point format. The conversion is performed
1441| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1442*----------------------------------------------------------------------------*/
1443
e5a41ffa 1444float32 uint64_to_float32(uint64_t a, float_status *status)
6bb8e0f1
PM
1445{
1446 int shiftcount;
1447
1448 if (a == 0) {
1449 return float32_zero;
1450 }
1451
1452 /* Determine (left) shift needed to put first set bit into bit posn 23
1453 * (since packFloat32() expects the binary point between bits 23 and 22);
1454 * this is the fast case for smallish numbers.
1455 */
1456 shiftcount = countLeadingZeros64(a) - 40;
1457 if (shiftcount >= 0) {
1458 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1459 }
1460 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1461 * expects the binary point between bits 30 and 29, hence the + 7.
1462 */
1463 shiftcount += 7;
1464 if (shiftcount < 0) {
1465 shift64RightJamming(a, -shiftcount, &a);
1466 } else {
1467 a <<= shiftcount;
1468 }
1469
ff32e16e 1470 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
6bb8e0f1
PM
1471}
1472
1473/*----------------------------------------------------------------------------
1474| Returns the result of converting the 64-bit unsigned integer `a'
1475| to the double-precision floating-point format. The conversion is performed
1476| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1477*----------------------------------------------------------------------------*/
1478
e5a41ffa 1479float64 uint64_to_float64(uint64_t a, float_status *status)
6bb8e0f1
PM
1480{
1481 int exp = 0x43C;
1482 int shiftcount;
1483
1484 if (a == 0) {
1485 return float64_zero;
1486 }
1487
1488 shiftcount = countLeadingZeros64(a) - 1;
1489 if (shiftcount < 0) {
1490 shift64RightJamming(a, -shiftcount, &a);
1491 } else {
1492 a <<= shiftcount;
1493 }
ff32e16e 1494 return roundAndPackFloat64(0, exp - shiftcount, a, status);
6bb8e0f1
PM
1495}
1496
1497/*----------------------------------------------------------------------------
1498| Returns the result of converting the 64-bit unsigned integer `a'
1499| to the quadruple-precision floating-point format. The conversion is performed
1500| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1501*----------------------------------------------------------------------------*/
1502
e5a41ffa 1503float128 uint64_to_float128(uint64_t a, float_status *status)
1e397ead
RH
1504{
1505 if (a == 0) {
1506 return float128_zero;
1507 }
ff32e16e 1508 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1e397ead
RH
1509}
1510
158142c2
FB
1511/*----------------------------------------------------------------------------
1512| Returns the result of converting the single-precision floating-point value
1513| `a' to the 32-bit two's complement integer format. The conversion is
1514| performed according to the IEC/IEEE Standard for Binary Floating-Point
1515| Arithmetic---which means in particular that the conversion is rounded
1516| according to the current rounding mode. If `a' is a NaN, the largest
1517| positive integer is returned. Otherwise, if the conversion overflows, the
1518| largest integer with the same sign as `a' is returned.
1519*----------------------------------------------------------------------------*/
1520
e5a41ffa 1521int32 float32_to_int32(float32 a, float_status *status)
158142c2
FB
1522{
1523 flag aSign;
94a49d86 1524 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1525 uint32_t aSig;
1526 uint64_t aSig64;
158142c2 1527
ff32e16e 1528 a = float32_squash_input_denormal(a, status);
158142c2
FB
1529 aSig = extractFloat32Frac( a );
1530 aExp = extractFloat32Exp( a );
1531 aSign = extractFloat32Sign( a );
1532 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1533 if ( aExp ) aSig |= 0x00800000;
1534 shiftCount = 0xAF - aExp;
1535 aSig64 = aSig;
1536 aSig64 <<= 32;
1537 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
ff32e16e 1538 return roundAndPackInt32(aSign, aSig64, status);
158142c2
FB
1539
1540}
1541
1542/*----------------------------------------------------------------------------
1543| Returns the result of converting the single-precision floating-point value
1544| `a' to the 32-bit two's complement integer format. The conversion is
1545| performed according to the IEC/IEEE Standard for Binary Floating-Point
1546| Arithmetic, except that the conversion is always rounded toward zero.
1547| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1548| the conversion overflows, the largest integer with the same sign as `a' is
1549| returned.
1550*----------------------------------------------------------------------------*/
1551
e5a41ffa 1552int32 float32_to_int32_round_to_zero(float32 a, float_status *status)
158142c2
FB
1553{
1554 flag aSign;
94a49d86 1555 int_fast16_t aExp, shiftCount;
bb98fe42 1556 uint32_t aSig;
b3a6a2e0 1557 int32_t z;
ff32e16e 1558 a = float32_squash_input_denormal(a, status);
158142c2
FB
1559
1560 aSig = extractFloat32Frac( a );
1561 aExp = extractFloat32Exp( a );
1562 aSign = extractFloat32Sign( a );
1563 shiftCount = aExp - 0x9E;
1564 if ( 0 <= shiftCount ) {
f090c9d4 1565 if ( float32_val(a) != 0xCF000000 ) {
ff32e16e 1566 float_raise(float_flag_invalid, status);
158142c2
FB
1567 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1568 }
bb98fe42 1569 return (int32_t) 0x80000000;
158142c2
FB
1570 }
1571 else if ( aExp <= 0x7E ) {
1572 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1573 return 0;
1574 }
1575 aSig = ( aSig | 0x00800000 )<<8;
1576 z = aSig>>( - shiftCount );
bb98fe42 1577 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
158142c2
FB
1578 STATUS(float_exception_flags) |= float_flag_inexact;
1579 }
1580 if ( aSign ) z = - z;
1581 return z;
1582
1583}
1584
cbcef455
PM
1585/*----------------------------------------------------------------------------
1586| Returns the result of converting the single-precision floating-point value
1587| `a' to the 16-bit two's complement integer format. The conversion is
1588| performed according to the IEC/IEEE Standard for Binary Floating-Point
1589| Arithmetic, except that the conversion is always rounded toward zero.
1590| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1591| the conversion overflows, the largest integer with the same sign as `a' is
1592| returned.
1593*----------------------------------------------------------------------------*/
1594
e5a41ffa 1595int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
1596{
1597 flag aSign;
94a49d86 1598 int_fast16_t aExp, shiftCount;
bb98fe42 1599 uint32_t aSig;
cbcef455
PM
1600 int32 z;
1601
1602 aSig = extractFloat32Frac( a );
1603 aExp = extractFloat32Exp( a );
1604 aSign = extractFloat32Sign( a );
1605 shiftCount = aExp - 0x8E;
1606 if ( 0 <= shiftCount ) {
1607 if ( float32_val(a) != 0xC7000000 ) {
ff32e16e 1608 float_raise(float_flag_invalid, status);
cbcef455
PM
1609 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1610 return 0x7FFF;
1611 }
1612 }
bb98fe42 1613 return (int32_t) 0xffff8000;
cbcef455
PM
1614 }
1615 else if ( aExp <= 0x7E ) {
1616 if ( aExp | aSig ) {
1617 STATUS(float_exception_flags) |= float_flag_inexact;
1618 }
1619 return 0;
1620 }
1621 shiftCount -= 0x10;
1622 aSig = ( aSig | 0x00800000 )<<8;
1623 z = aSig>>( - shiftCount );
bb98fe42 1624 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
cbcef455
PM
1625 STATUS(float_exception_flags) |= float_flag_inexact;
1626 }
1627 if ( aSign ) {
1628 z = - z;
1629 }
1630 return z;
1631
1632}
1633
158142c2
FB
1634/*----------------------------------------------------------------------------
1635| Returns the result of converting the single-precision floating-point value
1636| `a' to the 64-bit two's complement integer format. The conversion is
1637| performed according to the IEC/IEEE Standard for Binary Floating-Point
1638| Arithmetic---which means in particular that the conversion is rounded
1639| according to the current rounding mode. If `a' is a NaN, the largest
1640| positive integer is returned. Otherwise, if the conversion overflows, the
1641| largest integer with the same sign as `a' is returned.
1642*----------------------------------------------------------------------------*/
1643
e5a41ffa 1644int64 float32_to_int64(float32 a, float_status *status)
158142c2
FB
1645{
1646 flag aSign;
94a49d86 1647 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1648 uint32_t aSig;
1649 uint64_t aSig64, aSigExtra;
ff32e16e 1650 a = float32_squash_input_denormal(a, status);
158142c2
FB
1651
1652 aSig = extractFloat32Frac( a );
1653 aExp = extractFloat32Exp( a );
1654 aSign = extractFloat32Sign( a );
1655 shiftCount = 0xBE - aExp;
1656 if ( shiftCount < 0 ) {
ff32e16e 1657 float_raise(float_flag_invalid, status);
158142c2
FB
1658 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1659 return LIT64( 0x7FFFFFFFFFFFFFFF );
1660 }
bb98fe42 1661 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1662 }
1663 if ( aExp ) aSig |= 0x00800000;
1664 aSig64 = aSig;
1665 aSig64 <<= 40;
1666 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
ff32e16e 1667 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
158142c2
FB
1668
1669}
1670
2f18bbf9
TM
1671/*----------------------------------------------------------------------------
1672| Returns the result of converting the single-precision floating-point value
1673| `a' to the 64-bit unsigned integer format. The conversion is
1674| performed according to the IEC/IEEE Standard for Binary Floating-Point
1675| Arithmetic---which means in particular that the conversion is rounded
1676| according to the current rounding mode. If `a' is a NaN, the largest
1677| unsigned integer is returned. Otherwise, if the conversion overflows, the
1678| largest unsigned integer is returned. If the 'a' is negative, the result
1679| is rounded and zero is returned; values that do not round to zero will
1680| raise the inexact exception flag.
1681*----------------------------------------------------------------------------*/
1682
e5a41ffa 1683uint64 float32_to_uint64(float32 a, float_status *status)
2f18bbf9
TM
1684{
1685 flag aSign;
1686 int_fast16_t aExp, shiftCount;
1687 uint32_t aSig;
1688 uint64_t aSig64, aSigExtra;
ff32e16e 1689 a = float32_squash_input_denormal(a, status);
2f18bbf9
TM
1690
1691 aSig = extractFloat32Frac(a);
1692 aExp = extractFloat32Exp(a);
1693 aSign = extractFloat32Sign(a);
1694 if ((aSign) && (aExp > 126)) {
ff32e16e 1695 float_raise(float_flag_invalid, status);
2f18bbf9
TM
1696 if (float32_is_any_nan(a)) {
1697 return LIT64(0xFFFFFFFFFFFFFFFF);
1698 } else {
1699 return 0;
1700 }
1701 }
1702 shiftCount = 0xBE - aExp;
1703 if (aExp) {
1704 aSig |= 0x00800000;
1705 }
1706 if (shiftCount < 0) {
ff32e16e 1707 float_raise(float_flag_invalid, status);
2f18bbf9
TM
1708 return LIT64(0xFFFFFFFFFFFFFFFF);
1709 }
1710
1711 aSig64 = aSig;
1712 aSig64 <<= 40;
1713 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
ff32e16e 1714 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
2f18bbf9
TM
1715}
1716
a13d4489
TM
1717/*----------------------------------------------------------------------------
1718| Returns the result of converting the single-precision floating-point value
1719| `a' to the 64-bit unsigned integer format. The conversion is
1720| performed according to the IEC/IEEE Standard for Binary Floating-Point
1721| Arithmetic, except that the conversion is always rounded toward zero. If
1722| `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1723| conversion overflows, the largest unsigned integer is returned. If the
1724| 'a' is negative, the result is rounded and zero is returned; values that do
1725| not round to zero will raise the inexact flag.
1726*----------------------------------------------------------------------------*/
1727
e5a41ffa 1728uint64 float32_to_uint64_round_to_zero(float32 a, float_status *status)
a13d4489
TM
1729{
1730 signed char current_rounding_mode = STATUS(float_rounding_mode);
ff32e16e
PM
1731 set_float_rounding_mode(float_round_to_zero, status);
1732 int64_t v = float32_to_uint64(a, status);
1733 set_float_rounding_mode(current_rounding_mode, status);
a13d4489
TM
1734 return v;
1735}
1736
158142c2
FB
1737/*----------------------------------------------------------------------------
1738| Returns the result of converting the single-precision floating-point value
1739| `a' to the 64-bit two's complement integer format. The conversion is
1740| performed according to the IEC/IEEE Standard for Binary Floating-Point
1741| Arithmetic, except that the conversion is always rounded toward zero. If
1742| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1743| conversion overflows, the largest integer with the same sign as `a' is
1744| returned.
1745*----------------------------------------------------------------------------*/
1746
e5a41ffa 1747int64 float32_to_int64_round_to_zero(float32 a, float_status *status)
158142c2
FB
1748{
1749 flag aSign;
94a49d86 1750 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1751 uint32_t aSig;
1752 uint64_t aSig64;
158142c2 1753 int64 z;
ff32e16e 1754 a = float32_squash_input_denormal(a, status);
158142c2
FB
1755
1756 aSig = extractFloat32Frac( a );
1757 aExp = extractFloat32Exp( a );
1758 aSign = extractFloat32Sign( a );
1759 shiftCount = aExp - 0xBE;
1760 if ( 0 <= shiftCount ) {
f090c9d4 1761 if ( float32_val(a) != 0xDF000000 ) {
ff32e16e 1762 float_raise(float_flag_invalid, status);
158142c2
FB
1763 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1764 return LIT64( 0x7FFFFFFFFFFFFFFF );
1765 }
1766 }
bb98fe42 1767 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1768 }
1769 else if ( aExp <= 0x7E ) {
1770 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1771 return 0;
1772 }
1773 aSig64 = aSig | 0x00800000;
1774 aSig64 <<= 40;
1775 z = aSig64>>( - shiftCount );
bb98fe42 1776 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
158142c2
FB
1777 STATUS(float_exception_flags) |= float_flag_inexact;
1778 }
1779 if ( aSign ) z = - z;
1780 return z;
1781
1782}
1783
1784/*----------------------------------------------------------------------------
1785| Returns the result of converting the single-precision floating-point value
1786| `a' to the double-precision floating-point format. The conversion is
1787| performed according to the IEC/IEEE Standard for Binary Floating-Point
1788| Arithmetic.
1789*----------------------------------------------------------------------------*/
1790
e5a41ffa 1791float64 float32_to_float64(float32 a, float_status *status)
158142c2
FB
1792{
1793 flag aSign;
94a49d86 1794 int_fast16_t aExp;
bb98fe42 1795 uint32_t aSig;
ff32e16e 1796 a = float32_squash_input_denormal(a, status);
158142c2
FB
1797
1798 aSig = extractFloat32Frac( a );
1799 aExp = extractFloat32Exp( a );
1800 aSign = extractFloat32Sign( a );
1801 if ( aExp == 0xFF ) {
ff32e16e
PM
1802 if (aSig) {
1803 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1804 }
158142c2
FB
1805 return packFloat64( aSign, 0x7FF, 0 );
1806 }
1807 if ( aExp == 0 ) {
1808 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1809 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1810 --aExp;
1811 }
bb98fe42 1812 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1813
1814}
1815
158142c2
FB
1816/*----------------------------------------------------------------------------
1817| Returns the result of converting the single-precision floating-point value
1818| `a' to the extended double-precision floating-point format. The conversion
1819| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1820| Arithmetic.
1821*----------------------------------------------------------------------------*/
1822
e5a41ffa 1823floatx80 float32_to_floatx80(float32 a, float_status *status)
158142c2
FB
1824{
1825 flag aSign;
94a49d86 1826 int_fast16_t aExp;
bb98fe42 1827 uint32_t aSig;
158142c2 1828
ff32e16e 1829 a = float32_squash_input_denormal(a, status);
158142c2
FB
1830 aSig = extractFloat32Frac( a );
1831 aExp = extractFloat32Exp( a );
1832 aSign = extractFloat32Sign( a );
1833 if ( aExp == 0xFF ) {
ff32e16e
PM
1834 if (aSig) {
1835 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1836 }
158142c2
FB
1837 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1838 }
1839 if ( aExp == 0 ) {
1840 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1841 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1842 }
1843 aSig |= 0x00800000;
bb98fe42 1844 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1845
1846}
1847
158142c2
FB
1848/*----------------------------------------------------------------------------
1849| Returns the result of converting the single-precision floating-point value
1850| `a' to the double-precision floating-point format. The conversion is
1851| performed according to the IEC/IEEE Standard for Binary Floating-Point
1852| Arithmetic.
1853*----------------------------------------------------------------------------*/
1854
e5a41ffa 1855float128 float32_to_float128(float32 a, float_status *status)
158142c2
FB
1856{
1857 flag aSign;
94a49d86 1858 int_fast16_t aExp;
bb98fe42 1859 uint32_t aSig;
158142c2 1860
ff32e16e 1861 a = float32_squash_input_denormal(a, status);
158142c2
FB
1862 aSig = extractFloat32Frac( a );
1863 aExp = extractFloat32Exp( a );
1864 aSign = extractFloat32Sign( a );
1865 if ( aExp == 0xFF ) {
ff32e16e
PM
1866 if (aSig) {
1867 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1868 }
158142c2
FB
1869 return packFloat128( aSign, 0x7FFF, 0, 0 );
1870 }
1871 if ( aExp == 0 ) {
1872 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1873 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1874 --aExp;
1875 }
bb98fe42 1876 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1877
1878}
1879
158142c2
FB
1880/*----------------------------------------------------------------------------
1881| Rounds the single-precision floating-point value `a' to an integer, and
1882| returns the result as a single-precision floating-point value. The
1883| operation is performed according to the IEC/IEEE Standard for Binary
1884| Floating-Point Arithmetic.
1885*----------------------------------------------------------------------------*/
1886
e5a41ffa 1887float32 float32_round_to_int(float32 a, float_status *status)
158142c2
FB
1888{
1889 flag aSign;
94a49d86 1890 int_fast16_t aExp;
bb98fe42 1891 uint32_t lastBitMask, roundBitsMask;
bb98fe42 1892 uint32_t z;
ff32e16e 1893 a = float32_squash_input_denormal(a, status);
158142c2
FB
1894
1895 aExp = extractFloat32Exp( a );
1896 if ( 0x96 <= aExp ) {
1897 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
ff32e16e 1898 return propagateFloat32NaN(a, a, status);
158142c2
FB
1899 }
1900 return a;
1901 }
1902 if ( aExp <= 0x7E ) {
bb98fe42 1903 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
158142c2
FB
1904 STATUS(float_exception_flags) |= float_flag_inexact;
1905 aSign = extractFloat32Sign( a );
1906 switch ( STATUS(float_rounding_mode) ) {
1907 case float_round_nearest_even:
1908 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1909 return packFloat32( aSign, 0x7F, 0 );
1910 }
1911 break;
f9288a76
PM
1912 case float_round_ties_away:
1913 if (aExp == 0x7E) {
1914 return packFloat32(aSign, 0x7F, 0);
1915 }
1916 break;
158142c2 1917 case float_round_down:
f090c9d4 1918 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1919 case float_round_up:
f090c9d4 1920 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1921 }
1922 return packFloat32( aSign, 0, 0 );
1923 }
1924 lastBitMask = 1;
1925 lastBitMask <<= 0x96 - aExp;
1926 roundBitsMask = lastBitMask - 1;
f090c9d4 1927 z = float32_val(a);
dc355b76
PM
1928 switch (STATUS(float_rounding_mode)) {
1929 case float_round_nearest_even:
158142c2 1930 z += lastBitMask>>1;
dc355b76
PM
1931 if ((z & roundBitsMask) == 0) {
1932 z &= ~lastBitMask;
1933 }
1934 break;
f9288a76
PM
1935 case float_round_ties_away:
1936 z += lastBitMask >> 1;
1937 break;
dc355b76
PM
1938 case float_round_to_zero:
1939 break;
1940 case float_round_up:
1941 if (!extractFloat32Sign(make_float32(z))) {
1942 z += roundBitsMask;
1943 }
1944 break;
1945 case float_round_down:
1946 if (extractFloat32Sign(make_float32(z))) {
158142c2
FB
1947 z += roundBitsMask;
1948 }
dc355b76
PM
1949 break;
1950 default:
1951 abort();
158142c2
FB
1952 }
1953 z &= ~ roundBitsMask;
f090c9d4
PB
1954 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1955 return make_float32(z);
158142c2
FB
1956
1957}
1958
1959/*----------------------------------------------------------------------------
1960| Returns the result of adding the absolute values of the single-precision
1961| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1962| before being returned. `zSign' is ignored if the result is a NaN.
1963| The addition is performed according to the IEC/IEEE Standard for Binary
1964| Floating-Point Arithmetic.
1965*----------------------------------------------------------------------------*/
1966
e5a41ffa
PM
1967static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
1968 float_status *status)
158142c2 1969{
94a49d86 1970 int_fast16_t aExp, bExp, zExp;
bb98fe42 1971 uint32_t aSig, bSig, zSig;
94a49d86 1972 int_fast16_t expDiff;
158142c2
FB
1973
1974 aSig = extractFloat32Frac( a );
1975 aExp = extractFloat32Exp( a );
1976 bSig = extractFloat32Frac( b );
1977 bExp = extractFloat32Exp( b );
1978 expDiff = aExp - bExp;
1979 aSig <<= 6;
1980 bSig <<= 6;
1981 if ( 0 < expDiff ) {
1982 if ( aExp == 0xFF ) {
ff32e16e
PM
1983 if (aSig) {
1984 return propagateFloat32NaN(a, b, status);
1985 }
158142c2
FB
1986 return a;
1987 }
1988 if ( bExp == 0 ) {
1989 --expDiff;
1990 }
1991 else {
1992 bSig |= 0x20000000;
1993 }
1994 shift32RightJamming( bSig, expDiff, &bSig );
1995 zExp = aExp;
1996 }
1997 else if ( expDiff < 0 ) {
1998 if ( bExp == 0xFF ) {
ff32e16e
PM
1999 if (bSig) {
2000 return propagateFloat32NaN(a, b, status);
2001 }
158142c2
FB
2002 return packFloat32( zSign, 0xFF, 0 );
2003 }
2004 if ( aExp == 0 ) {
2005 ++expDiff;
2006 }
2007 else {
2008 aSig |= 0x20000000;
2009 }
2010 shift32RightJamming( aSig, - expDiff, &aSig );
2011 zExp = bExp;
2012 }
2013 else {
2014 if ( aExp == 0xFF ) {
ff32e16e
PM
2015 if (aSig | bSig) {
2016 return propagateFloat32NaN(a, b, status);
2017 }
158142c2
FB
2018 return a;
2019 }
fe76d976 2020 if ( aExp == 0 ) {
e6afc87f
PM
2021 if (STATUS(flush_to_zero)) {
2022 if (aSig | bSig) {
ff32e16e 2023 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2024 }
2025 return packFloat32(zSign, 0, 0);
2026 }
fe76d976
PB
2027 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2028 }
158142c2
FB
2029 zSig = 0x40000000 + aSig + bSig;
2030 zExp = aExp;
2031 goto roundAndPack;
2032 }
2033 aSig |= 0x20000000;
2034 zSig = ( aSig + bSig )<<1;
2035 --zExp;
bb98fe42 2036 if ( (int32_t) zSig < 0 ) {
158142c2
FB
2037 zSig = aSig + bSig;
2038 ++zExp;
2039 }
2040 roundAndPack:
ff32e16e 2041 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2042
2043}
2044
2045/*----------------------------------------------------------------------------
2046| Returns the result of subtracting the absolute values of the single-
2047| precision floating-point values `a' and `b'. If `zSign' is 1, the
2048| difference is negated before being returned. `zSign' is ignored if the
2049| result is a NaN. The subtraction is performed according to the IEC/IEEE
2050| Standard for Binary Floating-Point Arithmetic.
2051*----------------------------------------------------------------------------*/
2052
e5a41ffa
PM
2053static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2054 float_status *status)
158142c2 2055{
94a49d86 2056 int_fast16_t aExp, bExp, zExp;
bb98fe42 2057 uint32_t aSig, bSig, zSig;
94a49d86 2058 int_fast16_t expDiff;
158142c2
FB
2059
2060 aSig = extractFloat32Frac( a );
2061 aExp = extractFloat32Exp( a );
2062 bSig = extractFloat32Frac( b );
2063 bExp = extractFloat32Exp( b );
2064 expDiff = aExp - bExp;
2065 aSig <<= 7;
2066 bSig <<= 7;
2067 if ( 0 < expDiff ) goto aExpBigger;
2068 if ( expDiff < 0 ) goto bExpBigger;
2069 if ( aExp == 0xFF ) {
ff32e16e
PM
2070 if (aSig | bSig) {
2071 return propagateFloat32NaN(a, b, status);
2072 }
2073 float_raise(float_flag_invalid, status);
158142c2
FB
2074 return float32_default_nan;
2075 }
2076 if ( aExp == 0 ) {
2077 aExp = 1;
2078 bExp = 1;
2079 }
2080 if ( bSig < aSig ) goto aBigger;
2081 if ( aSig < bSig ) goto bBigger;
2082 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2083 bExpBigger:
2084 if ( bExp == 0xFF ) {
ff32e16e
PM
2085 if (bSig) {
2086 return propagateFloat32NaN(a, b, status);
2087 }
158142c2
FB
2088 return packFloat32( zSign ^ 1, 0xFF, 0 );
2089 }
2090 if ( aExp == 0 ) {
2091 ++expDiff;
2092 }
2093 else {
2094 aSig |= 0x40000000;
2095 }
2096 shift32RightJamming( aSig, - expDiff, &aSig );
2097 bSig |= 0x40000000;
2098 bBigger:
2099 zSig = bSig - aSig;
2100 zExp = bExp;
2101 zSign ^= 1;
2102 goto normalizeRoundAndPack;
2103 aExpBigger:
2104 if ( aExp == 0xFF ) {
ff32e16e
PM
2105 if (aSig) {
2106 return propagateFloat32NaN(a, b, status);
2107 }
158142c2
FB
2108 return a;
2109 }
2110 if ( bExp == 0 ) {
2111 --expDiff;
2112 }
2113 else {
2114 bSig |= 0x40000000;
2115 }
2116 shift32RightJamming( bSig, expDiff, &bSig );
2117 aSig |= 0x40000000;
2118 aBigger:
2119 zSig = aSig - bSig;
2120 zExp = aExp;
2121 normalizeRoundAndPack:
2122 --zExp;
ff32e16e 2123 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2124
2125}
2126
2127/*----------------------------------------------------------------------------
2128| Returns the result of adding the single-precision floating-point values `a'
2129| and `b'. The operation is performed according to the IEC/IEEE Standard for
2130| Binary Floating-Point Arithmetic.
2131*----------------------------------------------------------------------------*/
2132
e5a41ffa 2133float32 float32_add(float32 a, float32 b, float_status *status)
158142c2
FB
2134{
2135 flag aSign, bSign;
ff32e16e
PM
2136 a = float32_squash_input_denormal(a, status);
2137 b = float32_squash_input_denormal(b, status);
158142c2
FB
2138
2139 aSign = extractFloat32Sign( a );
2140 bSign = extractFloat32Sign( b );
2141 if ( aSign == bSign ) {
ff32e16e 2142 return addFloat32Sigs(a, b, aSign, status);
158142c2
FB
2143 }
2144 else {
ff32e16e 2145 return subFloat32Sigs(a, b, aSign, status);
158142c2
FB
2146 }
2147
2148}
2149
2150/*----------------------------------------------------------------------------
2151| Returns the result of subtracting the single-precision floating-point values
2152| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2153| for Binary Floating-Point Arithmetic.
2154*----------------------------------------------------------------------------*/
2155
e5a41ffa 2156float32 float32_sub(float32 a, float32 b, float_status *status)
158142c2
FB
2157{
2158 flag aSign, bSign;
ff32e16e
PM
2159 a = float32_squash_input_denormal(a, status);
2160 b = float32_squash_input_denormal(b, status);
158142c2
FB
2161
2162 aSign = extractFloat32Sign( a );
2163 bSign = extractFloat32Sign( b );
2164 if ( aSign == bSign ) {
ff32e16e 2165 return subFloat32Sigs(a, b, aSign, status);
158142c2
FB
2166 }
2167 else {
ff32e16e 2168 return addFloat32Sigs(a, b, aSign, status);
158142c2
FB
2169 }
2170
2171}
2172
2173/*----------------------------------------------------------------------------
2174| Returns the result of multiplying the single-precision floating-point values
2175| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2176| for Binary Floating-Point Arithmetic.
2177*----------------------------------------------------------------------------*/
2178
e5a41ffa 2179float32 float32_mul(float32 a, float32 b, float_status *status)
158142c2
FB
2180{
2181 flag aSign, bSign, zSign;
94a49d86 2182 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
2183 uint32_t aSig, bSig;
2184 uint64_t zSig64;
2185 uint32_t zSig;
158142c2 2186
ff32e16e
PM
2187 a = float32_squash_input_denormal(a, status);
2188 b = float32_squash_input_denormal(b, status);
37d18660 2189
158142c2
FB
2190 aSig = extractFloat32Frac( a );
2191 aExp = extractFloat32Exp( a );
2192 aSign = extractFloat32Sign( a );
2193 bSig = extractFloat32Frac( b );
2194 bExp = extractFloat32Exp( b );
2195 bSign = extractFloat32Sign( b );
2196 zSign = aSign ^ bSign;
2197 if ( aExp == 0xFF ) {
2198 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 2199 return propagateFloat32NaN(a, b, status);
158142c2
FB
2200 }
2201 if ( ( bExp | bSig ) == 0 ) {
ff32e16e 2202 float_raise(float_flag_invalid, status);
158142c2
FB
2203 return float32_default_nan;
2204 }
2205 return packFloat32( zSign, 0xFF, 0 );
2206 }
2207 if ( bExp == 0xFF ) {
ff32e16e
PM
2208 if (bSig) {
2209 return propagateFloat32NaN(a, b, status);
2210 }
158142c2 2211 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 2212 float_raise(float_flag_invalid, status);
158142c2
FB
2213 return float32_default_nan;
2214 }
2215 return packFloat32( zSign, 0xFF, 0 );
2216 }
2217 if ( aExp == 0 ) {
2218 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2219 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2220 }
2221 if ( bExp == 0 ) {
2222 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2223 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2224 }
2225 zExp = aExp + bExp - 0x7F;
2226 aSig = ( aSig | 0x00800000 )<<7;
2227 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 2228 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 2229 zSig = zSig64;
bb98fe42 2230 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
2231 zSig <<= 1;
2232 --zExp;
2233 }
ff32e16e 2234 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2235
2236}
2237
2238/*----------------------------------------------------------------------------
2239| Returns the result of dividing the single-precision floating-point value `a'
2240| by the corresponding value `b'. The operation is performed according to the
2241| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2242*----------------------------------------------------------------------------*/
2243
e5a41ffa 2244float32 float32_div(float32 a, float32 b, float_status *status)
158142c2
FB
2245{
2246 flag aSign, bSign, zSign;
94a49d86 2247 int_fast16_t aExp, bExp, zExp;
bb98fe42 2248 uint32_t aSig, bSig, zSig;
ff32e16e
PM
2249 a = float32_squash_input_denormal(a, status);
2250 b = float32_squash_input_denormal(b, status);
158142c2
FB
2251
2252 aSig = extractFloat32Frac( a );
2253 aExp = extractFloat32Exp( a );
2254 aSign = extractFloat32Sign( a );
2255 bSig = extractFloat32Frac( b );
2256 bExp = extractFloat32Exp( b );
2257 bSign = extractFloat32Sign( b );
2258 zSign = aSign ^ bSign;
2259 if ( aExp == 0xFF ) {
ff32e16e
PM
2260 if (aSig) {
2261 return propagateFloat32NaN(a, b, status);
2262 }
158142c2 2263 if ( bExp == 0xFF ) {
ff32e16e
PM
2264 if (bSig) {
2265 return propagateFloat32NaN(a, b, status);
2266 }
2267 float_raise(float_flag_invalid, status);
158142c2
FB
2268 return float32_default_nan;
2269 }
2270 return packFloat32( zSign, 0xFF, 0 );
2271 }
2272 if ( bExp == 0xFF ) {
ff32e16e
PM
2273 if (bSig) {
2274 return propagateFloat32NaN(a, b, status);
2275 }
158142c2
FB
2276 return packFloat32( zSign, 0, 0 );
2277 }
2278 if ( bExp == 0 ) {
2279 if ( bSig == 0 ) {
2280 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 2281 float_raise(float_flag_invalid, status);
158142c2
FB
2282 return float32_default_nan;
2283 }
ff32e16e 2284 float_raise(float_flag_divbyzero, status);
158142c2
FB
2285 return packFloat32( zSign, 0xFF, 0 );
2286 }
2287 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2288 }
2289 if ( aExp == 0 ) {
2290 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2291 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2292 }
2293 zExp = aExp - bExp + 0x7D;
2294 aSig = ( aSig | 0x00800000 )<<7;
2295 bSig = ( bSig | 0x00800000 )<<8;
2296 if ( bSig <= ( aSig + aSig ) ) {
2297 aSig >>= 1;
2298 ++zExp;
2299 }
bb98fe42 2300 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2301 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2302 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2 2303 }
ff32e16e 2304 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2305
2306}
2307
2308/*----------------------------------------------------------------------------
2309| Returns the remainder of the single-precision floating-point value `a'
2310| with respect to the corresponding value `b'. The operation is performed
2311| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2312*----------------------------------------------------------------------------*/
2313
e5a41ffa 2314float32 float32_rem(float32 a, float32 b, float_status *status)
158142c2 2315{
ed086f3d 2316 flag aSign, zSign;
94a49d86 2317 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
2318 uint32_t aSig, bSig;
2319 uint32_t q;
2320 uint64_t aSig64, bSig64, q64;
2321 uint32_t alternateASig;
2322 int32_t sigMean;
ff32e16e
PM
2323 a = float32_squash_input_denormal(a, status);
2324 b = float32_squash_input_denormal(b, status);
158142c2
FB
2325
2326 aSig = extractFloat32Frac( a );
2327 aExp = extractFloat32Exp( a );
2328 aSign = extractFloat32Sign( a );
2329 bSig = extractFloat32Frac( b );
2330 bExp = extractFloat32Exp( b );
158142c2
FB
2331 if ( aExp == 0xFF ) {
2332 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 2333 return propagateFloat32NaN(a, b, status);
158142c2 2334 }
ff32e16e 2335 float_raise(float_flag_invalid, status);
158142c2
FB
2336 return float32_default_nan;
2337 }
2338 if ( bExp == 0xFF ) {
ff32e16e
PM
2339 if (bSig) {
2340 return propagateFloat32NaN(a, b, status);
2341 }
158142c2
FB
2342 return a;
2343 }
2344 if ( bExp == 0 ) {
2345 if ( bSig == 0 ) {
ff32e16e 2346 float_raise(float_flag_invalid, status);
158142c2
FB
2347 return float32_default_nan;
2348 }
2349 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2350 }
2351 if ( aExp == 0 ) {
2352 if ( aSig == 0 ) return a;
2353 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2354 }
2355 expDiff = aExp - bExp;
2356 aSig |= 0x00800000;
2357 bSig |= 0x00800000;
2358 if ( expDiff < 32 ) {
2359 aSig <<= 8;
2360 bSig <<= 8;
2361 if ( expDiff < 0 ) {
2362 if ( expDiff < -1 ) return a;
2363 aSig >>= 1;
2364 }
2365 q = ( bSig <= aSig );
2366 if ( q ) aSig -= bSig;
2367 if ( 0 < expDiff ) {
bb98fe42 2368 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2369 q >>= 32 - expDiff;
2370 bSig >>= 2;
2371 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2372 }
2373 else {
2374 aSig >>= 2;
2375 bSig >>= 2;
2376 }
2377 }
2378 else {
2379 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2380 aSig64 = ( (uint64_t) aSig )<<40;
2381 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2382 expDiff -= 64;
2383 while ( 0 < expDiff ) {
2384 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2385 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2386 aSig64 = - ( ( bSig * q64 )<<38 );
2387 expDiff -= 62;
2388 }
2389 expDiff += 64;
2390 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2391 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2392 q = q64>>( 64 - expDiff );
2393 bSig <<= 6;
2394 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2395 }
2396 do {
2397 alternateASig = aSig;
2398 ++q;
2399 aSig -= bSig;
bb98fe42 2400 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2401 sigMean = aSig + alternateASig;
2402 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2403 aSig = alternateASig;
2404 }
bb98fe42 2405 zSign = ( (int32_t) aSig < 0 );
158142c2 2406 if ( zSign ) aSig = - aSig;
ff32e16e 2407 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
2408}
2409
369be8f6
PM
2410/*----------------------------------------------------------------------------
2411| Returns the result of multiplying the single-precision floating-point values
2412| `a' and `b' then adding 'c', with no intermediate rounding step after the
2413| multiplication. The operation is performed according to the IEC/IEEE
2414| Standard for Binary Floating-Point Arithmetic 754-2008.
2415| The flags argument allows the caller to select negation of the
2416| addend, the intermediate product, or the final result. (The difference
2417| between this and having the caller do a separate negation is that negating
2418| externally will flip the sign bit on NaNs.)
2419*----------------------------------------------------------------------------*/
2420
e5a41ffa
PM
2421float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2422 float_status *status)
369be8f6
PM
2423{
2424 flag aSign, bSign, cSign, zSign;
94a49d86 2425 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2426 uint32_t aSig, bSig, cSig;
2427 flag pInf, pZero, pSign;
2428 uint64_t pSig64, cSig64, zSig64;
2429 uint32_t pSig;
2430 int shiftcount;
2431 flag signflip, infzero;
2432
ff32e16e
PM
2433 a = float32_squash_input_denormal(a, status);
2434 b = float32_squash_input_denormal(b, status);
2435 c = float32_squash_input_denormal(c, status);
369be8f6
PM
2436 aSig = extractFloat32Frac(a);
2437 aExp = extractFloat32Exp(a);
2438 aSign = extractFloat32Sign(a);
2439 bSig = extractFloat32Frac(b);
2440 bExp = extractFloat32Exp(b);
2441 bSign = extractFloat32Sign(b);
2442 cSig = extractFloat32Frac(c);
2443 cExp = extractFloat32Exp(c);
2444 cSign = extractFloat32Sign(c);
2445
2446 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2447 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2448
2449 /* It is implementation-defined whether the cases of (0,inf,qnan)
2450 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2451 * they return if they do), so we have to hand this information
2452 * off to the target-specific pick-a-NaN routine.
2453 */
2454 if (((aExp == 0xff) && aSig) ||
2455 ((bExp == 0xff) && bSig) ||
2456 ((cExp == 0xff) && cSig)) {
ff32e16e 2457 return propagateFloat32MulAddNaN(a, b, c, infzero, status);
369be8f6
PM
2458 }
2459
2460 if (infzero) {
ff32e16e 2461 float_raise(float_flag_invalid, status);
369be8f6
PM
2462 return float32_default_nan;
2463 }
2464
2465 if (flags & float_muladd_negate_c) {
2466 cSign ^= 1;
2467 }
2468
2469 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2470
2471 /* Work out the sign and type of the product */
2472 pSign = aSign ^ bSign;
2473 if (flags & float_muladd_negate_product) {
2474 pSign ^= 1;
2475 }
2476 pInf = (aExp == 0xff) || (bExp == 0xff);
2477 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2478
2479 if (cExp == 0xff) {
2480 if (pInf && (pSign ^ cSign)) {
2481 /* addition of opposite-signed infinities => InvalidOperation */
ff32e16e 2482 float_raise(float_flag_invalid, status);
369be8f6
PM
2483 return float32_default_nan;
2484 }
2485 /* Otherwise generate an infinity of the same sign */
2486 return packFloat32(cSign ^ signflip, 0xff, 0);
2487 }
2488
2489 if (pInf) {
2490 return packFloat32(pSign ^ signflip, 0xff, 0);
2491 }
2492
2493 if (pZero) {
2494 if (cExp == 0) {
2495 if (cSig == 0) {
2496 /* Adding two exact zeroes */
2497 if (pSign == cSign) {
2498 zSign = pSign;
2499 } else if (STATUS(float_rounding_mode) == float_round_down) {
2500 zSign = 1;
2501 } else {
2502 zSign = 0;
2503 }
2504 return packFloat32(zSign ^ signflip, 0, 0);
2505 }
2506 /* Exact zero plus a denorm */
2507 if (STATUS(flush_to_zero)) {
ff32e16e 2508 float_raise(float_flag_output_denormal, status);
369be8f6
PM
2509 return packFloat32(cSign ^ signflip, 0, 0);
2510 }
2511 }
2512 /* Zero plus something non-zero : just return the something */
67d43538
PM
2513 if (flags & float_muladd_halve_result) {
2514 if (cExp == 0) {
2515 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2516 }
2517 /* Subtract one to halve, and one again because roundAndPackFloat32
2518 * wants one less than the true exponent.
2519 */
2520 cExp -= 2;
2521 cSig = (cSig | 0x00800000) << 7;
ff32e16e 2522 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
67d43538 2523 }
a6e7c184 2524 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2525 }
2526
2527 if (aExp == 0) {
2528 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2529 }
2530 if (bExp == 0) {
2531 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2532 }
2533
2534 /* Calculate the actual result a * b + c */
2535
2536 /* Multiply first; this is easy. */
2537 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2538 * because we want the true exponent, not the "one-less-than"
2539 * flavour that roundAndPackFloat32() takes.
2540 */
2541 pExp = aExp + bExp - 0x7e;
2542 aSig = (aSig | 0x00800000) << 7;
2543 bSig = (bSig | 0x00800000) << 8;
2544 pSig64 = (uint64_t)aSig * bSig;
2545 if ((int64_t)(pSig64 << 1) >= 0) {
2546 pSig64 <<= 1;
2547 pExp--;
2548 }
2549
2550 zSign = pSign ^ signflip;
2551
2552 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2553 * position 62.
2554 */
2555 if (cExp == 0) {
2556 if (!cSig) {
2557 /* Throw out the special case of c being an exact zero now */
2558 shift64RightJamming(pSig64, 32, &pSig64);
2559 pSig = pSig64;
67d43538
PM
2560 if (flags & float_muladd_halve_result) {
2561 pExp--;
2562 }
369be8f6 2563 return roundAndPackFloat32(zSign, pExp - 1,
ff32e16e 2564 pSig, status);
369be8f6
PM
2565 }
2566 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2567 }
2568
2569 cSig64 = (uint64_t)cSig << (62 - 23);
2570 cSig64 |= LIT64(0x4000000000000000);
2571 expDiff = pExp - cExp;
2572
2573 if (pSign == cSign) {
2574 /* Addition */
2575 if (expDiff > 0) {
2576 /* scale c to match p */
2577 shift64RightJamming(cSig64, expDiff, &cSig64);
2578 zExp = pExp;
2579 } else if (expDiff < 0) {
2580 /* scale p to match c */
2581 shift64RightJamming(pSig64, -expDiff, &pSig64);
2582 zExp = cExp;
2583 } else {
2584 /* no scaling needed */
2585 zExp = cExp;
2586 }
2587 /* Add significands and make sure explicit bit ends up in posn 62 */
2588 zSig64 = pSig64 + cSig64;
2589 if ((int64_t)zSig64 < 0) {
2590 shift64RightJamming(zSig64, 1, &zSig64);
2591 } else {
2592 zExp--;
2593 }
2594 } else {
2595 /* Subtraction */
2596 if (expDiff > 0) {
2597 shift64RightJamming(cSig64, expDiff, &cSig64);
2598 zSig64 = pSig64 - cSig64;
2599 zExp = pExp;
2600 } else if (expDiff < 0) {
2601 shift64RightJamming(pSig64, -expDiff, &pSig64);
2602 zSig64 = cSig64 - pSig64;
2603 zExp = cExp;
2604 zSign ^= 1;
2605 } else {
2606 zExp = pExp;
2607 if (cSig64 < pSig64) {
2608 zSig64 = pSig64 - cSig64;
2609 } else if (pSig64 < cSig64) {
2610 zSig64 = cSig64 - pSig64;
2611 zSign ^= 1;
2612 } else {
2613 /* Exact zero */
2614 zSign = signflip;
2615 if (STATUS(float_rounding_mode) == float_round_down) {
2616 zSign ^= 1;
2617 }
2618 return packFloat32(zSign, 0, 0);
2619 }
2620 }
2621 --zExp;
2622 /* Normalize to put the explicit bit back into bit 62. */
2623 shiftcount = countLeadingZeros64(zSig64) - 1;
2624 zSig64 <<= shiftcount;
2625 zExp -= shiftcount;
2626 }
67d43538
PM
2627 if (flags & float_muladd_halve_result) {
2628 zExp--;
2629 }
2630
369be8f6 2631 shift64RightJamming(zSig64, 32, &zSig64);
ff32e16e 2632 return roundAndPackFloat32(zSign, zExp, zSig64, status);
369be8f6
PM
2633}
2634
2635
158142c2
FB
2636/*----------------------------------------------------------------------------
2637| Returns the square root of the single-precision floating-point value `a'.
2638| The operation is performed according to the IEC/IEEE Standard for Binary
2639| Floating-Point Arithmetic.
2640*----------------------------------------------------------------------------*/
2641
e5a41ffa 2642float32 float32_sqrt(float32 a, float_status *status)
158142c2
FB
2643{
2644 flag aSign;
94a49d86 2645 int_fast16_t aExp, zExp;
bb98fe42
AF
2646 uint32_t aSig, zSig;
2647 uint64_t rem, term;
ff32e16e 2648 a = float32_squash_input_denormal(a, status);
158142c2
FB
2649
2650 aSig = extractFloat32Frac( a );
2651 aExp = extractFloat32Exp( a );
2652 aSign = extractFloat32Sign( a );
2653 if ( aExp == 0xFF ) {
ff32e16e
PM
2654 if (aSig) {
2655 return propagateFloat32NaN(a, float32_zero, status);
2656 }
158142c2 2657 if ( ! aSign ) return a;
ff32e16e 2658 float_raise(float_flag_invalid, status);
158142c2
FB
2659 return float32_default_nan;
2660 }
2661 if ( aSign ) {
2662 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 2663 float_raise(float_flag_invalid, status);
158142c2
FB
2664 return float32_default_nan;
2665 }
2666 if ( aExp == 0 ) {
f090c9d4 2667 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2668 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2669 }
2670 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2671 aSig = ( aSig | 0x00800000 )<<8;
2672 zSig = estimateSqrt32( aExp, aSig ) + 2;
2673 if ( ( zSig & 0x7F ) <= 5 ) {
2674 if ( zSig < 2 ) {
2675 zSig = 0x7FFFFFFF;
2676 goto roundAndPack;
2677 }
2678 aSig >>= aExp & 1;
bb98fe42
AF
2679 term = ( (uint64_t) zSig ) * zSig;
2680 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2681 while ( (int64_t) rem < 0 ) {
158142c2 2682 --zSig;
bb98fe42 2683 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2684 }
2685 zSig |= ( rem != 0 );
2686 }
2687 shift32RightJamming( zSig, 1, &zSig );
2688 roundAndPack:
ff32e16e 2689 return roundAndPackFloat32(0, zExp, zSig, status);
158142c2
FB
2690
2691}
2692
8229c991
AJ
2693/*----------------------------------------------------------------------------
2694| Returns the binary exponential of the single-precision floating-point value
2695| `a'. The operation is performed according to the IEC/IEEE Standard for
2696| Binary Floating-Point Arithmetic.
2697|
2698| Uses the following identities:
2699|
2700| 1. -------------------------------------------------------------------------
2701| x x*ln(2)
2702| 2 = e
2703|
2704| 2. -------------------------------------------------------------------------
2705| 2 3 4 5 n
2706| x x x x x x x
2707| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2708| 1! 2! 3! 4! 5! n!
2709*----------------------------------------------------------------------------*/
2710
2711static const float64 float32_exp2_coefficients[15] =
2712{
d5138cf4
PM
2713 const_float64( 0x3ff0000000000000ll ), /* 1 */
2714 const_float64( 0x3fe0000000000000ll ), /* 2 */
2715 const_float64( 0x3fc5555555555555ll ), /* 3 */
2716 const_float64( 0x3fa5555555555555ll ), /* 4 */
2717 const_float64( 0x3f81111111111111ll ), /* 5 */
2718 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2719 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2720 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2721 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2722 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2723 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2724 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2725 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2726 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2727 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2728};
2729
e5a41ffa 2730float32 float32_exp2(float32 a, float_status *status)
8229c991
AJ
2731{
2732 flag aSign;
94a49d86 2733 int_fast16_t aExp;
bb98fe42 2734 uint32_t aSig;
8229c991
AJ
2735 float64 r, x, xn;
2736 int i;
ff32e16e 2737 a = float32_squash_input_denormal(a, status);
8229c991
AJ
2738
2739 aSig = extractFloat32Frac( a );
2740 aExp = extractFloat32Exp( a );
2741 aSign = extractFloat32Sign( a );
2742
2743 if ( aExp == 0xFF) {
ff32e16e
PM
2744 if (aSig) {
2745 return propagateFloat32NaN(a, float32_zero, status);
2746 }
8229c991
AJ
2747 return (aSign) ? float32_zero : a;
2748 }
2749 if (aExp == 0) {
2750 if (aSig == 0) return float32_one;
2751 }
2752
ff32e16e 2753 float_raise(float_flag_inexact, status);
8229c991
AJ
2754
2755 /* ******************************* */
2756 /* using float64 for approximation */
2757 /* ******************************* */
ff32e16e
PM
2758 x = float32_to_float64(a, status);
2759 x = float64_mul(x, float64_ln2, status);
8229c991
AJ
2760
2761 xn = x;
2762 r = float64_one;
2763 for (i = 0 ; i < 15 ; i++) {
2764 float64 f;
2765
ff32e16e
PM
2766 f = float64_mul(xn, float32_exp2_coefficients[i], status);
2767 r = float64_add(r, f, status);
8229c991 2768
ff32e16e 2769 xn = float64_mul(xn, x, status);
8229c991
AJ
2770 }
2771
2772 return float64_to_float32(r, status);
2773}
2774
374dfc33
AJ
2775/*----------------------------------------------------------------------------
2776| Returns the binary log of the single-precision floating-point value `a'.
2777| The operation is performed according to the IEC/IEEE Standard for Binary
2778| Floating-Point Arithmetic.
2779*----------------------------------------------------------------------------*/
e5a41ffa 2780float32 float32_log2(float32 a, float_status *status)
374dfc33
AJ
2781{
2782 flag aSign, zSign;
94a49d86 2783 int_fast16_t aExp;
bb98fe42 2784 uint32_t aSig, zSig, i;
374dfc33 2785
ff32e16e 2786 a = float32_squash_input_denormal(a, status);
374dfc33
AJ
2787 aSig = extractFloat32Frac( a );
2788 aExp = extractFloat32Exp( a );
2789 aSign = extractFloat32Sign( a );
2790
2791 if ( aExp == 0 ) {
2792 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2793 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2794 }
2795 if ( aSign ) {
ff32e16e 2796 float_raise(float_flag_invalid, status);
374dfc33
AJ
2797 return float32_default_nan;
2798 }
2799 if ( aExp == 0xFF ) {
ff32e16e
PM
2800 if (aSig) {
2801 return propagateFloat32NaN(a, float32_zero, status);
2802 }
374dfc33
AJ
2803 return a;
2804 }
2805
2806 aExp -= 0x7F;
2807 aSig |= 0x00800000;
2808 zSign = aExp < 0;
2809 zSig = aExp << 23;
2810
2811 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2812 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2813 if ( aSig & 0x01000000 ) {
2814 aSig >>= 1;
2815 zSig |= i;
2816 }
2817 }
2818
2819 if ( zSign )
2820 zSig = -zSig;
2821
ff32e16e 2822 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
374dfc33
AJ
2823}
2824
158142c2
FB
2825/*----------------------------------------------------------------------------
2826| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2827| the corresponding value `b', and 0 otherwise. The invalid exception is
2828| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2829| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2830*----------------------------------------------------------------------------*/
2831
e5a41ffa 2832int float32_eq(float32 a, float32 b, float_status *status)
158142c2 2833{
b689362d 2834 uint32_t av, bv;
ff32e16e
PM
2835 a = float32_squash_input_denormal(a, status);
2836 b = float32_squash_input_denormal(b, status);
158142c2
FB
2837
2838 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2839 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2840 ) {
ff32e16e 2841 float_raise(float_flag_invalid, status);
158142c2
FB
2842 return 0;
2843 }
b689362d
AJ
2844 av = float32_val(a);
2845 bv = float32_val(b);
2846 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2847}
2848
2849/*----------------------------------------------------------------------------
2850| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2851| or equal to the corresponding value `b', and 0 otherwise. The invalid
2852| exception is raised if either operand is a NaN. The comparison is performed
2853| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2854*----------------------------------------------------------------------------*/
2855
e5a41ffa 2856int float32_le(float32 a, float32 b, float_status *status)
158142c2
FB
2857{
2858 flag aSign, bSign;
bb98fe42 2859 uint32_t av, bv;
ff32e16e
PM
2860 a = float32_squash_input_denormal(a, status);
2861 b = float32_squash_input_denormal(b, status);
158142c2
FB
2862
2863 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2864 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2865 ) {
ff32e16e 2866 float_raise(float_flag_invalid, status);
158142c2
FB
2867 return 0;
2868 }
2869 aSign = extractFloat32Sign( a );
2870 bSign = extractFloat32Sign( b );
f090c9d4
PB
2871 av = float32_val(a);
2872 bv = float32_val(b);
bb98fe42 2873 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2874 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2875
2876}
2877
2878/*----------------------------------------------------------------------------
2879| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2880| the corresponding value `b', and 0 otherwise. The invalid exception is
2881| raised if either operand is a NaN. The comparison is performed according
2882| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2883*----------------------------------------------------------------------------*/
2884
e5a41ffa 2885int float32_lt(float32 a, float32 b, float_status *status)
158142c2
FB
2886{
2887 flag aSign, bSign;
bb98fe42 2888 uint32_t av, bv;
ff32e16e
PM
2889 a = float32_squash_input_denormal(a, status);
2890 b = float32_squash_input_denormal(b, status);
158142c2
FB
2891
2892 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2893 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2894 ) {
ff32e16e 2895 float_raise(float_flag_invalid, status);
158142c2
FB
2896 return 0;
2897 }
2898 aSign = extractFloat32Sign( a );
2899 bSign = extractFloat32Sign( b );
f090c9d4
PB
2900 av = float32_val(a);
2901 bv = float32_val(b);
bb98fe42 2902 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2903 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2904
2905}
2906
67b7861d
AJ
2907/*----------------------------------------------------------------------------
2908| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2909| be compared, and 0 otherwise. The invalid exception is raised if either
2910| operand is a NaN. The comparison is performed according to the IEC/IEEE
2911| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2912*----------------------------------------------------------------------------*/
2913
e5a41ffa 2914int float32_unordered(float32 a, float32 b, float_status *status)
67b7861d 2915{
ff32e16e
PM
2916 a = float32_squash_input_denormal(a, status);
2917 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
2918
2919 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2920 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2921 ) {
ff32e16e 2922 float_raise(float_flag_invalid, status);
67b7861d
AJ
2923 return 1;
2924 }
2925 return 0;
2926}
b689362d 2927
158142c2
FB
2928/*----------------------------------------------------------------------------
2929| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2930| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2931| exception. The comparison is performed according to the IEC/IEEE Standard
2932| for Binary Floating-Point Arithmetic.
158142c2
FB
2933*----------------------------------------------------------------------------*/
2934
e5a41ffa 2935int float32_eq_quiet(float32 a, float32 b, float_status *status)
158142c2 2936{
ff32e16e
PM
2937 a = float32_squash_input_denormal(a, status);
2938 b = float32_squash_input_denormal(b, status);
158142c2
FB
2939
2940 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2941 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2942 ) {
b689362d 2943 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 2944 float_raise(float_flag_invalid, status);
b689362d 2945 }
158142c2
FB
2946 return 0;
2947 }
b689362d
AJ
2948 return ( float32_val(a) == float32_val(b) ) ||
2949 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2950}
2951
2952/*----------------------------------------------------------------------------
2953| Returns 1 if the single-precision floating-point value `a' is less than or
2954| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2955| cause an exception. Otherwise, the comparison is performed according to the
2956| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2957*----------------------------------------------------------------------------*/
2958
e5a41ffa 2959int float32_le_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
2960{
2961 flag aSign, bSign;
bb98fe42 2962 uint32_t av, bv;
ff32e16e
PM
2963 a = float32_squash_input_denormal(a, status);
2964 b = float32_squash_input_denormal(b, status);
158142c2
FB
2965
2966 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2967 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2968 ) {
2969 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 2970 float_raise(float_flag_invalid, status);
158142c2
FB
2971 }
2972 return 0;
2973 }
2974 aSign = extractFloat32Sign( a );
2975 bSign = extractFloat32Sign( b );
f090c9d4
PB
2976 av = float32_val(a);
2977 bv = float32_val(b);
bb98fe42 2978 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2979 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2980
2981}
2982
2983/*----------------------------------------------------------------------------
2984| Returns 1 if the single-precision floating-point value `a' is less than
2985| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2986| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2987| Standard for Binary Floating-Point Arithmetic.
2988*----------------------------------------------------------------------------*/
2989
e5a41ffa 2990int float32_lt_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
2991{
2992 flag aSign, bSign;
bb98fe42 2993 uint32_t av, bv;
ff32e16e
PM
2994 a = float32_squash_input_denormal(a, status);
2995 b = float32_squash_input_denormal(b, status);
158142c2
FB
2996
2997 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2998 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2999 ) {
3000 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 3001 float_raise(float_flag_invalid, status);
158142c2
FB
3002 }
3003 return 0;
3004 }
3005 aSign = extractFloat32Sign( a );
3006 bSign = extractFloat32Sign( b );
f090c9d4
PB
3007 av = float32_val(a);
3008 bv = float32_val(b);
bb98fe42 3009 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3010 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3011
3012}
3013
67b7861d
AJ
3014/*----------------------------------------------------------------------------
3015| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3016| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3017| comparison is performed according to the IEC/IEEE Standard for Binary
3018| Floating-Point Arithmetic.
3019*----------------------------------------------------------------------------*/
3020
e5a41ffa 3021int float32_unordered_quiet(float32 a, float32 b, float_status *status)
67b7861d 3022{
ff32e16e
PM
3023 a = float32_squash_input_denormal(a, status);
3024 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
3025
3026 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3027 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3028 ) {
3029 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 3030 float_raise(float_flag_invalid, status);
67b7861d
AJ
3031 }
3032 return 1;
3033 }
3034 return 0;
3035}
3036
158142c2
FB
3037/*----------------------------------------------------------------------------
3038| Returns the result of converting the double-precision floating-point value
3039| `a' to the 32-bit two's complement integer format. The conversion is
3040| performed according to the IEC/IEEE Standard for Binary Floating-Point
3041| Arithmetic---which means in particular that the conversion is rounded
3042| according to the current rounding mode. If `a' is a NaN, the largest
3043| positive integer is returned. Otherwise, if the conversion overflows, the
3044| largest integer with the same sign as `a' is returned.
3045*----------------------------------------------------------------------------*/
3046
e5a41ffa 3047int32 float64_to_int32(float64 a, float_status *status)
158142c2
FB
3048{
3049 flag aSign;
94a49d86 3050 int_fast16_t aExp, shiftCount;
bb98fe42 3051 uint64_t aSig;
ff32e16e 3052 a = float64_squash_input_denormal(a, status);
158142c2
FB
3053
3054 aSig = extractFloat64Frac( a );
3055 aExp = extractFloat64Exp( a );
3056 aSign = extractFloat64Sign( a );
3057 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3058 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3059 shiftCount = 0x42C - aExp;
3060 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 3061 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
3062
3063}
3064
3065/*----------------------------------------------------------------------------
3066| Returns the result of converting the double-precision floating-point value
3067| `a' to the 32-bit two's complement integer format. The conversion is
3068| performed according to the IEC/IEEE Standard for Binary Floating-Point
3069| Arithmetic, except that the conversion is always rounded toward zero.
3070| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3071| the conversion overflows, the largest integer with the same sign as `a' is
3072| returned.
3073*----------------------------------------------------------------------------*/
3074
e5a41ffa 3075int32 float64_to_int32_round_to_zero(float64 a, float_status *status)
158142c2
FB
3076{
3077 flag aSign;
94a49d86 3078 int_fast16_t aExp, shiftCount;
bb98fe42 3079 uint64_t aSig, savedASig;
b3a6a2e0 3080 int32_t z;
ff32e16e 3081 a = float64_squash_input_denormal(a, status);
158142c2
FB
3082
3083 aSig = extractFloat64Frac( a );
3084 aExp = extractFloat64Exp( a );
3085 aSign = extractFloat64Sign( a );
3086 if ( 0x41E < aExp ) {
3087 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3088 goto invalid;
3089 }
3090 else if ( aExp < 0x3FF ) {
3091 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3092 return 0;
3093 }
3094 aSig |= LIT64( 0x0010000000000000 );
3095 shiftCount = 0x433 - aExp;
3096 savedASig = aSig;
3097 aSig >>= shiftCount;
3098 z = aSig;
3099 if ( aSign ) z = - z;
3100 if ( ( z < 0 ) ^ aSign ) {
3101 invalid:
ff32e16e 3102 float_raise(float_flag_invalid, status);
bb98fe42 3103 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
3104 }
3105 if ( ( aSig<<shiftCount ) != savedASig ) {
3106 STATUS(float_exception_flags) |= float_flag_inexact;
3107 }
3108 return z;
3109
3110}
3111
cbcef455
PM
3112/*----------------------------------------------------------------------------
3113| Returns the result of converting the double-precision floating-point value
3114| `a' to the 16-bit two's complement integer format. The conversion is
3115| performed according to the IEC/IEEE Standard for Binary Floating-Point
3116| Arithmetic, except that the conversion is always rounded toward zero.
3117| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3118| the conversion overflows, the largest integer with the same sign as `a' is
3119| returned.
3120*----------------------------------------------------------------------------*/
3121
e5a41ffa 3122int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
3123{
3124 flag aSign;
94a49d86 3125 int_fast16_t aExp, shiftCount;
bb98fe42 3126 uint64_t aSig, savedASig;
cbcef455
PM
3127 int32 z;
3128
3129 aSig = extractFloat64Frac( a );
3130 aExp = extractFloat64Exp( a );
3131 aSign = extractFloat64Sign( a );
3132 if ( 0x40E < aExp ) {
3133 if ( ( aExp == 0x7FF ) && aSig ) {
3134 aSign = 0;
3135 }
3136 goto invalid;
3137 }
3138 else if ( aExp < 0x3FF ) {
3139 if ( aExp || aSig ) {
3140 STATUS(float_exception_flags) |= float_flag_inexact;
3141 }
3142 return 0;
3143 }
3144 aSig |= LIT64( 0x0010000000000000 );
3145 shiftCount = 0x433 - aExp;
3146 savedASig = aSig;
3147 aSig >>= shiftCount;
3148 z = aSig;
3149 if ( aSign ) {
3150 z = - z;
3151 }
3152 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3153 invalid:
ff32e16e 3154 float_raise(float_flag_invalid, status);
bb98fe42 3155 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
3156 }
3157 if ( ( aSig<<shiftCount ) != savedASig ) {
3158 STATUS(float_exception_flags) |= float_flag_inexact;
3159 }
3160 return z;
3161}
3162
158142c2
FB
3163/*----------------------------------------------------------------------------
3164| Returns the result of converting the double-precision floating-point value
3165| `a' to the 64-bit two's complement integer format. The conversion is
3166| performed according to the IEC/IEEE Standard for Binary Floating-Point
3167| Arithmetic---which means in particular that the conversion is rounded
3168| according to the current rounding mode. If `a' is a NaN, the largest
3169| positive integer is returned. Otherwise, if the conversion overflows, the
3170| largest integer with the same sign as `a' is returned.
3171*----------------------------------------------------------------------------*/
3172
e5a41ffa 3173int64 float64_to_int64(float64 a, float_status *status)
158142c2
FB
3174{
3175 flag aSign;
94a49d86 3176 int_fast16_t aExp, shiftCount;
bb98fe42 3177 uint64_t aSig, aSigExtra;
ff32e16e 3178 a = float64_squash_input_denormal(a, status);
158142c2
FB
3179
3180 aSig = extractFloat64Frac( a );
3181 aExp = extractFloat64Exp( a );
3182 aSign = extractFloat64Sign( a );
3183 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3184 shiftCount = 0x433 - aExp;
3185 if ( shiftCount <= 0 ) {
3186 if ( 0x43E < aExp ) {
ff32e16e 3187 float_raise(float_flag_invalid, status);
158142c2
FB
3188 if ( ! aSign
3189 || ( ( aExp == 0x7FF )
3190 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3191 ) {
3192 return LIT64( 0x7FFFFFFFFFFFFFFF );
3193 }
bb98fe42 3194 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3195 }
3196 aSigExtra = 0;
3197 aSig <<= - shiftCount;
3198 }
3199 else {
3200 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3201 }
ff32e16e 3202 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
3203
3204}
3205
3206/*----------------------------------------------------------------------------
3207| Returns the result of converting the double-precision floating-point value
3208| `a' to the 64-bit two's complement integer format. The conversion is
3209| performed according to the IEC/IEEE Standard for Binary Floating-Point
3210| Arithmetic, except that the conversion is always rounded toward zero.
3211| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3212| the conversion overflows, the largest integer with the same sign as `a' is
3213| returned.
3214*----------------------------------------------------------------------------*/
3215
e5a41ffa 3216int64 float64_to_int64_round_to_zero(float64 a, float_status *status)
158142c2
FB
3217{
3218 flag aSign;
94a49d86 3219 int_fast16_t aExp, shiftCount;
bb98fe42 3220 uint64_t aSig;
158142c2 3221 int64 z;
ff32e16e 3222 a = float64_squash_input_denormal(a, status);
158142c2
FB
3223
3224 aSig = extractFloat64Frac( a );
3225 aExp = extractFloat64Exp( a );
3226 aSign = extractFloat64Sign( a );
3227 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3228 shiftCount = aExp - 0x433;
3229 if ( 0 <= shiftCount ) {
3230 if ( 0x43E <= aExp ) {
f090c9d4 3231 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
ff32e16e 3232 float_raise(float_flag_invalid, status);
158142c2
FB
3233 if ( ! aSign
3234 || ( ( aExp == 0x7FF )
3235 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3236 ) {
3237 return LIT64( 0x7FFFFFFFFFFFFFFF );
3238 }
3239 }
bb98fe42 3240 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3241 }
3242 z = aSig<<shiftCount;
3243 }
3244 else {
3245 if ( aExp < 0x3FE ) {
3246 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3247 return 0;
3248 }
3249 z = aSig>>( - shiftCount );
bb98fe42 3250 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
3251 STATUS(float_exception_flags) |= float_flag_inexact;
3252 }
3253 }
3254 if ( aSign ) z = - z;
3255 return z;
3256
3257}
3258
3259/*----------------------------------------------------------------------------
3260| Returns the result of converting the double-precision floating-point value
3261| `a' to the single-precision floating-point format. The conversion is
3262| performed according to the IEC/IEEE Standard for Binary Floating-Point
3263| Arithmetic.
3264*----------------------------------------------------------------------------*/
3265
e5a41ffa 3266float32 float64_to_float32(float64 a, float_status *status)
158142c2
FB
3267{
3268 flag aSign;
94a49d86 3269 int_fast16_t aExp;
bb98fe42
AF
3270 uint64_t aSig;
3271 uint32_t zSig;
ff32e16e 3272 a = float64_squash_input_denormal(a, status);
158142c2
FB
3273
3274 aSig = extractFloat64Frac( a );
3275 aExp = extractFloat64Exp( a );
3276 aSign = extractFloat64Sign( a );
3277 if ( aExp == 0x7FF ) {
ff32e16e
PM
3278 if (aSig) {
3279 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3280 }
158142c2
FB
3281 return packFloat32( aSign, 0xFF, 0 );
3282 }
3283 shift64RightJamming( aSig, 22, &aSig );
3284 zSig = aSig;
3285 if ( aExp || zSig ) {
3286 zSig |= 0x40000000;
3287 aExp -= 0x381;
3288 }
ff32e16e 3289 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
3290
3291}
3292
60011498
PB
3293
3294/*----------------------------------------------------------------------------
3295| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3296| half-precision floating-point value, returning the result. After being
3297| shifted into the proper positions, the three fields are simply added
3298| together to form the result. This means that any integer portion of `zSig'
3299| will be added into the exponent. Since a properly normalized significand
3300| will have an integer portion equal to 1, the `zExp' input should be 1 less
3301| than the desired result exponent whenever `zSig' is a complete, normalized
3302| significand.
3303*----------------------------------------------------------------------------*/
94a49d86 3304static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
60011498 3305{
bb4d4bb3 3306 return make_float16(
bb98fe42 3307 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3308}
3309
c4a1c5e7
PM
3310/*----------------------------------------------------------------------------
3311| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3312| and significand `zSig', and returns the proper half-precision floating-
3313| point value corresponding to the abstract input. Ordinarily, the abstract
3314| value is simply rounded and packed into the half-precision format, with
3315| the inexact exception raised if the abstract input cannot be represented
3316| exactly. However, if the abstract value is too large, the overflow and
3317| inexact exceptions are raised and an infinity or maximal finite value is
3318| returned. If the abstract value is too small, the input value is rounded to
3319| a subnormal number, and the underflow and inexact exceptions are raised if
3320| the abstract input cannot be represented exactly as a subnormal half-
3321| precision floating-point number.
3322| The `ieee' flag indicates whether to use IEEE standard half precision, or
3323| ARM-style "alternative representation", which omits the NaN and Inf
3324| encodings in order to raise the maximum representable exponent by one.
3325| The input significand `zSig' has its binary point between bits 22
3326| and 23, which is 13 bits to the left of the usual location. This shifted
3327| significand must be normalized or smaller. If `zSig' is not normalized,
3328| `zExp' must be 0; in that case, the result returned is a subnormal number,
3329| and it must not require rounding. In the usual case that `zSig' is
3330| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3331| Note the slightly odd position of the binary point in zSig compared with the
3332| other roundAndPackFloat functions. This should probably be fixed if we
3333| need to implement more float16 routines than just conversion.
3334| The handling of underflow and overflow follows the IEC/IEEE Standard for
3335| Binary Floating-Point Arithmetic.
3336*----------------------------------------------------------------------------*/
3337
3338static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
e5a41ffa
PM
3339 uint32_t zSig, flag ieee,
3340 float_status *status)
c4a1c5e7
PM
3341{
3342 int maxexp = ieee ? 29 : 30;
3343 uint32_t mask;
3344 uint32_t increment;
c4a1c5e7
PM
3345 bool rounding_bumps_exp;
3346 bool is_tiny = false;
3347
3348 /* Calculate the mask of bits of the mantissa which are not
3349 * representable in half-precision and will be lost.
3350 */
3351 if (zExp < 1) {
3352 /* Will be denormal in halfprec */
3353 mask = 0x00ffffff;
3354 if (zExp >= -11) {
3355 mask >>= 11 + zExp;
3356 }
3357 } else {
3358 /* Normal number in halfprec */
3359 mask = 0x00001fff;
3360 }
3361
dc355b76 3362 switch (STATUS(float_rounding_mode)) {
c4a1c5e7
PM
3363 case float_round_nearest_even:
3364 increment = (mask + 1) >> 1;
3365 if ((zSig & mask) == increment) {
3366 increment = zSig & (increment << 1);
3367 }
3368 break;
f9288a76
PM
3369 case float_round_ties_away:
3370 increment = (mask + 1) >> 1;
3371 break;
c4a1c5e7
PM
3372 case float_round_up:
3373 increment = zSign ? 0 : mask;
3374 break;
3375 case float_round_down:
3376 increment = zSign ? mask : 0;
3377 break;
3378 default: /* round_to_zero */
3379 increment = 0;
3380 break;
3381 }
3382
3383 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3384
3385 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3386 if (ieee) {
ff32e16e 3387 float_raise(float_flag_overflow | float_flag_inexact, status);
c4a1c5e7
PM
3388 return packFloat16(zSign, 0x1f, 0);
3389 } else {
ff32e16e 3390 float_raise(float_flag_invalid, status);
c4a1c5e7
PM
3391 return packFloat16(zSign, 0x1f, 0x3ff);
3392 }
3393 }
3394
3395 if (zExp < 0) {
3396 /* Note that flush-to-zero does not affect half-precision results */
3397 is_tiny =
3398 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3399 || (zExp < -1)
3400 || (!rounding_bumps_exp);
3401 }
3402 if (zSig & mask) {
ff32e16e 3403 float_raise(float_flag_inexact, status);
c4a1c5e7 3404 if (is_tiny) {
ff32e16e 3405 float_raise(float_flag_underflow, status);
c4a1c5e7
PM
3406 }
3407 }
3408
3409 zSig += increment;
3410 if (rounding_bumps_exp) {
3411 zSig >>= 1;
3412 zExp++;
3413 }
3414
3415 if (zExp < -10) {
3416 return packFloat16(zSign, 0, 0);
3417 }
3418 if (zExp < 0) {
3419 zSig >>= -zExp;
3420 zExp = 0;
3421 }
3422 return packFloat16(zSign, zExp, zSig >> 13);
3423}
3424
/* Normalizes the non-zero subnormal half-precision significand `aSig'.
 * The significand is shifted left until its leading set bit reaches bit 10
 * (countLeadingZeros32(aSig) == 21 after the shift); the shifted value is
 * stored through `zSigPtr' and the matching exponent, 1 minus the shift
 * distance, through `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - leftShift;
    *zSigPtr = aSig << leftShift;
}
3432
60011498
PB
3433/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3434 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3 3435
e5a41ffa 3436float32 float16_to_float32(float16 a, flag ieee, float_status *status)
60011498
PB
3437{
3438 flag aSign;
94a49d86 3439 int_fast16_t aExp;
bb98fe42 3440 uint32_t aSig;
60011498 3441
bb4d4bb3
PM
3442 aSign = extractFloat16Sign(a);
3443 aExp = extractFloat16Exp(a);
3444 aSig = extractFloat16Frac(a);
60011498
PB
3445
3446 if (aExp == 0x1f && ieee) {
3447 if (aSig) {
ff32e16e 3448 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
60011498 3449 }
4be8eeac 3450 return packFloat32(aSign, 0xff, 0);
60011498
PB
3451 }
3452 if (aExp == 0) {
60011498
PB
3453 if (aSig == 0) {
3454 return packFloat32(aSign, 0, 0);
3455 }
3456
c4a1c5e7
PM
3457 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3458 aExp--;
60011498
PB
3459 }
3460 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3461}
3462
e5a41ffa 3463float16 float32_to_float16(float32 a, flag ieee, float_status *status)
60011498
PB
3464{
3465 flag aSign;
94a49d86 3466 int_fast16_t aExp;
bb98fe42 3467 uint32_t aSig;
38970efa 3468
ff32e16e 3469 a = float32_squash_input_denormal(a, status);
60011498
PB
3470
3471 aSig = extractFloat32Frac( a );
3472 aExp = extractFloat32Exp( a );
3473 aSign = extractFloat32Sign( a );
3474 if ( aExp == 0xFF ) {
3475 if (aSig) {
600e30d2 3476 /* Input is a NaN */
600e30d2 3477 if (!ieee) {
ff32e16e 3478 float_raise(float_flag_invalid, status);
600e30d2
PM
3479 return packFloat16(aSign, 0, 0);
3480 }
38970efa 3481 return commonNaNToFloat16(
ff32e16e 3482 float32ToCommonNaN(a, status), status);
60011498 3483 }
600e30d2
PM
3484 /* Infinity */
3485 if (!ieee) {
ff32e16e 3486 float_raise(float_flag_invalid, status);
600e30d2
PM
3487 return packFloat16(aSign, 0x1f, 0x3ff);
3488 }
3489 return packFloat16(aSign, 0x1f, 0);
60011498 3490 }
600e30d2 3491 if (aExp == 0 && aSig == 0) {
60011498
PB
3492 return packFloat16(aSign, 0, 0);
3493 }
38970efa
PM
3494 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3495 * even if the input is denormal; however this is harmless because
3496 * the largest possible single-precision denormal is still smaller
3497 * than the smallest representable half-precision denormal, and so we
3498 * will end up ignoring aSig and returning via the "always return zero"
3499 * codepath.
3500 */
60011498 3501 aSig |= 0x00800000;
c4a1c5e7 3502 aExp -= 0x71;
60011498 3503
ff32e16e 3504 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
60011498
PB
3505}
3506
e5a41ffa 3507float64 float16_to_float64(float16 a, flag ieee, float_status *status)
14c9a07e
PM
3508{
3509 flag aSign;
3510 int_fast16_t aExp;
3511 uint32_t aSig;
3512
3513 aSign = extractFloat16Sign(a);
3514 aExp = extractFloat16Exp(a);
3515 aSig = extractFloat16Frac(a);
3516
3517 if (aExp == 0x1f && ieee) {
3518 if (aSig) {
3519 return commonNaNToFloat64(
ff32e16e 3520 float16ToCommonNaN(a, status), status);
14c9a07e
PM
3521 }
3522 return packFloat64(aSign, 0x7ff, 0);
3523 }
3524 if (aExp == 0) {
3525 if (aSig == 0) {
3526 return packFloat64(aSign, 0, 0);
3527 }
3528
3529 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3530 aExp--;
3531 }
3532 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3533}
3534
e5a41ffa 3535float16 float64_to_float16(float64 a, flag ieee, float_status *status)
14c9a07e
PM
3536{
3537 flag aSign;
3538 int_fast16_t aExp;
3539 uint64_t aSig;
3540 uint32_t zSig;
3541
ff32e16e 3542 a = float64_squash_input_denormal(a, status);
14c9a07e
PM
3543
3544 aSig = extractFloat64Frac(a);
3545 aExp = extractFloat64Exp(a);
3546 aSign = extractFloat64Sign(a);
3547 if (aExp == 0x7FF) {
3548 if (aSig) {
3549 /* Input is a NaN */
3550 if (!ieee) {
ff32e16e 3551 float_raise(float_flag_invalid, status);
14c9a07e
PM
3552 return packFloat16(aSign, 0, 0);
3553 }
3554 return commonNaNToFloat16(
ff32e16e 3555 float64ToCommonNaN(a, status), status);
14c9a07e
PM
3556 }
3557 /* Infinity */
3558 if (!ieee) {
ff32e16e 3559 float_raise(float_flag_invalid, status);
14c9a07e
PM
3560 return packFloat16(aSign, 0x1f, 0x3ff);
3561 }
3562 return packFloat16(aSign, 0x1f, 0);
3563 }
3564 shift64RightJamming(aSig, 29, &aSig);
3565 zSig = aSig;
3566 if (aExp == 0 && zSig == 0) {
3567 return packFloat16(aSign, 0, 0);
3568 }
3569 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3570 * even if the input is denormal; however this is harmless because
3571 * the largest possible single-precision denormal is still smaller
3572 * than the smallest representable half-precision denormal, and so we
3573 * will end up ignoring aSig and returning via the "always return zero"
3574 * codepath.
3575 */
3576 zSig |= 0x00800000;
3577 aExp -= 0x3F1;
3578
ff32e16e 3579 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
14c9a07e
PM
3580}
3581
158142c2
FB
3582/*----------------------------------------------------------------------------
3583| Returns the result of converting the double-precision floating-point value
3584| `a' to the extended double-precision floating-point format. The conversion
3585| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3586| Arithmetic.
3587*----------------------------------------------------------------------------*/
3588
e5a41ffa 3589floatx80 float64_to_floatx80(float64 a, float_status *status)
158142c2
FB
3590{
3591 flag aSign;
94a49d86 3592 int_fast16_t aExp;
bb98fe42 3593 uint64_t aSig;
158142c2 3594
ff32e16e 3595 a = float64_squash_input_denormal(a, status);
158142c2
FB
3596 aSig = extractFloat64Frac( a );
3597 aExp = extractFloat64Exp( a );
3598 aSign = extractFloat64Sign( a );
3599 if ( aExp == 0x7FF ) {
ff32e16e
PM
3600 if (aSig) {
3601 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3602 }
158142c2
FB
3603 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3604 }
3605 if ( aExp == 0 ) {
3606 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3607 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3608 }
3609 return
3610 packFloatx80(
3611 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3612
3613}
3614
158142c2
FB
3615/*----------------------------------------------------------------------------
3616| Returns the result of converting the double-precision floating-point value
3617| `a' to the quadruple-precision floating-point format. The conversion is
3618| performed according to the IEC/IEEE Standard for Binary Floating-Point
3619| Arithmetic.
3620*----------------------------------------------------------------------------*/
3621
e5a41ffa 3622float128 float64_to_float128(float64 a, float_status *status)
158142c2
FB
3623{
3624 flag aSign;
94a49d86 3625 int_fast16_t aExp;
bb98fe42 3626 uint64_t aSig, zSig0, zSig1;
158142c2 3627
ff32e16e 3628 a = float64_squash_input_denormal(a, status);
158142c2
FB
3629 aSig = extractFloat64Frac( a );
3630 aExp = extractFloat64Exp( a );
3631 aSign = extractFloat64Sign( a );
3632 if ( aExp == 0x7FF ) {
ff32e16e
PM
3633 if (aSig) {
3634 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3635 }
158142c2
FB
3636 return packFloat128( aSign, 0x7FFF, 0, 0 );
3637 }
3638 if ( aExp == 0 ) {
3639 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3640 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3641 --aExp;
3642 }
3643 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3644 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3645
3646}
3647
158142c2
FB
3648/*----------------------------------------------------------------------------
3649| Rounds the double-precision floating-point value `a' to an integer, and
3650| returns the result as a double-precision floating-point value. The
3651| operation is performed according to the IEC/IEEE Standard for Binary
3652| Floating-Point Arithmetic.
3653*----------------------------------------------------------------------------*/
3654
e5a41ffa 3655float64 float64_round_to_int(float64 a, float_status *status)
158142c2
FB
3656{
3657 flag aSign;
94a49d86 3658 int_fast16_t aExp;
bb98fe42 3659 uint64_t lastBitMask, roundBitsMask;
bb98fe42 3660 uint64_t z;
ff32e16e 3661 a = float64_squash_input_denormal(a, status);
158142c2
FB
3662
3663 aExp = extractFloat64Exp( a );
3664 if ( 0x433 <= aExp ) {
3665 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
ff32e16e 3666 return propagateFloat64NaN(a, a, status);
158142c2
FB
3667 }
3668 return a;
3669 }
3670 if ( aExp < 0x3FF ) {
bb98fe42 3671 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
158142c2
FB
3672 STATUS(float_exception_flags) |= float_flag_inexact;
3673 aSign = extractFloat64Sign( a );
3674 switch ( STATUS(float_rounding_mode) ) {
3675 case float_round_nearest_even:
3676 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3677 return packFloat64( aSign, 0x3FF, 0 );
3678 }
3679 break;
f9288a76
PM
3680 case float_round_ties_away:
3681 if (aExp == 0x3FE) {
3682 return packFloat64(aSign, 0x3ff, 0);
3683 }
3684 break;
158142c2 3685 case float_round_down:
f090c9d4 3686 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3687 case float_round_up:
f090c9d4
PB
3688 return make_float64(
3689 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3690 }
3691 return packFloat64( aSign, 0, 0 );
3692 }
3693 lastBitMask = 1;
3694 lastBitMask <<= 0x433 - aExp;
3695 roundBitsMask = lastBitMask - 1;
f090c9d4 3696 z = float64_val(a);
dc355b76
PM
3697 switch (STATUS(float_rounding_mode)) {
3698 case float_round_nearest_even:
3699 z += lastBitMask >> 1;
3700 if ((z & roundBitsMask) == 0) {
3701 z &= ~lastBitMask;
3702 }
3703 break;
f9288a76
PM
3704 case float_round_ties_away:
3705 z += lastBitMask >> 1;
3706 break;
dc355b76
PM
3707 case float_round_to_zero:
3708 break;
3709 case float_round_up:
3710 if (!extractFloat64Sign(make_float64(z))) {
3711 z += roundBitsMask;
3712 }
3713 break;
3714 case float_round_down:
3715 if (extractFloat64Sign(make_float64(z))) {
158142c2
FB
3716 z += roundBitsMask;
3717 }
dc355b76
PM
3718 break;
3719 default:
3720 abort();
158142c2
FB
3721 }
3722 z &= ~ roundBitsMask;
f090c9d4
PB
3723 if ( z != float64_val(a) )
3724 STATUS(float_exception_flags) |= float_flag_inexact;
3725 return make_float64(z);
158142c2
FB
3726
3727}
3728
e5a41ffa 3729float64 float64_trunc_to_int(float64 a, float_status *status)
e6e5906b
PB
3730{
3731 int oldmode;
3732 float64 res;
3733 oldmode = STATUS(float_rounding_mode);
3734 STATUS(float_rounding_mode) = float_round_to_zero;
ff32e16e 3735 res = float64_round_to_int(a, status);
e6e5906b
PB
3736 STATUS(float_rounding_mode) = oldmode;
3737 return res;
3738}
3739
158142c2
FB
3740/*----------------------------------------------------------------------------
3741| Returns the result of adding the absolute values of the double-precision
3742| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3743| before being returned. `zSign' is ignored if the result is a NaN.
3744| The addition is performed according to the IEC/IEEE Standard for Binary
3745| Floating-Point Arithmetic.
3746*----------------------------------------------------------------------------*/
3747
e5a41ffa
PM
3748static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3749 float_status *status)
158142c2 3750{
94a49d86 3751 int_fast16_t aExp, bExp, zExp;
bb98fe42 3752 uint64_t aSig, bSig, zSig;
94a49d86 3753 int_fast16_t expDiff;
158142c2
FB
3754
3755 aSig = extractFloat64Frac( a );
3756 aExp = extractFloat64Exp( a );
3757 bSig = extractFloat64Frac( b );
3758 bExp = extractFloat64Exp( b );
3759 expDiff = aExp - bExp;
3760 aSig <<= 9;
3761 bSig <<= 9;
3762 if ( 0 < expDiff ) {
3763 if ( aExp == 0x7FF ) {
ff32e16e
PM
3764 if (aSig) {
3765 return propagateFloat64NaN(a, b, status);
3766 }
158142c2
FB
3767 return a;
3768 }
3769 if ( bExp == 0 ) {
3770 --expDiff;
3771 }
3772 else {
3773 bSig |= LIT64( 0x2000000000000000 );
3774 }
3775 shift64RightJamming( bSig, expDiff, &bSig );
3776 zExp = aExp;
3777 }
3778 else if ( expDiff < 0 ) {
3779 if ( bExp == 0x7FF ) {
ff32e16e
PM
3780 if (bSig) {
3781 return propagateFloat64NaN(a, b, status);
3782 }
158142c2
FB
3783 return packFloat64( zSign, 0x7FF, 0 );
3784 }
3785 if ( aExp == 0 ) {
3786 ++expDiff;
3787 }
3788 else {
3789 aSig |= LIT64( 0x2000000000000000 );
3790 }
3791 shift64RightJamming( aSig, - expDiff, &aSig );
3792 zExp = bExp;
3793 }
3794 else {
3795 if ( aExp == 0x7FF ) {
ff32e16e
PM
3796 if (aSig | bSig) {
3797 return propagateFloat64NaN(a, b, status);
3798 }
158142c2
FB
3799 return a;
3800 }
fe76d976 3801 if ( aExp == 0 ) {
e6afc87f
PM
3802 if (STATUS(flush_to_zero)) {
3803 if (aSig | bSig) {
ff32e16e 3804 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
3805 }
3806 return packFloat64(zSign, 0, 0);
3807 }
fe76d976
PB
3808 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3809 }
158142c2
FB
3810 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3811 zExp = aExp;
3812 goto roundAndPack;
3813 }
3814 aSig |= LIT64( 0x2000000000000000 );
3815 zSig = ( aSig + bSig )<<1;
3816 --zExp;
bb98fe42 3817 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3818 zSig = aSig + bSig;
3819 ++zExp;
3820 }
3821 roundAndPack:
ff32e16e 3822 return roundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
3823
3824}
3825
3826/*----------------------------------------------------------------------------
3827| Returns the result of subtracting the absolute values of the double-
3828| precision floating-point values `a' and `b'. If `zSign' is 1, the
3829| difference is negated before being returned. `zSign' is ignored if the
3830| result is a NaN. The subtraction is performed according to the IEC/IEEE
3831| Standard for Binary Floating-Point Arithmetic.
3832*----------------------------------------------------------------------------*/
3833
e5a41ffa
PM
3834static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3835 float_status *status)
158142c2 3836{
94a49d86 3837 int_fast16_t aExp, bExp, zExp;
bb98fe42 3838 uint64_t aSig, bSig, zSig;
94a49d86 3839 int_fast16_t expDiff;
158142c2
FB
3840
3841 aSig = extractFloat64Frac( a );
3842 aExp = extractFloat64Exp( a );
3843 bSig = extractFloat64Frac( b );
3844 bExp = extractFloat64Exp( b );
3845 expDiff = aExp - bExp;
3846 aSig <<= 10;
3847 bSig <<= 10;
3848 if ( 0 < expDiff ) goto aExpBigger;
3849 if ( expDiff < 0 ) goto bExpBigger;
3850 if ( aExp == 0x7FF ) {
ff32e16e
PM
3851 if (aSig | bSig) {
3852 return propagateFloat64NaN(a, b, status);
3853 }
3854 float_raise(float_flag_invalid, status);
158142c2
FB
3855 return float64_default_nan;
3856 }
3857 if ( aExp == 0 ) {
3858 aExp = 1;
3859 bExp = 1;
3860 }
3861 if ( bSig < aSig ) goto aBigger;
3862 if ( aSig < bSig ) goto bBigger;
3863 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3864 bExpBigger:
3865 if ( bExp == 0x7FF ) {
ff32e16e
PM
3866 if (bSig) {
3867 return propagateFloat64NaN(a, b, status);
3868 }
158142c2
FB
3869 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3870 }
3871 if ( aExp == 0 ) {
3872 ++expDiff;
3873 }
3874 else {
3875 aSig |= LIT64( 0x4000000000000000 );
3876 }
3877 shift64RightJamming( aSig, - expDiff, &aSig );
3878 bSig |= LIT64( 0x4000000000000000 );
3879 bBigger:
3880 zSig = bSig - aSig;
3881 zExp = bExp;
3882 zSign ^= 1;
3883 goto normalizeRoundAndPack;
3884 aExpBigger:
3885 if ( aExp == 0x7FF ) {
ff32e16e
PM
3886 if (aSig) {
3887 return propagateFloat64NaN(a, b, status);
3888 }
158142c2
FB
3889 return a;
3890 }
3891 if ( bExp == 0 ) {
3892 --expDiff;
3893 }
3894 else {
3895 bSig |= LIT64( 0x4000000000000000 );
3896 }
3897 shift64RightJamming( bSig, expDiff, &bSig );
3898 aSig |= LIT64( 0x4000000000000000 );
3899 aBigger:
3900 zSig = aSig - bSig;
3901 zExp = aExp;
3902 normalizeRoundAndPack:
3903 --zExp;
ff32e16e 3904 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
3905
3906}
3907
3908/*----------------------------------------------------------------------------
3909| Returns the result of adding the double-precision floating-point values `a'
3910| and `b'. The operation is performed according to the IEC/IEEE Standard for
3911| Binary Floating-Point Arithmetic.
3912*----------------------------------------------------------------------------*/
3913
e5a41ffa 3914float64 float64_add(float64 a, float64 b, float_status *status)
158142c2
FB
3915{
3916 flag aSign, bSign;
ff32e16e
PM
3917 a = float64_squash_input_denormal(a, status);
3918 b = float64_squash_input_denormal(b, status);
158142c2
FB
3919
3920 aSign = extractFloat64Sign( a );
3921 bSign = extractFloat64Sign( b );
3922 if ( aSign == bSign ) {
ff32e16e 3923 return addFloat64Sigs(a, b, aSign, status);
158142c2
FB
3924 }
3925 else {
ff32e16e 3926 return subFloat64Sigs(a, b, aSign, status);
158142c2
FB
3927 }
3928
3929}
3930
3931/*----------------------------------------------------------------------------
3932| Returns the result of subtracting the double-precision floating-point values
3933| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3934| for Binary Floating-Point Arithmetic.
3935*----------------------------------------------------------------------------*/
3936
e5a41ffa 3937float64 float64_sub(float64 a, float64 b, float_status *status)
158142c2
FB
3938{
3939 flag aSign, bSign;
ff32e16e
PM
3940 a = float64_squash_input_denormal(a, status);
3941 b = float64_squash_input_denormal(b, status);
158142c2
FB
3942
3943 aSign = extractFloat64Sign( a );
3944 bSign = extractFloat64Sign( b );
3945 if ( aSign == bSign ) {
ff32e16e 3946 return subFloat64Sigs(a, b, aSign, status);
158142c2
FB
3947 }
3948 else {
ff32e16e 3949 return addFloat64Sigs(a, b, aSign, status);
158142c2
FB
3950 }
3951
3952}
3953
3954/*----------------------------------------------------------------------------
3955| Returns the result of multiplying the double-precision floating-point values
3956| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3957| for Binary Floating-Point Arithmetic.
3958*----------------------------------------------------------------------------*/
3959
e5a41ffa 3960float64 float64_mul(float64 a, float64 b, float_status *status)
158142c2
FB
3961{
3962 flag aSign, bSign, zSign;
94a49d86 3963 int_fast16_t aExp, bExp, zExp;
bb98fe42 3964 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3965
ff32e16e
PM
3966 a = float64_squash_input_denormal(a, status);
3967 b = float64_squash_input_denormal(b, status);
37d18660 3968
158142c2
FB
3969 aSig = extractFloat64Frac( a );
3970 aExp = extractFloat64Exp( a );
3971 aSign = extractFloat64Sign( a );
3972 bSig = extractFloat64Frac( b );
3973 bExp = extractFloat64Exp( b );
3974 bSign = extractFloat64Sign( b );
3975 zSign = aSign ^ bSign;
3976 if ( aExp == 0x7FF ) {
3977 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 3978 return propagateFloat64NaN(a, b, status);
158142c2
FB
3979 }
3980 if ( ( bExp | bSig ) == 0 ) {
ff32e16e 3981 float_raise(float_flag_invalid, status);
158142c2
FB
3982 return float64_default_nan;
3983 }
3984 return packFloat64( zSign, 0x7FF, 0 );
3985 }
3986 if ( bExp == 0x7FF ) {
ff32e16e
PM
3987 if (bSig) {
3988 return propagateFloat64NaN(a, b, status);
3989 }
158142c2 3990 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 3991 float_raise(float_flag_invalid, status);
158142c2
FB
3992 return float64_default_nan;
3993 }
3994 return packFloat64( zSign, 0x7FF, 0 );
3995 }
3996 if ( aExp == 0 ) {
3997 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3998 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3999 }
4000 if ( bExp == 0 ) {
4001 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4002 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4003 }
4004 zExp = aExp + bExp - 0x3FF;
4005 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4006 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4007 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4008 zSig0 |= ( zSig1 != 0 );
bb98fe42 4009 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
4010 zSig0 <<= 1;
4011 --zExp;
4012 }
ff32e16e 4013 return roundAndPackFloat64(zSign, zExp, zSig0, status);
158142c2
FB
4014
4015}
4016
4017/*----------------------------------------------------------------------------
4018| Returns the result of dividing the double-precision floating-point value `a'
4019| by the corresponding value `b'. The operation is performed according to
4020| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4021*----------------------------------------------------------------------------*/
4022
e5a41ffa 4023float64 float64_div(float64 a, float64 b, float_status *status)
158142c2
FB
4024{
4025 flag aSign, bSign, zSign;
94a49d86 4026 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
4027 uint64_t aSig, bSig, zSig;
4028 uint64_t rem0, rem1;
4029 uint64_t term0, term1;
ff32e16e
PM
4030 a = float64_squash_input_denormal(a, status);
4031 b = float64_squash_input_denormal(b, status);
158142c2
FB
4032
4033 aSig = extractFloat64Frac( a );
4034 aExp = extractFloat64Exp( a );
4035 aSign = extractFloat64Sign( a );
4036 bSig = extractFloat64Frac( b );
4037 bExp = extractFloat64Exp( b );
4038 bSign = extractFloat64Sign( b );
4039 zSign = aSign ^ bSign;
4040 if ( aExp == 0x7FF ) {
ff32e16e
PM
4041 if (aSig) {
4042 return propagateFloat64NaN(a, b, status);
4043 }
158142c2 4044 if ( bExp == 0x7FF ) {
ff32e16e
PM
4045 if (bSig) {
4046 return propagateFloat64NaN(a, b, status);
4047 }
4048 float_raise(float_flag_invalid, status);
158142c2
FB
4049 return float64_default_nan;
4050 }
4051 return packFloat64( zSign, 0x7FF, 0 );
4052 }
4053 if ( bExp == 0x7FF ) {
ff32e16e
PM
4054 if (bSig) {
4055 return propagateFloat64NaN(a, b, status);
4056 }
158142c2
FB
4057 return packFloat64( zSign, 0, 0 );
4058 }
4059 if ( bExp == 0 ) {
4060 if ( bSig == 0 ) {
4061 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 4062 float_raise(float_flag_invalid, status);
158142c2
FB
4063 return float64_default_nan;
4064 }
ff32e16e 4065 float_raise(float_flag_divbyzero, status);
158142c2
FB
4066 return packFloat64( zSign, 0x7FF, 0 );
4067 }
4068 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4069 }
4070 if ( aExp == 0 ) {
4071 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4072 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4073 }
4074 zExp = aExp - bExp + 0x3FD;
4075 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4076 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4077 if ( bSig <= ( aSig + aSig ) ) {
4078 aSig >>= 1;
4079 ++zExp;
4080 }
4081 zSig = estimateDiv128To64( aSig, 0, bSig );
4082 if ( ( zSig & 0x1FF ) <= 2 ) {
4083 mul64To128( bSig, zSig, &term0, &term1 );
4084 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4085 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4086 --zSig;
4087 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4088 }
4089 zSig |= ( rem1 != 0 );
4090 }
ff32e16e 4091 return roundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
4092
4093}
4094
4095/*----------------------------------------------------------------------------
4096| Returns the remainder of the double-precision floating-point value `a'
4097| with respect to the corresponding value `b'. The operation is performed
4098| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4099*----------------------------------------------------------------------------*/
4100
e5a41ffa 4101float64 float64_rem(float64 a, float64 b, float_status *status)
158142c2 4102{
ed086f3d 4103 flag aSign, zSign;
94a49d86 4104 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
4105 uint64_t aSig, bSig;
4106 uint64_t q, alternateASig;
4107 int64_t sigMean;
158142c2 4108
ff32e16e
PM
4109 a = float64_squash_input_denormal(a, status);
4110 b = float64_squash_input_denormal(b, status);
158142c2
FB
4111 aSig = extractFloat64Frac( a );
4112 aExp = extractFloat64Exp( a );
4113 aSign = extractFloat64Sign( a );
4114 bSig = extractFloat64Frac( b );
4115 bExp = extractFloat64Exp( b );
158142c2
FB
4116 if ( aExp == 0x7FF ) {
4117 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 4118 return propagateFloat64NaN(a, b, status);
158142c2 4119 }
ff32e16e 4120 float_raise(float_flag_invalid, status);
158142c2
FB
4121 return float64_default_nan;
4122 }
4123 if ( bExp == 0x7FF ) {
ff32e16e
PM
4124 if (bSig) {
4125 return propagateFloat64NaN(a, b, status);
4126 }
158142c2
FB
4127 return a;
4128 }
4129 if ( bExp == 0 ) {
4130 if ( bSig == 0 ) {
ff32e16e 4131 float_raise(float_flag_invalid, status);
158142c2
FB
4132 return float64_default_nan;
4133 }
4134 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4135 }
4136 if ( aExp == 0 ) {
4137 if ( aSig == 0 ) return a;
4138 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4139 }
4140 expDiff = aExp - bExp;
4141 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4142 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4143 if ( expDiff < 0 ) {
4144 if ( expDiff < -1 ) return a;
4145 aSig >>= 1;
4146 }
4147 q = ( bSig <= aSig );
4148 if ( q ) aSig -= bSig;
4149 expDiff -= 64;
4150 while ( 0 < expDiff ) {
4151 q = estimateDiv128To64( aSig, 0, bSig );
4152 q = ( 2 < q ) ? q - 2 : 0;
4153 aSig = - ( ( bSig>>2 ) * q );
4154 expDiff -= 62;
4155 }
4156 expDiff += 64;
4157 if ( 0 < expDiff ) {
4158 q = estimateDiv128To64( aSig, 0, bSig );
4159 q = ( 2 < q ) ? q - 2 : 0;
4160 q >>= 64 - expDiff;
4161 bSig >>= 2;
4162 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4163 }
4164 else {
4165 aSig >>= 2;
4166 bSig >>= 2;
4167 }
4168 do {
4169 alternateASig = aSig;
4170 ++q;
4171 aSig -= bSig;
bb98fe42 4172 } while ( 0 <= (int64_t) aSig );
158142c2
FB
4173 sigMean = aSig + alternateASig;
4174 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4175 aSig = alternateASig;
4176 }
bb98fe42 4177 zSign = ( (int64_t) aSig < 0 );
158142c2 4178 if ( zSign ) aSig = - aSig;
ff32e16e 4179 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
4180
4181}
4182
369be8f6
PM
4183/*----------------------------------------------------------------------------
4184| Returns the result of multiplying the double-precision floating-point values
4185| `a' and `b' then adding 'c', with no intermediate rounding step after the
4186| multiplication. The operation is performed according to the IEC/IEEE
4187| Standard for Binary Floating-Point Arithmetic 754-2008.
4188| The flags argument allows the caller to select negation of the
4189| addend, the intermediate product, or the final result. (The difference
4190| between this and having the caller do a separate negation is that negating
4191| externally will flip the sign bit on NaNs.)
4192*----------------------------------------------------------------------------*/
4193
e5a41ffa
PM
4194float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4195 float_status *status)
369be8f6
PM
4196{
4197 flag aSign, bSign, cSign, zSign;
94a49d86 4198 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
4199 uint64_t aSig, bSig, cSig;
4200 flag pInf, pZero, pSign;
4201 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4202 int shiftcount;
4203 flag signflip, infzero;
4204
ff32e16e
PM
4205 a = float64_squash_input_denormal(a, status);
4206 b = float64_squash_input_denormal(b, status);
4207 c = float64_squash_input_denormal(c, status);
369be8f6
PM
4208 aSig = extractFloat64Frac(a);
4209 aExp = extractFloat64Exp(a);
4210 aSign = extractFloat64Sign(a);
4211 bSig = extractFloat64Frac(b);
4212 bExp = extractFloat64Exp(b);
4213 bSign = extractFloat64Sign(b);
4214 cSig = extractFloat64Frac(c);
4215 cExp = extractFloat64Exp(c);
4216 cSign = extractFloat64Sign(c);
4217
4218 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4219 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4220
4221 /* It is implementation-defined whether the cases of (0,inf,qnan)
4222 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4223 * they return if they do), so we have to hand this information
4224 * off to the target-specific pick-a-NaN routine.
4225 */
4226 if (((aExp == 0x7ff) && aSig) ||
4227 ((bExp == 0x7ff) && bSig) ||
4228 ((cExp == 0x7ff) && cSig)) {
ff32e16e 4229 return propagateFloat64MulAddNaN(a, b, c, infzero, status);
369be8f6
PM
4230 }
4231
4232 if (infzero) {
ff32e16e 4233 float_raise(float_flag_invalid, status);
369be8f6
PM
4234 return float64_default_nan;
4235 }
4236
4237 if (flags & float_muladd_negate_c) {
4238 cSign ^= 1;
4239 }
4240
4241 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4242
4243 /* Work out the sign and type of the product */
4244 pSign = aSign ^ bSign;
4245 if (flags & float_muladd_negate_product) {
4246 pSign ^= 1;
4247 }
4248 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4249 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4250
4251 if (cExp == 0x7ff) {
4252 if (pInf && (pSign ^ cSign)) {
4253 /* addition of opposite-signed infinities => InvalidOperation */
ff32e16e 4254 float_raise(float_flag_invalid, status);
369be8f6
PM
4255 return float64_default_nan;
4256 }
4257 /* Otherwise generate an infinity of the same sign */
4258 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4259 }
4260
4261 if (pInf) {
4262 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4263 }
4264
4265 if (pZero) {
4266 if (cExp == 0) {
4267 if (cSig == 0) {
4268 /* Adding two exact zeroes */
4269 if (pSign == cSign) {
4270 zSign = pSign;
4271 } else if (STATUS(float_rounding_mode) == float_round_down) {
4272 zSign = 1;
4273 } else {
4274 zSign = 0;
4275 }
4276 return packFloat64(zSign ^ signflip, 0, 0);
4277 }
4278 /* Exact zero plus a denorm */
4279 if (STATUS(flush_to_zero)) {
ff32e16e 4280 float_raise(float_flag_output_denormal, status);
369be8f6
PM
4281 return packFloat64(cSign ^ signflip, 0, 0);
4282 }
4283 }
4284 /* Zero plus something non-zero : just return the something */
67d43538
PM
4285 if (flags & float_muladd_halve_result) {
4286 if (cExp == 0) {
4287 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4288 }
4289 /* Subtract one to halve, and one again because roundAndPackFloat64
4290 * wants one less than the true exponent.
4291 */
4292 cExp -= 2;
4293 cSig = (cSig | 0x0010000000000000ULL) << 10;
ff32e16e 4294 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
67d43538 4295 }
a6e7c184 4296 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
4297 }
4298
4299 if (aExp == 0) {
4300 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4301 }
4302 if (bExp == 0) {
4303 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4304 }
4305
4306 /* Calculate the actual result a * b + c */
4307
4308 /* Multiply first; this is easy. */
4309 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4310 * because we want the true exponent, not the "one-less-than"
4311 * flavour that roundAndPackFloat64() takes.
4312 */
4313 pExp = aExp + bExp - 0x3fe;
4314 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4315 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4316 mul64To128(aSig, bSig, &pSig0, &pSig1);
4317 if ((int64_t)(pSig0 << 1) >= 0) {
4318 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4319 pExp--;
4320 }
4321
4322 zSign = pSign ^ signflip;
4323
4324 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4325 * bit in position 126.
4326 */
4327 if (cExp == 0) {
4328 if (!cSig) {
4329 /* Throw out the special case of c being an exact zero now */
4330 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
67d43538
PM
4331 if (flags & float_muladd_halve_result) {
4332 pExp--;
4333 }
369be8f6 4334 return roundAndPackFloat64(zSign, pExp - 1,
ff32e16e 4335 pSig1, status);
369be8f6
PM
4336 }
4337 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4338 }
4339
4340 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4341 * significand of the addend, with the explicit bit in position 126.
4342 */
4343 cSig0 = cSig << (126 - 64 - 52);
4344 cSig1 = 0;
4345 cSig0 |= LIT64(0x4000000000000000);
4346 expDiff = pExp - cExp;
4347
4348 if (pSign == cSign) {
4349 /* Addition */
4350 if (expDiff > 0) {
4351 /* scale c to match p */
4352 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4353 zExp = pExp;
4354 } else if (expDiff < 0) {
4355 /* scale p to match c */
4356 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4357 zExp = cExp;
4358 } else {
4359 /* no scaling needed */
4360 zExp = cExp;
4361 }
4362 /* Add significands and make sure explicit bit ends up in posn 126 */
4363 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4364 if ((int64_t)zSig0 < 0) {
4365 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4366 } else {
4367 zExp--;
4368 }
4369 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
67d43538
PM
4370 if (flags & float_muladd_halve_result) {
4371 zExp--;
4372 }
ff32e16e 4373 return roundAndPackFloat64(zSign, zExp, zSig1, status);
369be8f6
PM
4374 } else {
4375 /* Subtraction */
4376 if (expDiff > 0) {
4377 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4378 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4379 zExp = pExp;
4380 } else if (expDiff < 0) {
4381 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4382 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4383 zExp = cExp;
4384 zSign ^= 1;
4385 } else {
4386 zExp = pExp;
4387 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4388 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4389 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4390 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4391 zSign ^= 1;
4392 } else {
4393 /* Exact zero */
4394 zSign = signflip;
4395 if (STATUS(float_rounding_mode) == float_round_down) {
4396 zSign ^= 1;
4397 }
4398 return packFloat64(zSign, 0, 0);
4399 }
4400 }
4401 --zExp;
4402 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4403 * starting with the significand in a pair of uint64_t.
4404 */
4405 if (zSig0) {
4406 shiftcount = countLeadingZeros64(zSig0) - 1;
4407 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4408 if (zSig1) {
4409 zSig0 |= 1;
4410 }
4411 zExp -= shiftcount;
4412 } else {
e3d142d0
PM
4413 shiftcount = countLeadingZeros64(zSig1);
4414 if (shiftcount == 0) {
4415 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4416 zExp -= 63;
4417 } else {
4418 shiftcount--;
4419 zSig0 = zSig1 << shiftcount;
4420 zExp -= (shiftcount + 64);
4421 }
369be8f6 4422 }
67d43538
PM
4423 if (flags & float_muladd_halve_result) {
4424 zExp--;
4425 }
ff32e16e 4426 return roundAndPackFloat64(zSign, zExp, zSig0, status);
369be8f6
PM
4427 }
4428}
4429
158142c2
FB
4430/*----------------------------------------------------------------------------
4431| Returns the square root of the double-precision floating-point value `a'.
4432| The operation is performed according to the IEC/IEEE Standard for Binary
4433| Floating-Point Arithmetic.
4434*----------------------------------------------------------------------------*/
4435
e5a41ffa 4436float64 float64_sqrt(float64 a, float_status *status)
158142c2
FB
4437{
4438 flag aSign;
94a49d86 4439 int_fast16_t aExp, zExp;
bb98fe42
AF
4440 uint64_t aSig, zSig, doubleZSig;
4441 uint64_t rem0, rem1, term0, term1;
ff32e16e 4442 a = float64_squash_input_denormal(a, status);
158142c2
FB
4443
4444 aSig = extractFloat64Frac( a );
4445 aExp = extractFloat64Exp( a );
4446 aSign = extractFloat64Sign( a );
4447 if ( aExp == 0x7FF ) {
ff32e16e
PM
4448 if (aSig) {
4449 return propagateFloat64NaN(a, a, status);
4450 }
158142c2 4451 if ( ! aSign ) return a;
ff32e16e 4452 float_raise(float_flag_invalid, status);
158142c2
FB
4453 return float64_default_nan;
4454 }
4455 if ( aSign ) {
4456 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 4457 float_raise(float_flag_invalid, status);
158142c2
FB
4458 return float64_default_nan;
4459 }
4460 if ( aExp == 0 ) {
f090c9d4 4461 if ( aSig == 0 ) return float64_zero;
158142c2
FB
4462 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4463 }
4464 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4465 aSig |= LIT64( 0x0010000000000000 );
4466 zSig = estimateSqrt32( aExp, aSig>>21 );
4467 aSig <<= 9 - ( aExp & 1 );
4468 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4469 if ( ( zSig & 0x1FF ) <= 5 ) {
4470 doubleZSig = zSig<<1;
4471 mul64To128( zSig, zSig, &term0, &term1 );
4472 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4473 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4474 --zSig;
4475 doubleZSig -= 2;
4476 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4477 }
4478 zSig |= ( ( rem0 | rem1 ) != 0 );
4479 }
ff32e16e 4480 return roundAndPackFloat64(0, zExp, zSig, status);
158142c2
FB
4481
4482}
4483
374dfc33
AJ
4484/*----------------------------------------------------------------------------
4485| Returns the binary log of the double-precision floating-point value `a'.
4486| The operation is performed according to the IEC/IEEE Standard for Binary
4487| Floating-Point Arithmetic.
4488*----------------------------------------------------------------------------*/
e5a41ffa 4489float64 float64_log2(float64 a, float_status *status)
374dfc33
AJ
4490{
4491 flag aSign, zSign;
94a49d86 4492 int_fast16_t aExp;
bb98fe42 4493 uint64_t aSig, aSig0, aSig1, zSig, i;
ff32e16e 4494 a = float64_squash_input_denormal(a, status);
374dfc33
AJ
4495
4496 aSig = extractFloat64Frac( a );
4497 aExp = extractFloat64Exp( a );
4498 aSign = extractFloat64Sign( a );
4499
4500 if ( aExp == 0 ) {
4501 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4502 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4503 }
4504 if ( aSign ) {
ff32e16e 4505 float_raise(float_flag_invalid, status);
374dfc33
AJ
4506 return float64_default_nan;
4507 }
4508 if ( aExp == 0x7FF ) {
ff32e16e
PM
4509 if (aSig) {
4510 return propagateFloat64NaN(a, float64_zero, status);
4511 }
374dfc33
AJ
4512 return a;
4513 }
4514
4515 aExp -= 0x3FF;
4516 aSig |= LIT64( 0x0010000000000000 );
4517 zSign = aExp < 0;
bb98fe42 4518 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4519 for (i = 1LL << 51; i > 0; i >>= 1) {
4520 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4521 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4522 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4523 aSig >>= 1;
4524 zSig |= i;
4525 }
4526 }
4527
4528 if ( zSign )
4529 zSig = -zSig;
ff32e16e 4530 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
374dfc33
AJ
4531}
4532
158142c2
FB
4533/*----------------------------------------------------------------------------
4534| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4535| corresponding value `b', and 0 otherwise. The invalid exception is raised
4536| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4537| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4538*----------------------------------------------------------------------------*/
4539
e5a41ffa 4540int float64_eq(float64 a, float64 b, float_status *status)
158142c2 4541{
bb98fe42 4542 uint64_t av, bv;
ff32e16e
PM
4543 a = float64_squash_input_denormal(a, status);
4544 b = float64_squash_input_denormal(b, status);
158142c2
FB
4545
4546 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4547 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4548 ) {
ff32e16e 4549 float_raise(float_flag_invalid, status);
158142c2
FB
4550 return 0;
4551 }
f090c9d4 4552 av = float64_val(a);
a1b91bb4 4553 bv = float64_val(b);
bb98fe42 4554 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4555
4556}
4557
4558/*----------------------------------------------------------------------------
4559| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4560| equal to the corresponding value `b', and 0 otherwise. The invalid
4561| exception is raised if either operand is a NaN. The comparison is performed
4562| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4563*----------------------------------------------------------------------------*/
4564
e5a41ffa 4565int float64_le(float64 a, float64 b, float_status *status)
158142c2
FB
4566{
4567 flag aSign, bSign;
bb98fe42 4568 uint64_t av, bv;
ff32e16e
PM
4569 a = float64_squash_input_denormal(a, status);
4570 b = float64_squash_input_denormal(b, status);
158142c2
FB
4571
4572 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4573 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4574 ) {
ff32e16e 4575 float_raise(float_flag_invalid, status);
158142c2
FB
4576 return 0;
4577 }
4578 aSign = extractFloat64Sign( a );
4579 bSign = extractFloat64Sign( b );
f090c9d4 4580 av = float64_val(a);
a1b91bb4 4581 bv = float64_val(b);
bb98fe42 4582 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4583 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4584
4585}
4586
4587/*----------------------------------------------------------------------------
4588| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4589| the corresponding value `b', and 0 otherwise. The invalid exception is
4590| raised if either operand is a NaN. The comparison is performed according
4591| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4592*----------------------------------------------------------------------------*/
4593
e5a41ffa 4594int float64_lt(float64 a, float64 b, float_status *status)
158142c2
FB
4595{
4596 flag aSign, bSign;
bb98fe42 4597 uint64_t av, bv;
158142c2 4598
ff32e16e
PM
4599 a = float64_squash_input_denormal(a, status);
4600 b = float64_squash_input_denormal(b, status);
158142c2
FB
4601 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4602 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4603 ) {
ff32e16e 4604 float_raise(float_flag_invalid, status);
158142c2
FB
4605 return 0;
4606 }
4607 aSign = extractFloat64Sign( a );
4608 bSign = extractFloat64Sign( b );
f090c9d4 4609 av = float64_val(a);
a1b91bb4 4610 bv = float64_val(b);
bb98fe42 4611 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4612 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4613
4614}
4615
67b7861d
AJ
4616/*----------------------------------------------------------------------------
4617| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4618| be compared, and 0 otherwise. The invalid exception is raised if either
4619| operand is a NaN. The comparison is performed according to the IEC/IEEE
4620| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4621*----------------------------------------------------------------------------*/
4622
e5a41ffa 4623int float64_unordered(float64 a, float64 b, float_status *status)
67b7861d 4624{
ff32e16e
PM
4625 a = float64_squash_input_denormal(a, status);
4626 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4627
4628 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4629 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4630 ) {
ff32e16e 4631 float_raise(float_flag_invalid, status);
67b7861d
AJ
4632 return 1;
4633 }
4634 return 0;
4635}
4636
158142c2
FB
4637/*----------------------------------------------------------------------------
4638| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4639| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4640| exception.The comparison is performed according to the IEC/IEEE Standard
4641| for Binary Floating-Point Arithmetic.
158142c2
FB
4642*----------------------------------------------------------------------------*/
4643
e5a41ffa 4644int float64_eq_quiet(float64 a, float64 b, float_status *status)
158142c2 4645{
bb98fe42 4646 uint64_t av, bv;
ff32e16e
PM
4647 a = float64_squash_input_denormal(a, status);
4648 b = float64_squash_input_denormal(b, status);
158142c2
FB
4649
4650 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4651 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4652 ) {
b689362d 4653 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4654 float_raise(float_flag_invalid, status);
b689362d 4655 }
158142c2
FB
4656 return 0;
4657 }
f090c9d4 4658 av = float64_val(a);
a1b91bb4 4659 bv = float64_val(b);
bb98fe42 4660 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4661
4662}
4663
4664/*----------------------------------------------------------------------------
4665| Returns 1 if the double-precision floating-point value `a' is less than or
4666| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4667| cause an exception. Otherwise, the comparison is performed according to the
4668| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4669*----------------------------------------------------------------------------*/
4670
e5a41ffa 4671int float64_le_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4672{
4673 flag aSign, bSign;
bb98fe42 4674 uint64_t av, bv;
ff32e16e
PM
4675 a = float64_squash_input_denormal(a, status);
4676 b = float64_squash_input_denormal(b, status);
158142c2
FB
4677
4678 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4679 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4680 ) {
4681 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4682 float_raise(float_flag_invalid, status);
158142c2
FB
4683 }
4684 return 0;
4685 }
4686 aSign = extractFloat64Sign( a );
4687 bSign = extractFloat64Sign( b );
f090c9d4 4688 av = float64_val(a);
a1b91bb4 4689 bv = float64_val(b);
bb98fe42 4690 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4691 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4692
4693}
4694
4695/*----------------------------------------------------------------------------
4696| Returns 1 if the double-precision floating-point value `a' is less than
4697| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4698| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4699| Standard for Binary Floating-Point Arithmetic.
4700*----------------------------------------------------------------------------*/
4701
e5a41ffa 4702int float64_lt_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4703{
4704 flag aSign, bSign;
bb98fe42 4705 uint64_t av, bv;
ff32e16e
PM
4706 a = float64_squash_input_denormal(a, status);
4707 b = float64_squash_input_denormal(b, status);
158142c2
FB
4708
4709 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4710 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4711 ) {
4712 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4713 float_raise(float_flag_invalid, status);
158142c2
FB
4714 }
4715 return 0;
4716 }
4717 aSign = extractFloat64Sign( a );
4718 bSign = extractFloat64Sign( b );
f090c9d4 4719 av = float64_val(a);
a1b91bb4 4720 bv = float64_val(b);
bb98fe42 4721 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4722 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4723
4724}
4725
67b7861d
AJ
4726/*----------------------------------------------------------------------------
4727| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4728| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4729| comparison is performed according to the IEC/IEEE Standard for Binary
4730| Floating-Point Arithmetic.
4731*----------------------------------------------------------------------------*/
4732
e5a41ffa 4733int float64_unordered_quiet(float64 a, float64 b, float_status *status)
67b7861d 4734{
ff32e16e
PM
4735 a = float64_squash_input_denormal(a, status);
4736 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4737
4738 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4739 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4740 ) {
4741 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4742 float_raise(float_flag_invalid, status);
67b7861d
AJ
4743 }
4744 return 1;
4745 }
4746 return 0;
4747}
4748
158142c2
FB
4749/*----------------------------------------------------------------------------
4750| Returns the result of converting the extended double-precision floating-
4751| point value `a' to the 32-bit two's complement integer format. The
4752| conversion is performed according to the IEC/IEEE Standard for Binary
4753| Floating-Point Arithmetic---which means in particular that the conversion
4754| is rounded according to the current rounding mode. If `a' is a NaN, the
4755| largest positive integer is returned. Otherwise, if the conversion
4756| overflows, the largest integer with the same sign as `a' is returned.
4757*----------------------------------------------------------------------------*/
4758
e5a41ffa 4759int32 floatx80_to_int32(floatx80 a, float_status *status)
158142c2
FB
4760{
4761 flag aSign;
4762 int32 aExp, shiftCount;
bb98fe42 4763 uint64_t aSig;
158142c2
FB
4764
4765 aSig = extractFloatx80Frac( a );
4766 aExp = extractFloatx80Exp( a );
4767 aSign = extractFloatx80Sign( a );
bb98fe42 4768 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4769 shiftCount = 0x4037 - aExp;
4770 if ( shiftCount <= 0 ) shiftCount = 1;
4771 shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 4772 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
4773
4774}
4775
4776/*----------------------------------------------------------------------------
4777| Returns the result of converting the extended double-precision floating-
4778| point value `a' to the 32-bit two's complement integer format. The
4779| conversion is performed according to the IEC/IEEE Standard for Binary
4780| Floating-Point Arithmetic, except that the conversion is always rounded
4781| toward zero. If `a' is a NaN, the largest positive integer is returned.
4782| Otherwise, if the conversion overflows, the largest integer with the same
4783| sign as `a' is returned.
4784*----------------------------------------------------------------------------*/
4785
e5a41ffa 4786int32 floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4787{
4788 flag aSign;
4789 int32 aExp, shiftCount;
bb98fe42 4790 uint64_t aSig, savedASig;
b3a6a2e0 4791 int32_t z;
158142c2
FB
4792
4793 aSig = extractFloatx80Frac( a );
4794 aExp = extractFloatx80Exp( a );
4795 aSign = extractFloatx80Sign( a );
4796 if ( 0x401E < aExp ) {
bb98fe42 4797 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4798 goto invalid;
4799 }
4800 else if ( aExp < 0x3FFF ) {
4801 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4802 return 0;
4803 }
4804 shiftCount = 0x403E - aExp;
4805 savedASig = aSig;
4806 aSig >>= shiftCount;
4807 z = aSig;
4808 if ( aSign ) z = - z;
4809 if ( ( z < 0 ) ^ aSign ) {
4810 invalid:
ff32e16e 4811 float_raise(float_flag_invalid, status);
bb98fe42 4812 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4813 }
4814 if ( ( aSig<<shiftCount ) != savedASig ) {
4815 STATUS(float_exception_flags) |= float_flag_inexact;
4816 }
4817 return z;
4818
4819}
4820
4821/*----------------------------------------------------------------------------
4822| Returns the result of converting the extended double-precision floating-
4823| point value `a' to the 64-bit two's complement integer format. The
4824| conversion is performed according to the IEC/IEEE Standard for Binary
4825| Floating-Point Arithmetic---which means in particular that the conversion
4826| is rounded according to the current rounding mode. If `a' is a NaN,
4827| the largest positive integer is returned. Otherwise, if the conversion
4828| overflows, the largest integer with the same sign as `a' is returned.
4829*----------------------------------------------------------------------------*/
4830
e5a41ffa 4831int64 floatx80_to_int64(floatx80 a, float_status *status)
158142c2
FB
4832{
4833 flag aSign;
4834 int32 aExp, shiftCount;
bb98fe42 4835 uint64_t aSig, aSigExtra;
158142c2
FB
4836
4837 aSig = extractFloatx80Frac( a );
4838 aExp = extractFloatx80Exp( a );
4839 aSign = extractFloatx80Sign( a );
4840 shiftCount = 0x403E - aExp;
4841 if ( shiftCount <= 0 ) {
4842 if ( shiftCount ) {
ff32e16e 4843 float_raise(float_flag_invalid, status);
158142c2
FB
4844 if ( ! aSign
4845 || ( ( aExp == 0x7FFF )
4846 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4847 ) {
4848 return LIT64( 0x7FFFFFFFFFFFFFFF );
4849 }
bb98fe42 4850 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4851 }
4852 aSigExtra = 0;
4853 }
4854 else {
4855 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4856 }
ff32e16e 4857 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
4858
4859}
4860
4861/*----------------------------------------------------------------------------
4862| Returns the result of converting the extended double-precision floating-
4863| point value `a' to the 64-bit two's complement integer format. The
4864| conversion is performed according to the IEC/IEEE Standard for Binary
4865| Floating-Point Arithmetic, except that the conversion is always rounded
4866| toward zero. If `a' is a NaN, the largest positive integer is returned.
4867| Otherwise, if the conversion overflows, the largest integer with the same
4868| sign as `a' is returned.
4869*----------------------------------------------------------------------------*/
4870
e5a41ffa 4871int64 floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4872{
4873 flag aSign;
4874 int32 aExp, shiftCount;
bb98fe42 4875 uint64_t aSig;
158142c2
FB
4876 int64 z;
4877
4878 aSig = extractFloatx80Frac( a );
4879 aExp = extractFloatx80Exp( a );
4880 aSign = extractFloatx80Sign( a );
4881 shiftCount = aExp - 0x403E;
4882 if ( 0 <= shiftCount ) {
4883 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4884 if ( ( a.high != 0xC03E ) || aSig ) {
ff32e16e 4885 float_raise(float_flag_invalid, status);
158142c2
FB
4886 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4887 return LIT64( 0x7FFFFFFFFFFFFFFF );
4888 }
4889 }
bb98fe42 4890 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4891 }
4892 else if ( aExp < 0x3FFF ) {
4893 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4894 return 0;
4895 }
4896 z = aSig>>( - shiftCount );
bb98fe42 4897 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
4898 STATUS(float_exception_flags) |= float_flag_inexact;
4899 }
4900 if ( aSign ) z = - z;
4901 return z;
4902
4903}
4904
4905/*----------------------------------------------------------------------------
4906| Returns the result of converting the extended double-precision floating-
4907| point value `a' to the single-precision floating-point format. The
4908| conversion is performed according to the IEC/IEEE Standard for Binary
4909| Floating-Point Arithmetic.
4910*----------------------------------------------------------------------------*/
4911
e5a41ffa 4912float32 floatx80_to_float32(floatx80 a, float_status *status)
158142c2
FB
4913{
4914 flag aSign;
4915 int32 aExp;
bb98fe42 4916 uint64_t aSig;
158142c2
FB
4917
4918 aSig = extractFloatx80Frac( a );
4919 aExp = extractFloatx80Exp( a );
4920 aSign = extractFloatx80Sign( a );
4921 if ( aExp == 0x7FFF ) {
bb98fe42 4922 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4923 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4924 }
4925 return packFloat32( aSign, 0xFF, 0 );
4926 }
4927 shift64RightJamming( aSig, 33, &aSig );
4928 if ( aExp || aSig ) aExp -= 0x3F81;
ff32e16e 4929 return roundAndPackFloat32(aSign, aExp, aSig, status);
158142c2
FB
4930
4931}
4932
4933/*----------------------------------------------------------------------------
4934| Returns the result of converting the extended double-precision floating-
4935| point value `a' to the double-precision floating-point format. The
4936| conversion is performed according to the IEC/IEEE Standard for Binary
4937| Floating-Point Arithmetic.
4938*----------------------------------------------------------------------------*/
4939
e5a41ffa 4940float64 floatx80_to_float64(floatx80 a, float_status *status)
158142c2
FB
4941{
4942 flag aSign;
4943 int32 aExp;
bb98fe42 4944 uint64_t aSig, zSig;
158142c2
FB
4945
4946 aSig = extractFloatx80Frac( a );
4947 aExp = extractFloatx80Exp( a );
4948 aSign = extractFloatx80Sign( a );
4949 if ( aExp == 0x7FFF ) {
bb98fe42 4950 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4951 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4952 }
4953 return packFloat64( aSign, 0x7FF, 0 );
4954 }
4955 shift64RightJamming( aSig, 1, &zSig );
4956 if ( aExp || aSig ) aExp -= 0x3C01;
ff32e16e 4957 return roundAndPackFloat64(aSign, aExp, zSig, status);
158142c2
FB
4958
4959}
4960
158142c2
FB
4961/*----------------------------------------------------------------------------
4962| Returns the result of converting the extended double-precision floating-
4963| point value `a' to the quadruple-precision floating-point format. The
4964| conversion is performed according to the IEC/IEEE Standard for Binary
4965| Floating-Point Arithmetic.
4966*----------------------------------------------------------------------------*/
4967
e5a41ffa 4968float128 floatx80_to_float128(floatx80 a, float_status *status)
158142c2
FB
4969{
4970 flag aSign;
94a49d86 4971 int_fast16_t aExp;
bb98fe42 4972 uint64_t aSig, zSig0, zSig1;
158142c2
FB
4973
4974 aSig = extractFloatx80Frac( a );
4975 aExp = extractFloatx80Exp( a );
4976 aSign = extractFloatx80Sign( a );
bb98fe42 4977 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4978 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4979 }
4980 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4981 return packFloat128( aSign, aExp, zSig0, zSig1 );
4982
4983}
4984
158142c2
FB
4985/*----------------------------------------------------------------------------
4986| Rounds the extended double-precision floating-point value `a' to an integer,
4987| and returns the result as an extended double-precision floating-point
4988| value. The operation is performed according to the IEC/IEEE Standard for
4989| Binary Floating-Point Arithmetic.
4990*----------------------------------------------------------------------------*/
4991
e5a41ffa 4992floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
158142c2
FB
4993{
4994 flag aSign;
4995 int32 aExp;
bb98fe42 4996 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4997 floatx80 z;
4998
4999 aExp = extractFloatx80Exp( a );
5000 if ( 0x403E <= aExp ) {
bb98fe42 5001 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
ff32e16e 5002 return propagateFloatx80NaN(a, a, status);
158142c2
FB
5003 }
5004 return a;
5005 }
5006 if ( aExp < 0x3FFF ) {
5007 if ( ( aExp == 0 )
bb98fe42 5008 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
5009 return a;
5010 }
5011 STATUS(float_exception_flags) |= float_flag_inexact;
5012 aSign = extractFloatx80Sign( a );
5013 switch ( STATUS(float_rounding_mode) ) {
5014 case float_round_nearest_even:
bb98fe42 5015 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
5016 ) {
5017 return
5018 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5019 }
5020 break;
f9288a76
PM
5021 case float_round_ties_away:
5022 if (aExp == 0x3FFE) {
5023 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5024 }
5025 break;
158142c2
FB
5026 case float_round_down:
5027 return
5028 aSign ?
5029 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5030 : packFloatx80( 0, 0, 0 );
5031 case float_round_up:
5032 return
5033 aSign ? packFloatx80( 1, 0, 0 )
5034 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5035 }
5036 return packFloatx80( aSign, 0, 0 );
5037 }
5038 lastBitMask = 1;
5039 lastBitMask <<= 0x403E - aExp;
5040 roundBitsMask = lastBitMask - 1;
5041 z = a;
dc355b76
PM
5042 switch (STATUS(float_rounding_mode)) {
5043 case float_round_nearest_even:
158142c2 5044 z.low += lastBitMask>>1;
dc355b76
PM
5045 if ((z.low & roundBitsMask) == 0) {
5046 z.low &= ~lastBitMask;
5047 }
5048 break;
f9288a76
PM
5049 case float_round_ties_away:
5050 z.low += lastBitMask >> 1;
5051 break;
dc355b76
PM
5052 case float_round_to_zero:
5053 break;
5054 case float_round_up:
5055 if (!extractFloatx80Sign(z)) {
5056 z.low += roundBitsMask;
5057 }
5058 break;
5059 case float_round_down:
5060 if (extractFloatx80Sign(z)) {
158142c2
FB
5061 z.low += roundBitsMask;
5062 }
dc355b76
PM
5063 break;
5064 default:
5065 abort();
158142c2
FB
5066 }
5067 z.low &= ~ roundBitsMask;
5068 if ( z.low == 0 ) {
5069 ++z.high;
5070 z.low = LIT64( 0x8000000000000000 );
5071 }
5072 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
5073 return z;
5074
5075}
5076
5077/*----------------------------------------------------------------------------
5078| Returns the result of adding the absolute values of the extended double-
5079| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5080| negated before being returned. `zSign' is ignored if the result is a NaN.
5081| The addition is performed according to the IEC/IEEE Standard for Binary
5082| Floating-Point Arithmetic.
5083*----------------------------------------------------------------------------*/
5084
e5a41ffa
PM
5085static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5086 float_status *status)
158142c2
FB
5087{
5088 int32 aExp, bExp, zExp;
bb98fe42 5089 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
5090 int32 expDiff;
5091
5092 aSig = extractFloatx80Frac( a );
5093 aExp = extractFloatx80Exp( a );
5094 bSig = extractFloatx80Frac( b );
5095 bExp = extractFloatx80Exp( b );
5096 expDiff = aExp - bExp;
5097 if ( 0 < expDiff ) {
5098 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5099 if ((uint64_t)(aSig << 1)) {
5100 return propagateFloatx80NaN(a, b, status);
5101 }
158142c2
FB
5102 return a;
5103 }
5104 if ( bExp == 0 ) --expDiff;
5105 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5106 zExp = aExp;
5107 }
5108 else if ( expDiff < 0 ) {
5109 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5110 if ((uint64_t)(bSig << 1)) {
5111 return propagateFloatx80NaN(a, b, status);
5112 }
158142c2
FB
5113 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5114 }
5115 if ( aExp == 0 ) ++expDiff;
5116 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5117 zExp = bExp;
5118 }
5119 else {
5120 if ( aExp == 0x7FFF ) {
bb98fe42 5121 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5122 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5123 }
5124 return a;
5125 }
5126 zSig1 = 0;
5127 zSig0 = aSig + bSig;
5128 if ( aExp == 0 ) {
5129 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5130 goto roundAndPack;
5131 }
5132 zExp = aExp;
5133 goto shiftRight1;
5134 }
5135 zSig0 = aSig + bSig;
bb98fe42 5136 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
5137 shiftRight1:
5138 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5139 zSig0 |= LIT64( 0x8000000000000000 );
5140 ++zExp;
5141 roundAndPack:
ff32e16e
PM
5142 return roundAndPackFloatx80(STATUS(floatx80_rounding_precision),
5143 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5144}
5145
5146/*----------------------------------------------------------------------------
5147| Returns the result of subtracting the absolute values of the extended
5148| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5149| difference is negated before being returned. `zSign' is ignored if the
5150| result is a NaN. The subtraction is performed according to the IEC/IEEE
5151| Standard for Binary Floating-Point Arithmetic.
5152*----------------------------------------------------------------------------*/
5153
e5a41ffa
PM
5154static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5155 float_status *status)
158142c2
FB
5156{
5157 int32 aExp, bExp, zExp;
bb98fe42 5158 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
5159 int32 expDiff;
5160 floatx80 z;
5161
5162 aSig = extractFloatx80Frac( a );
5163 aExp = extractFloatx80Exp( a );
5164 bSig = extractFloatx80Frac( b );
5165 bExp = extractFloatx80Exp( b );
5166 expDiff = aExp - bExp;
5167 if ( 0 < expDiff ) goto aExpBigger;
5168 if ( expDiff < 0 ) goto bExpBigger;
5169 if ( aExp == 0x7FFF ) {
bb98fe42 5170 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5171 return propagateFloatx80NaN(a, b, status);
158142c2 5172 }
ff32e16e 5173 float_raise(float_flag_invalid, status);
158142c2
FB
5174 z.low = floatx80_default_nan_low;
5175 z.high = floatx80_default_nan_high;
5176 return z;
5177 }
5178 if ( aExp == 0 ) {
5179 aExp = 1;
5180 bExp = 1;
5181 }
5182 zSig1 = 0;
5183 if ( bSig < aSig ) goto aBigger;
5184 if ( aSig < bSig ) goto bBigger;
5185 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
5186 bExpBigger:
5187 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5188 if ((uint64_t)(bSig << 1)) {
5189 return propagateFloatx80NaN(a, b, status);
5190 }
158142c2
FB
5191 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5192 }
5193 if ( aExp == 0 ) ++expDiff;
5194 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5195 bBigger:
5196 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5197 zExp = bExp;
5198 zSign ^= 1;
5199 goto normalizeRoundAndPack;
5200 aExpBigger:
5201 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5202 if ((uint64_t)(aSig << 1)) {
5203 return propagateFloatx80NaN(a, b, status);
5204 }
158142c2
FB
5205 return a;
5206 }
5207 if ( bExp == 0 ) --expDiff;
5208 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5209 aBigger:
5210 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5211 zExp = aExp;
5212 normalizeRoundAndPack:
ff32e16e
PM
5213 return normalizeRoundAndPackFloatx80(STATUS(floatx80_rounding_precision),
5214 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5215}
5216
5217/*----------------------------------------------------------------------------
5218| Returns the result of adding the extended double-precision floating-point
5219| values `a' and `b'. The operation is performed according to the IEC/IEEE
5220| Standard for Binary Floating-Point Arithmetic.
5221*----------------------------------------------------------------------------*/
5222
e5a41ffa 5223floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5224{
5225 flag aSign, bSign;
5226
5227 aSign = extractFloatx80Sign( a );
5228 bSign = extractFloatx80Sign( b );
5229 if ( aSign == bSign ) {
ff32e16e 5230 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5231 }
5232 else {
ff32e16e 5233 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5234 }
5235
5236}
5237
5238/*----------------------------------------------------------------------------
5239| Returns the result of subtracting the extended double-precision floating-
5240| point values `a' and `b'. The operation is performed according to the
5241| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5242*----------------------------------------------------------------------------*/
5243
e5a41ffa 5244floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5245{
5246 flag aSign, bSign;
5247
5248 aSign = extractFloatx80Sign( a );
5249 bSign = extractFloatx80Sign( b );
5250 if ( aSign == bSign ) {
ff32e16e 5251 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5252 }
5253 else {
ff32e16e 5254 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5255 }
5256
5257}
5258
5259/*----------------------------------------------------------------------------
5260| Returns the result of multiplying the extended double-precision floating-
5261| point values `a' and `b'. The operation is performed according to the
5262| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5263*----------------------------------------------------------------------------*/
5264
e5a41ffa 5265floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5266{
5267 flag aSign, bSign, zSign;
5268 int32 aExp, bExp, zExp;
bb98fe42 5269 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
5270 floatx80 z;
5271
5272 aSig = extractFloatx80Frac( a );
5273 aExp = extractFloatx80Exp( a );
5274 aSign = extractFloatx80Sign( a );
5275 bSig = extractFloatx80Frac( b );
5276 bExp = extractFloatx80Exp( b );
5277 bSign = extractFloatx80Sign( b );
5278 zSign = aSign ^ bSign;
5279 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5280 if ( (uint64_t) ( aSig<<1 )
5281 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5282 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5283 }
5284 if ( ( bExp | bSig ) == 0 ) goto invalid;
5285 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5286 }
5287 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5288 if ((uint64_t)(bSig << 1)) {
5289 return propagateFloatx80NaN(a, b, status);
5290 }
158142c2
FB
5291 if ( ( aExp | aSig ) == 0 ) {
5292 invalid:
ff32e16e 5293 float_raise(float_flag_invalid, status);
158142c2
FB
5294 z.low = floatx80_default_nan_low;
5295 z.high = floatx80_default_nan_high;
5296 return z;
5297 }
5298 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5299 }
5300 if ( aExp == 0 ) {
5301 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5302 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5303 }
5304 if ( bExp == 0 ) {
5305 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5306 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5307 }
5308 zExp = aExp + bExp - 0x3FFE;
5309 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 5310 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
5311 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5312 --zExp;
5313 }
ff32e16e
PM
5314 return roundAndPackFloatx80(STATUS(floatx80_rounding_precision),
5315 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5316}
5317
5318/*----------------------------------------------------------------------------
5319| Returns the result of dividing the extended double-precision floating-point
5320| value `a' by the corresponding value `b'. The operation is performed
5321| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5322*----------------------------------------------------------------------------*/
5323
e5a41ffa 5324floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5325{
5326 flag aSign, bSign, zSign;
5327 int32 aExp, bExp, zExp;
bb98fe42
AF
5328 uint64_t aSig, bSig, zSig0, zSig1;
5329 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
5330 floatx80 z;
5331
5332 aSig = extractFloatx80Frac( a );
5333 aExp = extractFloatx80Exp( a );
5334 aSign = extractFloatx80Sign( a );
5335 bSig = extractFloatx80Frac( b );
5336 bExp = extractFloatx80Exp( b );
5337 bSign = extractFloatx80Sign( b );
5338 zSign = aSign ^ bSign;
5339 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5340 if ((uint64_t)(aSig << 1)) {
5341 return propagateFloatx80NaN(a, b, status);
5342 }
158142c2 5343 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5344 if ((uint64_t)(bSig << 1)) {
5345 return propagateFloatx80NaN(a, b, status);
5346 }
158142c2
FB
5347 goto invalid;
5348 }
5349 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5350 }
5351 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5352 if ((uint64_t)(bSig << 1)) {
5353 return propagateFloatx80NaN(a, b, status);
5354 }
158142c2
FB
5355 return packFloatx80( zSign, 0, 0 );
5356 }
5357 if ( bExp == 0 ) {
5358 if ( bSig == 0 ) {
5359 if ( ( aExp | aSig ) == 0 ) {
5360 invalid:
ff32e16e 5361 float_raise(float_flag_invalid, status);
158142c2
FB
5362 z.low = floatx80_default_nan_low;
5363 z.high = floatx80_default_nan_high;
5364 return z;
5365 }
ff32e16e 5366 float_raise(float_flag_divbyzero, status);
158142c2
FB
5367 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5368 }
5369 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5370 }
5371 if ( aExp == 0 ) {
5372 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5373 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5374 }
5375 zExp = aExp - bExp + 0x3FFE;
5376 rem1 = 0;
5377 if ( bSig <= aSig ) {
5378 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5379 ++zExp;
5380 }
5381 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5382 mul64To128( bSig, zSig0, &term0, &term1 );
5383 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 5384 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5385 --zSig0;
5386 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5387 }
5388 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 5389 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
5390 mul64To128( bSig, zSig1, &term1, &term2 );
5391 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 5392 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5393 --zSig1;
5394 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5395 }
5396 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5397 }
ff32e16e
PM
5398 return roundAndPackFloatx80(STATUS(floatx80_rounding_precision),
5399 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5400}
5401
5402/*----------------------------------------------------------------------------
5403| Returns the remainder of the extended double-precision floating-point value
5404| `a' with respect to the corresponding value `b'. The operation is performed
5405| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5406*----------------------------------------------------------------------------*/
5407
e5a41ffa 5408floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
158142c2 5409{
ed086f3d 5410 flag aSign, zSign;
158142c2 5411 int32 aExp, bExp, expDiff;
bb98fe42
AF
5412 uint64_t aSig0, aSig1, bSig;
5413 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
5414 floatx80 z;
5415
5416 aSig0 = extractFloatx80Frac( a );
5417 aExp = extractFloatx80Exp( a );
5418 aSign = extractFloatx80Sign( a );
5419 bSig = extractFloatx80Frac( b );
5420 bExp = extractFloatx80Exp( b );
158142c2 5421 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5422 if ( (uint64_t) ( aSig0<<1 )
5423 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5424 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5425 }
5426 goto invalid;
5427 }
5428 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5429 if ((uint64_t)(bSig << 1)) {
5430 return propagateFloatx80NaN(a, b, status);
5431 }
158142c2
FB
5432 return a;
5433 }
5434 if ( bExp == 0 ) {
5435 if ( bSig == 0 ) {
5436 invalid:
ff32e16e 5437 float_raise(float_flag_invalid, status);
158142c2
FB
5438 z.low = floatx80_default_nan_low;
5439 z.high = floatx80_default_nan_high;
5440 return z;
5441 }
5442 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5443 }
5444 if ( aExp == 0 ) {
bb98fe42 5445 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5446 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5447 }
5448 bSig |= LIT64( 0x8000000000000000 );
5449 zSign = aSign;
5450 expDiff = aExp - bExp;
5451 aSig1 = 0;
5452 if ( expDiff < 0 ) {
5453 if ( expDiff < -1 ) return a;
5454 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5455 expDiff = 0;
5456 }
5457 q = ( bSig <= aSig0 );
5458 if ( q ) aSig0 -= bSig;
5459 expDiff -= 64;
5460 while ( 0 < expDiff ) {
5461 q = estimateDiv128To64( aSig0, aSig1, bSig );
5462 q = ( 2 < q ) ? q - 2 : 0;
5463 mul64To128( bSig, q, &term0, &term1 );
5464 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5465 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5466 expDiff -= 62;
5467 }
5468 expDiff += 64;
5469 if ( 0 < expDiff ) {
5470 q = estimateDiv128To64( aSig0, aSig1, bSig );
5471 q = ( 2 < q ) ? q - 2 : 0;
5472 q >>= 64 - expDiff;
5473 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5474 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5475 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5476 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5477 ++q;
5478 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5479 }
5480 }
5481 else {
5482 term1 = 0;
5483 term0 = bSig;
5484 }
5485 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5486 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5487 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5488 && ( q & 1 ) )
5489 ) {
5490 aSig0 = alternateASig0;
5491 aSig1 = alternateASig1;
5492 zSign = ! zSign;
5493 }
5494 return
5495 normalizeRoundAndPackFloatx80(
ff32e16e 5496 80, zSign, bExp + expDiff, aSig0, aSig1, status);
158142c2
FB
5497
5498}
5499
5500/*----------------------------------------------------------------------------
5501| Returns the square root of the extended double-precision floating-point
5502| value `a'. The operation is performed according to the IEC/IEEE Standard
5503| for Binary Floating-Point Arithmetic.
5504*----------------------------------------------------------------------------*/
5505
e5a41ffa 5506floatx80 floatx80_sqrt(floatx80 a, float_status *status)
158142c2
FB
5507{
5508 flag aSign;
5509 int32 aExp, zExp;
bb98fe42
AF
5510 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5511 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5512 floatx80 z;
5513
5514 aSig0 = extractFloatx80Frac( a );
5515 aExp = extractFloatx80Exp( a );
5516 aSign = extractFloatx80Sign( a );
5517 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5518 if ((uint64_t)(aSig0 << 1)) {
5519 return propagateFloatx80NaN(a, a, status);
5520 }
158142c2
FB
5521 if ( ! aSign ) return a;
5522 goto invalid;
5523 }
5524 if ( aSign ) {
5525 if ( ( aExp | aSig0 ) == 0 ) return a;
5526 invalid:
ff32e16e 5527 float_raise(float_flag_invalid, status);
158142c2
FB
5528 z.low = floatx80_default_nan_low;
5529 z.high = floatx80_default_nan_high;
5530 return z;
5531 }
5532 if ( aExp == 0 ) {
5533 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5534 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5535 }
5536 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5537 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5538 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5539 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5540 doubleZSig0 = zSig0<<1;
5541 mul64To128( zSig0, zSig0, &term0, &term1 );
5542 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5543 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5544 --zSig0;
5545 doubleZSig0 -= 2;
5546 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5547 }
5548 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5549 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5550 if ( zSig1 == 0 ) zSig1 = 1;
5551 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5552 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5553 mul64To128( zSig1, zSig1, &term2, &term3 );
5554 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5555 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5556 --zSig1;
5557 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5558 term3 |= 1;
5559 term2 |= doubleZSig0;
5560 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5561 }
5562 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5563 }
5564 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5565 zSig0 |= doubleZSig0;
5566 return
5567 roundAndPackFloatx80(
ff32e16e 5568 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1, status);
158142c2
FB
5569
5570}
5571
5572/*----------------------------------------------------------------------------
b689362d
AJ
5573| Returns 1 if the extended double-precision floating-point value `a' is equal
5574| to the corresponding value `b', and 0 otherwise. The invalid exception is
5575| raised if either operand is a NaN. Otherwise, the comparison is performed
5576| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5577*----------------------------------------------------------------------------*/
5578
e5a41ffa 5579int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5580{
5581
5582 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5583 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5584 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5585 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5586 ) {
ff32e16e 5587 float_raise(float_flag_invalid, status);
158142c2
FB
5588 return 0;
5589 }
5590 return
5591 ( a.low == b.low )
5592 && ( ( a.high == b.high )
5593 || ( ( a.low == 0 )
bb98fe42 5594 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5595 );
5596
5597}
5598
5599/*----------------------------------------------------------------------------
5600| Returns 1 if the extended double-precision floating-point value `a' is
5601| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5602| invalid exception is raised if either operand is a NaN. The comparison is
5603| performed according to the IEC/IEEE Standard for Binary Floating-Point
5604| Arithmetic.
158142c2
FB
5605*----------------------------------------------------------------------------*/
5606
e5a41ffa 5607int floatx80_le(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5608{
5609 flag aSign, bSign;
5610
5611 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5612 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5613 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5614 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5615 ) {
ff32e16e 5616 float_raise(float_flag_invalid, status);
158142c2
FB
5617 return 0;
5618 }
5619 aSign = extractFloatx80Sign( a );
5620 bSign = extractFloatx80Sign( b );
5621 if ( aSign != bSign ) {
5622 return
5623 aSign
bb98fe42 5624 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5625 == 0 );
5626 }
5627 return
5628 aSign ? le128( b.high, b.low, a.high, a.low )
5629 : le128( a.high, a.low, b.high, b.low );
5630
5631}
5632
5633/*----------------------------------------------------------------------------
5634| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5635| less than the corresponding value `b', and 0 otherwise. The invalid
5636| exception is raised if either operand is a NaN. The comparison is performed
5637| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5638*----------------------------------------------------------------------------*/
5639
e5a41ffa 5640int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5641{
5642 flag aSign, bSign;
5643
5644 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5645 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5646 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5647 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5648 ) {
ff32e16e 5649 float_raise(float_flag_invalid, status);
158142c2
FB
5650 return 0;
5651 }
5652 aSign = extractFloatx80Sign( a );
5653 bSign = extractFloatx80Sign( b );
5654 if ( aSign != bSign ) {
5655 return
5656 aSign
bb98fe42 5657 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5658 != 0 );
5659 }
5660 return
5661 aSign ? lt128( b.high, b.low, a.high, a.low )
5662 : lt128( a.high, a.low, b.high, b.low );
5663
5664}
5665
67b7861d
AJ
5666/*----------------------------------------------------------------------------
5667| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5668| cannot be compared, and 0 otherwise. The invalid exception is raised if
5669| either operand is a NaN. The comparison is performed according to the
5670| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d 5671*----------------------------------------------------------------------------*/
e5a41ffa 5672int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
67b7861d
AJ
5673{
5674 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5675 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5676 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5677 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5678 ) {
ff32e16e 5679 float_raise(float_flag_invalid, status);
67b7861d
AJ
5680 return 1;
5681 }
5682 return 0;
5683}
5684
158142c2 5685/*----------------------------------------------------------------------------
b689362d 5686| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5687| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5688| cause an exception. The comparison is performed according to the IEC/IEEE
5689| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5690*----------------------------------------------------------------------------*/
5691
e5a41ffa 5692int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5693{
5694
5695 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5696 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5697 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5698 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5699 ) {
b689362d
AJ
5700 if ( floatx80_is_signaling_nan( a )
5701 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5702 float_raise(float_flag_invalid, status);
b689362d 5703 }
158142c2
FB
5704 return 0;
5705 }
5706 return
5707 ( a.low == b.low )
5708 && ( ( a.high == b.high )
5709 || ( ( a.low == 0 )
bb98fe42 5710 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5711 );
5712
5713}
5714
5715/*----------------------------------------------------------------------------
5716| Returns 1 if the extended double-precision floating-point value `a' is less
5717| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5718| do not cause an exception. Otherwise, the comparison is performed according
5719| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5720*----------------------------------------------------------------------------*/
5721
e5a41ffa 5722int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5723{
5724 flag aSign, bSign;
5725
5726 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5727 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5728 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5729 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5730 ) {
5731 if ( floatx80_is_signaling_nan( a )
5732 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5733 float_raise(float_flag_invalid, status);
158142c2
FB
5734 }
5735 return 0;
5736 }
5737 aSign = extractFloatx80Sign( a );
5738 bSign = extractFloatx80Sign( b );
5739 if ( aSign != bSign ) {
5740 return
5741 aSign
bb98fe42 5742 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5743 == 0 );
5744 }
5745 return
5746 aSign ? le128( b.high, b.low, a.high, a.low )
5747 : le128( a.high, a.low, b.high, b.low );
5748
5749}
5750
5751/*----------------------------------------------------------------------------
5752| Returns 1 if the extended double-precision floating-point value `a' is less
5753| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5754| an exception. Otherwise, the comparison is performed according to the
5755| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5756*----------------------------------------------------------------------------*/
5757
e5a41ffa 5758int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5759{
5760 flag aSign, bSign;
5761
5762 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5763 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5764 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5765 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5766 ) {
5767 if ( floatx80_is_signaling_nan( a )
5768 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5769 float_raise(float_flag_invalid, status);
158142c2
FB
5770 }
5771 return 0;
5772 }
5773 aSign = extractFloatx80Sign( a );
5774 bSign = extractFloatx80Sign( b );
5775 if ( aSign != bSign ) {
5776 return
5777 aSign
bb98fe42 5778 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5779 != 0 );
5780 }
5781 return
5782 aSign ? lt128( b.high, b.low, a.high, a.low )
5783 : lt128( a.high, a.low, b.high, b.low );
5784
5785}
5786
67b7861d
AJ
5787/*----------------------------------------------------------------------------
5788| Returns 1 if the extended double-precision floating-point values `a' and `b'
5789| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5790| The comparison is performed according to the IEC/IEEE Standard for Binary
5791| Floating-Point Arithmetic.
5792*----------------------------------------------------------------------------*/
e5a41ffa 5793int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
67b7861d
AJ
5794{
5795 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5796 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5797 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5798 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5799 ) {
5800 if ( floatx80_is_signaling_nan( a )
5801 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5802 float_raise(float_flag_invalid, status);
67b7861d
AJ
5803 }
5804 return 1;
5805 }
5806 return 0;
5807}
5808
158142c2
FB
5809/*----------------------------------------------------------------------------
5810| Returns the result of converting the quadruple-precision floating-point
5811| value `a' to the 32-bit two's complement integer format. The conversion
5812| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5813| Arithmetic---which means in particular that the conversion is rounded
5814| according to the current rounding mode. If `a' is a NaN, the largest
5815| positive integer is returned. Otherwise, if the conversion overflows, the
5816| largest integer with the same sign as `a' is returned.
5817*----------------------------------------------------------------------------*/
5818
e5a41ffa 5819int32 float128_to_int32(float128 a, float_status *status)
158142c2
FB
5820{
5821 flag aSign;
5822 int32 aExp, shiftCount;
bb98fe42 5823 uint64_t aSig0, aSig1;
158142c2
FB
5824
5825 aSig1 = extractFloat128Frac1( a );
5826 aSig0 = extractFloat128Frac0( a );
5827 aExp = extractFloat128Exp( a );
5828 aSign = extractFloat128Sign( a );
5829 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5830 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5831 aSig0 |= ( aSig1 != 0 );
5832 shiftCount = 0x4028 - aExp;
5833 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
ff32e16e 5834 return roundAndPackInt32(aSign, aSig0, status);
158142c2
FB
5835
5836}
5837
5838/*----------------------------------------------------------------------------
5839| Returns the result of converting the quadruple-precision floating-point
5840| value `a' to the 32-bit two's complement integer format. The conversion
5841| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5842| Arithmetic, except that the conversion is always rounded toward zero. If
5843| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5844| conversion overflows, the largest integer with the same sign as `a' is
5845| returned.
5846*----------------------------------------------------------------------------*/
5847
/*
 * Truncating (round-toward-zero) conversion of the float128 `a' to int32.
 * Sets float_flag_inexact in the status flags when nonzero bits are
 * discarded.  Raises float_flag_invalid for NaN or out-of-range input and
 * then returns 0x7FFFFFFF (NaN or positive) or 0x80000000 (negative).
 */
e5a41ffa 5848int32 float128_to_int32_round_to_zero(float128 a, float_status *status)
158142c2
FB
5849{
5850 flag aSign;
5851 int32 aExp, shiftCount;
bb98fe42 5852 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5853 int32_t z;
158142c2
FB
5854
5855 aSig1 = extractFloat128Frac1( a );
5856 aSig0 = extractFloat128Frac0( a );
5857 aExp = extractFloat128Exp( a );
5858 aSign = extractFloat128Sign( a );
/* Collapse the low fraction word into a sticky bit in bit 0 of aSig0. */
5859 aSig0 |= ( aSig1 != 0 );
/* Exponents above 0x401E (which includes 0x7FFF) go to the invalid path. */
5860 if ( 0x401E < aExp ) {
/* NaN: clear the sign so the positive maximum is returned. */
5861 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5862 goto invalid;
5863 }
/* Exponent below 0x3FFF: result is 0; inexact unless `a' is a zero. */
5864 else if ( aExp < 0x3FFF ) {
5865 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5866 return 0;
5867 }
/* Set the hidden bit (bit 48 of the high significand word). */
5868 aSig0 |= LIT64( 0x0001000000000000 );
/* Here 0x3FFF <= aExp <= 0x401E, so shiftCount lies in [17, 48]. */
5869 shiftCount = 0x402F - aExp;
5870 savedASig = aSig0;
5871 aSig0 >>= shiftCount;
/* Narrow to 32 bits, then apply the sign. */
5872 z = aSig0;
5873 if ( aSign ) z = - z;
/* A sign mismatch between z and the input is treated as overflow. */
5874 if ( ( z < 0 ) ^ aSign ) {
/* Also entered by the goto above for too-large exponents and NaNs. */
5875 invalid:
ff32e16e 5876 float_raise(float_flag_invalid, status);
bb98fe42 5877 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5878 }
/* Shifting back does not restore the significand: bits were discarded. */
5879 if ( ( aSig0<<shiftCount ) != savedASig ) {
5880 STATUS(float_exception_flags) |= float_flag_inexact;
5881 }
5882 return z;
5883
5884}
5885
5886/*----------------------------------------------------------------------------
5887| Returns the result of converting the quadruple-precision floating-point
5888| value `a' to the 64-bit two's complement integer format. The conversion
5889| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5890| Arithmetic---which means in particular that the conversion is rounded
5891| according to the current rounding mode. If `a' is a NaN, the largest
5892| positive integer is returned. Otherwise, if the conversion overflows, the
5893| largest integer with the same sign as `a' is returned.
5894*----------------------------------------------------------------------------*/
5895
e5a41ffa 5896int64 float128_to_int64(float128 a, float_status *status)
158142c2
FB
5897{
5898 flag aSign;
5899 int32 aExp, shiftCount;
bb98fe42 5900 uint64_t aSig0, aSig1;
158142c2
FB
5901
5902 aSig1 = extractFloat128Frac1( a );
5903 aSig0 = extractFloat128Frac0( a );
5904 aExp = extractFloat128Exp( a );
5905 aSign = extractFloat128Sign( a );
5906 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5907 shiftCount = 0x402F - aExp;
5908 if ( shiftCount <= 0 ) {
5909 if ( 0x403E < aExp ) {
ff32e16e 5910 float_raise(float_flag_invalid, status);
158142c2
FB
5911 if ( ! aSign
5912 || ( ( aExp == 0x7FFF )
5913 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5914 )
5915 ) {
5916 return LIT64( 0x7FFFFFFFFFFFFFFF );
5917 }
bb98fe42 5918 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5919 }
5920 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5921 }
5922 else {
5923 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5924 }
ff32e16e 5925 return roundAndPackInt64(aSign, aSig0, aSig1, status);
158142c2
FB
5926
5927}
5928
5929/*----------------------------------------------------------------------------
5930| Returns the result of converting the quadruple-precision floating-point
5931| value `a' to the 64-bit two's complement integer format. The conversion
5932| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5933| Arithmetic, except that the conversion is always rounded toward zero.
5934| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5935| the conversion overflows, the largest integer with the same sign as `a' is
5936| returned.
5937*----------------------------------------------------------------------------*/
5938
/*
 * Truncating (round-toward-zero) conversion of the float128 `a' to int64.
 * Sets float_flag_inexact when nonzero bits are discarded and raises
 * float_flag_invalid when the exponent is 0x403E or above, except for the
 * one input range that still truncates to 0x8000000000000000.
 */
e5a41ffa 5939int64 float128_to_int64_round_to_zero(float128 a, float_status *status)
158142c2
FB
5940{
5941 flag aSign;
5942 int32 aExp, shiftCount;
bb98fe42 5943 uint64_t aSig0, aSig1;
158142c2
FB
5944 int64 z;
5945
5946 aSig1 = extractFloat128Frac1( a );
5947 aSig0 = extractFloat128Frac0( a );
5948 aExp = extractFloat128Exp( a );
5949 aSign = extractFloat128Sign( a );
/* A nonzero exponent field means the hidden bit (bit 48) is present. */
5950 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
/* Positive: significand must move left; otherwise it moves right. */
5951 shiftCount = aExp - 0x402F;
5952 if ( 0 < shiftCount ) {
5953 if ( 0x403E <= aExp ) {
/* Strip the hidden bit again so the NaN test below sees fraction bits only. */
5954 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
/*
 * Sign set, exponent 0x403E, zero high fraction and a low word below
 * 0x0002000000000000: the result is 0x8000000000000000, inexact if any
 * low fraction bit is set.
 */
5955 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5956 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5957 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5958 }
5959 else {
ff32e16e 5960 float_raise(float_flag_invalid, status);
158142c2
FB
/* Positive values and NaNs return the largest positive integer. */
5961 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5962 return LIT64( 0x7FFFFFFFFFFFFFFF );
5963 }
5964 }
/* Reached by the exact/inexact case above and by negative overflow. */
bb98fe42 5965 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5966 }
/* Here 1 <= shiftCount <= 14; the top bits of aSig1 fill the vacated low bits. */
5967 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
/* Any bits of aSig1 left below the integer position make the result inexact. */
bb98fe42 5968 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
158142c2
FB
5969 STATUS(float_exception_flags) |= float_flag_inexact;
5970 }
5971 }
5972 else {
/* Exponent below 0x3FFF: result is 0; inexact unless `a' is a zero. */
5973 if ( aExp < 0x3FFF ) {
5974 if ( aExp | aSig0 | aSig1 ) {
5975 STATUS(float_exception_flags) |= float_flag_inexact;
5976 }
5977 return 0;
5978 }
/* shiftCount <= 0 here, so this is a right shift by its magnitude. */
5979 z = aSig0>>( - shiftCount );
/*
 * Inexact if the low word is nonzero or if bits of aSig0 were shifted out.
 * The shiftCount test avoids treating a zero shift as a discard.
 */
5980 if ( aSig1
bb98fe42 5981 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
158142c2
FB
5982 STATUS(float_exception_flags) |= float_flag_inexact;
5983 }
5984 }
5985 if ( aSign ) z = - z;
5986 return z;
5987
5988}
5989
5990/*----------------------------------------------------------------------------
5991| Returns the result of converting the quadruple-precision floating-point
5992| value `a' to the single-precision floating-point format. The conversion
5993| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5994| Arithmetic.
5995*----------------------------------------------------------------------------*/
5996
e5a41ffa 5997float32 float128_to_float32(float128 a, float_status *status)
158142c2
FB
5998{
5999 flag aSign;
6000 int32 aExp;
bb98fe42
AF
6001 uint64_t aSig0, aSig1;
6002 uint32_t zSig;
158142c2
FB
6003
6004 aSig1 = extractFloat128Frac1( a );
6005 aSig0 = extractFloat128Frac0( a );
6006 aExp = extractFloat128Exp( a );
6007 aSign = extractFloat128Sign( a );
6008 if ( aExp == 0x7FFF ) {
6009 if ( aSig0 | aSig1 ) {
ff32e16e 6010 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
158142c2
FB
6011 }
6012 return packFloat32( aSign, 0xFF, 0 );
6013 }
6014 aSig0 |= ( aSig1 != 0 );
6015 shift64RightJamming( aSig0, 18, &aSig0 );
6016 zSig = aSig0;
6017 if ( aExp || zSig ) {
6018 zSig |= 0x40000000;
6019 aExp -= 0x3F81;
6020 }
ff32e16e 6021 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
6022
6023}
6024
6025/*----------------------------------------------------------------------------
6026| Returns the result of converting the quadruple-precision floating-point
6027| value `a' to the double-precision floating-point format. The conversion
6028| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6029| Arithmetic.
6030*----------------------------------------------------------------------------*/
6031
e5a41ffa 6032float64 float128_to_float64(float128 a, float_status *status)
158142c2
FB
6033{
6034 flag aSign;
6035 int32 aExp;
bb98fe42 6036 uint64_t aSig0, aSig1;
158142c2
FB
6037
6038 aSig1 = extractFloat128Frac1( a );
6039 aSig0 = extractFloat128Frac0( a );
6040 aExp = extractFloat128Exp( a );
6041 aSign = extractFloat128Sign( a );
6042 if ( aExp == 0x7FFF ) {
6043 if ( aSig0 | aSig1 ) {
ff32e16e 6044 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
158142c2
FB
6045 }
6046 return packFloat64( aSign, 0x7FF, 0 );
6047 }
6048 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6049 aSig0 |= ( aSig1 != 0 );
6050 if ( aExp || aSig0 ) {
6051 aSig0 |= LIT64( 0x4000000000000000 );
6052 aExp -= 0x3C01;
6053 }
ff32e16e 6054 return roundAndPackFloat64(aSign, aExp, aSig0, status);
158142c2
FB
6055
6056}
6057
158142c2
FB
6058/*----------------------------------------------------------------------------
6059| Returns the result of converting the quadruple-precision floating-point
6060| value `a' to the extended double-precision floating-point format. The
6061| conversion is performed according to the IEC/IEEE Standard for Binary
6062| Floating-Point Arithmetic.
6063*----------------------------------------------------------------------------*/
6064
e5a41ffa 6065floatx80 float128_to_floatx80(float128 a, float_status *status)
158142c2
FB
6066{
6067 flag aSign;
6068 int32 aExp;
bb98fe42 6069 uint64_t aSig0, aSig1;
158142c2
FB
6070
6071 aSig1 = extractFloat128Frac1( a );
6072 aSig0 = extractFloat128Frac0( a );
6073 aExp = extractFloat128Exp( a );
6074 aSign = extractFloat128Sign( a );
6075 if ( aExp == 0x7FFF ) {
6076 if ( aSig0 | aSig1 ) {
ff32e16e 6077 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
158142c2
FB
6078 }
6079 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6080 }
6081 if ( aExp == 0 ) {
6082 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6083 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6084 }
6085 else {
6086 aSig0 |= LIT64( 0x0001000000000000 );
6087 }
6088 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
ff32e16e 6089 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
158142c2
FB
6090
6091}
6092
158142c2
FB
6093/*----------------------------------------------------------------------------
6094| Rounds the quadruple-precision floating-point value `a' to an integer, and
6095| returns the result as a quadruple-precision floating-point value. The
6096| operation is performed according to the IEC/IEEE Standard for Binary
6097| Floating-Point Arithmetic.
6098*----------------------------------------------------------------------------*/
6099
/*
 * Rounds `a' to an integral value in float128 format using the rounding
 * mode stored in `status'.  Three exponent ranges are handled separately:
 * aExp >= 0x402F (rounding position in the low word, or no fraction bits
 * at all), aExp < 0x3FFF (result is a signed zero or signed one), and the
 * range in between (rounding position in the high word).
 * float_flag_inexact is set whenever the result differs from `a'.
 */
e5a41ffa 6100float128 float128_round_to_int(float128 a, float_status *status)
158142c2
FB
6101{
6102 flag aSign;
6103 int32 aExp;
bb98fe42 6104 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
6105 float128 z;
6106
6107 aExp = extractFloat128Exp( a );
6108 if ( 0x402F <= aExp ) {
/* From 0x406F upward the value is returned unchanged, except NaNs. */
6109 if ( 0x406F <= aExp ) {
6110 if ( ( aExp == 0x7FFF )
6111 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6112 ) {
ff32e16e 6113 return propagateFloat128NaN(a, a, status);
158142c2
FB
6114 }
6115 return a;
6116 }
/*
 * Mask of the units bit within the low word.  The shift is split in two so
 * that aExp == 0x402F (a total shift of 64) yields 0 instead of an
 * out-of-range shift; in that case the units bit is bit 0 of the high word.
 */
6117 lastBitMask = 1;
6118 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
/* All bits below the units bit (all ones when lastBitMask is 0). */
6119 roundBitsMask = lastBitMask - 1;
6120 z = a;
dc355b76
PM
6121 switch (STATUS(float_rounding_mode)) {
6122 case float_round_nearest_even:
158142c2
FB
6123 if ( lastBitMask ) {
/* Add one half; if the round bits end up zero, clear the units bit. */
6124 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6125 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6126 }
6127 else {
/* Top bit of the low word is the half bit; a bare half is a tie. */
bb98fe42 6128 if ( (int64_t) z.low < 0 ) {
158142c2 6129 ++z.high;
bb98fe42 6130 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
6131 }
6132 }
dc355b76 6133 break;
f9288a76
PM
/* Same as nearest-even but without the tie correction. */
6134 case float_round_ties_away:
6135 if (lastBitMask) {
6136 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6137 } else {
6138 if ((int64_t) z.low < 0) {
6139 ++z.high;
6140 }
6141 }
6142 break;
dc355b76
PM
6143 case float_round_to_zero:
6144 break;
/* Directed modes: add the round mask only when rounding away from zero. */
6145 case float_round_up:
6146 if (!extractFloat128Sign(z)) {
6147 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6148 }
6149 break;
6150 case float_round_down:
6151 if (extractFloat128Sign(z)) {
6152 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 6153 }
dc355b76
PM
6154 break;
6155 default:
6156 abort();
158142c2
FB
6157 }
/* Discard the fraction bits. */
6158 z.low &= ~ roundBitsMask;
6159 }
6160 else {
/* Exponent below 0x3FFF: the result is a signed zero or a signed one. */
6161 if ( aExp < 0x3FFF ) {
/* Zero (either sign) is returned as is, with no flag. */
bb98fe42 6162 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
158142c2
FB
6163 STATUS(float_exception_flags) |= float_flag_inexact;
6164 aSign = extractFloat128Sign( a );
/* No round_to_zero case and no default: those fall out to signed zero. */
6165 switch ( STATUS(float_rounding_mode) ) {
6166 case float_round_nearest_even:
/* Exponent 0x3FFE with a nonzero fraction rounds to one; zero fraction is a tie. */
6167 if ( ( aExp == 0x3FFE )
6168 && ( extractFloat128Frac0( a )
6169 | extractFloat128Frac1( a ) )
6170 ) {
6171 return packFloat128( aSign, 0x3FFF, 0, 0 );
6172 }
6173 break;
f9288a76
PM
/* Any value with exponent 0x3FFE rounds to one, the tie included. */
6174 case float_round_ties_away:
6175 if (aExp == 0x3FFE) {
6176 return packFloat128(aSign, 0x3FFF, 0, 0);
6177 }
6178 break;
158142c2
FB
6179 case float_round_down:
6180 return
6181 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6182 : packFloat128( 0, 0, 0, 0 );
6183 case float_round_up:
6184 return
6185 aSign ? packFloat128( 1, 0, 0, 0 )
6186 : packFloat128( 0, 0x3FFF, 0, 0 );
6187 }
6188 return packFloat128( aSign, 0, 0, 0 );
6189 }
/* Units bit lies in the high word; aExp < 0x402F makes lastBitMask >= 2. */
6190 lastBitMask = 1;
6191 lastBitMask <<= 0x402F - aExp;
6192 roundBitsMask = lastBitMask - 1;
/* The whole low word is fraction, so the result's low word is zero. */
6193 z.low = 0;
6194 z.high = a.high;
dc355b76
PM
6195 switch (STATUS(float_rounding_mode)) {
6196 case float_round_nearest_even:
158142c2
FB
6197 z.high += lastBitMask>>1;
/* Tie only if both the high round bits and the low word are zero. */
6198 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6199 z.high &= ~ lastBitMask;
6200 }
dc355b76 6201 break;
f9288a76
PM
6202 case float_round_ties_away:
6203 z.high += lastBitMask>>1;
6204 break;
dc355b76
PM
6205 case float_round_to_zero:
6206 break;
6207 case float_round_up:
6208 if (!extractFloat128Sign(z)) {
158142c2
FB
/* Bit 0 is a round bit here; set it as a sticky bit for the low word. */
6209 z.high |= ( a.low != 0 );
6210 z.high += roundBitsMask;
6211 }
dc355b76
PM
6212 break;
6213 case float_round_down:
6214 if (extractFloat128Sign(z)) {
6215 z.high |= (a.low != 0);
6216 z.high += roundBitsMask;
6217 }
6218 break;
6219 default:
6220 abort();
158142c2
FB
6221 }
6222 z.high &= ~ roundBitsMask;
6223 }
/* Any change relative to the input means rounding took place. */
6224 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6225 STATUS(float_exception_flags) |= float_flag_inexact;
6226 }
6227 return z;
6228
6229}
6230
6231/*----------------------------------------------------------------------------
6232| Returns the result of adding the absolute values of the quadruple-precision
6233| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6234| before being returned. `zSign' is ignored if the result is a NaN.
6235| The addition is performed according to the IEC/IEEE Standard for Binary
6236| Floating-Point Arithmetic.
6237*----------------------------------------------------------------------------*/
6238
e5a41ffa
PM
/*
 * Adds the magnitudes of `a' and `b' and returns the sum with sign `zSign'.
 * The operand with the smaller exponent is shifted right to line up with
 * the other; the extra output word of that shift (zSig2) is passed on to
 * roundAndPackFloat128() together with the 128-bit sum.
 */
6239static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6240 float_status *status)
158142c2
FB
6241{
6242 int32 aExp, bExp, zExp;
bb98fe42 6243 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
158142c2
FB
6244 int32 expDiff;
6245
6246 aSig1 = extractFloat128Frac1( a );
6247 aSig0 = extractFloat128Frac0( a );
6248 aExp = extractFloat128Exp( a );
6249 bSig1 = extractFloat128Frac1( b );
6250 bSig0 = extractFloat128Frac0( b );
6251 bExp = extractFloat128Exp( b );
6252 expDiff = aExp - bExp;
/* Case 1: `a' has the larger exponent. */
6253 if ( 0 < expDiff ) {
6254 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6255 if (aSig0 | aSig1) {
6256 return propagateFloat128NaN(a, b, status);
6257 }
158142c2
FB
/* `a' is an infinity; it is returned as is. */
6258 return a;
6259 }
/* An exponent field of 0 has no hidden bit; shift by one place less instead. */
6260 if ( bExp == 0 ) {
6261 --expDiff;
6262 }
6263 else {
6264 bSig0 |= LIT64( 0x0001000000000000 );
6265 }
6266 shift128ExtraRightJamming(
6267 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6268 zExp = aExp;
6269 }
/* Case 2: `b' has the larger exponent (mirror image of case 1). */
6270 else if ( expDiff < 0 ) {
6271 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6272 if (bSig0 | bSig1) {
6273 return propagateFloat128NaN(a, b, status);
6274 }
158142c2
FB
/* `b' is an infinity; the result takes the requested sign. */
6275 return packFloat128( zSign, 0x7FFF, 0, 0 );
6276 }
6277 if ( aExp == 0 ) {
6278 ++expDiff;
6279 }
6280 else {
6281 aSig0 |= LIT64( 0x0001000000000000 );
6282 }
6283 shift128ExtraRightJamming(
6284 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6285 zExp = bExp;
6286 }
/* Case 3: equal exponents, no alignment shift needed. */
6287 else {
6288 if ( aExp == 0x7FFF ) {
6289 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6290 return propagateFloat128NaN(a, b, status);
158142c2
FB
6291 }
6292 return a;
6293 }
6294 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* Both exponent fields are 0: the sum is packed directly, without rounding. */
fe76d976 6295 if ( aExp == 0 ) {
e6afc87f
PM
/* In flush-to-zero mode a nonzero sum is replaced by a signed zero. */
6296 if (STATUS(flush_to_zero)) {
6297 if (zSig0 | zSig1) {
ff32e16e 6298 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
6299 }
6300 return packFloat128(zSign, 0, 0, 0);
6301 }
fe76d976
PB
6302 return packFloat128( zSign, 0, zSig0, zSig1 );
6303 }
158142c2
FB
6304 zSig2 = 0;
/* Neither hidden bit was added above; together they contribute bit 49. */
6305 zSig0 |= LIT64( 0x0002000000000000 );
6306 zExp = aExp;
6307 goto shiftRight1;
6308 }
/*
 * Cases 1 and 2 continue here.  Only one hidden bit is still missing (the
 * larger operand's); it is OR-ed into aSig0 in both cases.
 * NOTE(review): in case 2 aSig0 is the shifted smaller operand, which
 * appears to rely on bit 48 being clear there - confirm.
 */
6309 aSig0 |= LIT64( 0x0001000000000000 );
6310 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* No carry into bit 49: round with the decremented exponent. */
6311 --zExp;
6312 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
/* Carry into bit 49: restore the exponent and shift the sum right by one. */
6313 ++zExp;
6314 shiftRight1:
6315 shift128ExtraRightJamming(
6316 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6317 roundAndPack:
ff32e16e 6318 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6319
6320}
6321
6322/*----------------------------------------------------------------------------
6323| Returns the result of subtracting the absolute values of the quadruple-
6324| precision floating-point values `a' and `b'. If `zSign' is 1, the
6325| difference is negated before being returned. `zSign' is ignored if the
6326| result is a NaN. The subtraction is performed according to the IEC/IEEE
6327| Standard for Binary Floating-Point Arithmetic.
6328*----------------------------------------------------------------------------*/
6329
e5a41ffa
PM
/*
 * Subtracts the magnitude of `b' from that of `a' and returns the result
 * with sign `zSign', or with the opposite sign when `b' has the larger
 * magnitude.  Both significands are first shifted left by 14 bits, so the
 * hidden bit sits at 0x4000000000000000 throughout this function.
 */
6330static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6331 float_status *status)
158142c2
FB
6332{
6333 int32 aExp, bExp, zExp;
bb98fe42 6334 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
158142c2
FB
6335 int32 expDiff;
6336 float128 z;
6337
6338 aSig1 = extractFloat128Frac1( a );
6339 aSig0 = extractFloat128Frac0( a );
6340 aExp = extractFloat128Exp( a );
6341 bSig1 = extractFloat128Frac1( b );
6342 bSig0 = extractFloat128Frac0( b );
6343 bExp = extractFloat128Exp( b );
6344 expDiff = aExp - bExp;
/* Shift both fractions up by 14 bits before aligning them. */
6345 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6346 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6347 if ( 0 < expDiff ) goto aExpBigger;
6348 if ( expDiff < 0 ) goto bExpBigger;
/* Equal exponents from here on. */
6349 if ( aExp == 0x7FFF ) {
6350 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6351 return propagateFloat128NaN(a, b, status);
158142c2 6352 }
/* Infinity minus infinity: invalid, the default NaN is returned. */
ff32e16e 6353 float_raise(float_flag_invalid, status);
158142c2
FB
6354 z.low = float128_default_nan_low;
6355 z.high = float128_default_nan_high;
6356 return z;
6357 }
/* Both exponent fields are 0: use exponent 1 for the result computation. */
6358 if ( aExp == 0 ) {
6359 aExp = 1;
6360 bExp = 1;
6361 }
/* Compare the significands to decide which way round to subtract. */
6362 if ( bSig0 < aSig0 ) goto aBigger;
6363 if ( aSig0 < bSig0 ) goto bBigger;
6364 if ( bSig1 < aSig1 ) goto aBigger;
6365 if ( aSig1 < bSig1 ) goto bBigger;
/* Equal magnitudes: zero, negative only when rounding down. */
6366 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6367 bExpBigger:
6368 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6369 if (bSig0 | bSig1) {
6370 return propagateFloat128NaN(a, b, status);
6371 }
158142c2
FB
/* `b' is an infinity: the result is an infinity of the opposite sign. */
6372 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6373 }
/* An exponent field of 0 has no hidden bit; shift by one place less instead. */
6374 if ( aExp == 0 ) {
6375 ++expDiff;
6376 }
6377 else {
6378 aSig0 |= LIT64( 0x4000000000000000 );
6379 }
6380 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6381 bSig0 |= LIT64( 0x4000000000000000 );
6382 bBigger:
6383 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6384 zExp = bExp;
/* `b' dominates, so the result sign is flipped. */
6385 zSign ^= 1;
6386 goto normalizeRoundAndPack;
6387 aExpBigger:
6388 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6389 if (aSig0 | aSig1) {
6390 return propagateFloat128NaN(a, b, status);
6391 }
158142c2
FB
6392 return a;
6393 }
6394 if ( bExp == 0 ) {
6395 --expDiff;
6396 }
6397 else {
6398 bSig0 |= LIT64( 0x4000000000000000 );
6399 }
6400 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6401 aSig0 |= LIT64( 0x4000000000000000 );
6402 aBigger:
6403 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6404 zExp = aExp;
6405 normalizeRoundAndPack:
6406 --zExp;
ff32e16e
PM
/* NOTE(review): the "- 14" presumably undoes the 14-bit pre-shift above - confirm. */
6407 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6408 status);
158142c2
FB
6409
6410}
6411
6412/*----------------------------------------------------------------------------
6413| Returns the result of adding the quadruple-precision floating-point values
6414| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6415| for Binary Floating-Point Arithmetic.
6416*----------------------------------------------------------------------------*/
6417
e5a41ffa 6418float128 float128_add(float128 a, float128 b, float_status *status)
158142c2
FB
6419{
6420 flag aSign, bSign;
6421
6422 aSign = extractFloat128Sign( a );
6423 bSign = extractFloat128Sign( b );
6424 if ( aSign == bSign ) {
ff32e16e 6425 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6426 }
6427 else {
ff32e16e 6428 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6429 }
6430
6431}
6432
6433/*----------------------------------------------------------------------------
6434| Returns the result of subtracting the quadruple-precision floating-point
6435| values `a' and `b'. The operation is performed according to the IEC/IEEE
6436| Standard for Binary Floating-Point Arithmetic.
6437*----------------------------------------------------------------------------*/
6438
e5a41ffa 6439float128 float128_sub(float128 a, float128 b, float_status *status)
158142c2
FB
6440{
6441 flag aSign, bSign;
6442
6443 aSign = extractFloat128Sign( a );
6444 bSign = extractFloat128Sign( b );
6445 if ( aSign == bSign ) {
ff32e16e 6446 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6447 }
6448 else {
ff32e16e 6449 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6450 }
6451
6452}
6453
6454/*----------------------------------------------------------------------------
6455| Returns the result of multiplying the quadruple-precision floating-point
6456| values `a' and `b'. The operation is performed according to the IEC/IEEE
6457| Standard for Binary Floating-Point Arithmetic.
6458*----------------------------------------------------------------------------*/
6459
/*
 * Multiplies `a' by `b'.  NaN operands are propagated, infinity times zero
 * raises float_flag_invalid and returns the default NaN, and zero operands
 * return a signed zero.  Otherwise the significands are multiplied into a
 * 256-bit product whose upper part is passed to roundAndPackFloat128().
 */
e5a41ffa 6460float128 float128_mul(float128 a, float128 b, float_status *status)
158142c2
FB
6461{
6462 flag aSign, bSign, zSign;
6463 int32 aExp, bExp, zExp;
bb98fe42 6464 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6465 float128 z;
6466
6467 aSig1 = extractFloat128Frac1( a );
6468 aSig0 = extractFloat128Frac0( a );
6469 aExp = extractFloat128Exp( a );
6470 aSign = extractFloat128Sign( a );
6471 bSig1 = extractFloat128Frac1( b );
6472 bSig0 = extractFloat128Frac0( b );
6473 bExp = extractFloat128Exp( b );
6474 bSign = extractFloat128Sign( b );
/* The result sign is the XOR of the operand signs. */
6475 zSign = aSign ^ bSign;
6476 if ( aExp == 0x7FFF ) {
/* Either operand is a NaN. */
6477 if ( ( aSig0 | aSig1 )
6478 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6479 return propagateFloat128NaN(a, b, status);
158142c2
FB
6480 }
/* `a' is an infinity: infinity times zero is invalid. */
6481 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6482 return packFloat128( zSign, 0x7FFF, 0, 0 );
6483 }
6484 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6485 if (bSig0 | bSig1) {
6486 return propagateFloat128NaN(a, b, status);
6487 }
158142c2
FB
/* `b' is an infinity: zero times infinity is invalid. */
6488 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
/* Also entered by the goto in the aExp == 0x7FFF branch above. */
6489 invalid:
ff32e16e 6490 float_raise(float_flag_invalid, status);
158142c2
FB
6491 z.low = float128_default_nan_low;
6492 z.high = float128_default_nan_high;
6493 return z;
6494 }
6495 return packFloat128( zSign, 0x7FFF, 0, 0 );
6496 }
/* A zero operand gives a signed zero; subnormal operands are normalized. */
6497 if ( aExp == 0 ) {
6498 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6499 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6500 }
6501 if ( bExp == 0 ) {
6502 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6503 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6504 }
6505 zExp = aExp + bExp - 0x4000;
/* Only `a' gets its hidden bit; `b' is shifted left by 16 bits without one. */
6506 aSig0 |= LIT64( 0x0001000000000000 );
6507 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6508 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
/*
 * NOTE(review): adding `a' to the upper product words presumably supplies
 * the contribution of b's omitted hidden bit - confirm against mul128To256.
 */
6509 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
/* Fold the lowest product word into a sticky bit. */
6510 zSig2 |= ( zSig3 != 0 );
/* Product reached bit 49: shift right by one and bump the exponent. */
6511 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6512 shift128ExtraRightJamming(
6513 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6514 ++zExp;
6515 }
ff32e16e 6516 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6517
6518}
6519
6520/*----------------------------------------------------------------------------
6521| Returns the result of dividing the quadruple-precision floating-point value
6522| `a' by the corresponding value `b'. The operation is performed according to
6523| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6524*----------------------------------------------------------------------------*/
6525
/*
 * Divides `a' by `b'.  NaN operands are propagated; infinity/infinity and
 * zero/zero raise float_flag_invalid and return the default NaN; a nonzero
 * finite value divided by zero raises float_flag_divbyzero and returns a
 * signed infinity.  Otherwise the quotient is built as two 64-bit digits,
 * each estimated and then corrected downward while the remainder is
 * negative.
 */
e5a41ffa 6526float128 float128_div(float128 a, float128 b, float_status *status)
158142c2
FB
6527{
6528 flag aSign, bSign, zSign;
6529 int32 aExp, bExp, zExp;
bb98fe42
AF
6530 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6531 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6532 float128 z;
6533
6534 aSig1 = extractFloat128Frac1( a );
6535 aSig0 = extractFloat128Frac0( a );
6536 aExp = extractFloat128Exp( a );
6537 aSign = extractFloat128Sign( a );
6538 bSig1 = extractFloat128Frac1( b );
6539 bSig0 = extractFloat128Frac0( b );
6540 bExp = extractFloat128Exp( b );
6541 bSign = extractFloat128Sign( b );
/* The result sign is the XOR of the operand signs. */
6542 zSign = aSign ^ bSign;
6543 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6544 if (aSig0 | aSig1) {
6545 return propagateFloat128NaN(a, b, status);
6546 }
158142c2 6547 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6548 if (bSig0 | bSig1) {
6549 return propagateFloat128NaN(a, b, status);
6550 }
158142c2
FB
/* Infinity divided by infinity. */
6551 goto invalid;
6552 }
/* Infinity divided by a finite value. */
6553 return packFloat128( zSign, 0x7FFF, 0, 0 );
6554 }
6555 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6556 if (bSig0 | bSig1) {
6557 return propagateFloat128NaN(a, b, status);
6558 }
158142c2
FB
/* Finite value divided by infinity: signed zero. */
6559 return packFloat128( zSign, 0, 0, 0 );
6560 }
6561 if ( bExp == 0 ) {
6562 if ( ( bSig0 | bSig1 ) == 0 ) {
/* Zero divided by zero. */
6563 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
/* Also entered by the goto for infinity divided by infinity. */
6564 invalid:
ff32e16e 6565 float_raise(float_flag_invalid, status);
158142c2
FB
6566 z.low = float128_default_nan_low;
6567 z.high = float128_default_nan_high;
6568 return z;
6569 }
/* Nonzero finite value divided by zero. */
ff32e16e 6570 float_raise(float_flag_divbyzero, status);
158142c2
FB
6571 return packFloat128( zSign, 0x7FFF, 0, 0 );
6572 }
6573 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6574 }
6575 if ( aExp == 0 ) {
6576 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6577 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6578 }
6579 zExp = aExp - bExp + 0x3FFD;
/* Add the hidden bits and shift both significands left by 15 bits. */
6580 shortShift128Left(
6581 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6582 shortShift128Left(
6583 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* If the divisor does not exceed the dividend, halve the dividend and bump the exponent. */
6584 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6585 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6586 ++zExp;
6587 }
/* First quotient digit: estimate, then step down while the remainder is negative. */
6588 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6589 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6590 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6591 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6592 --zSig0;
6593 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6594 }
/* Second quotient digit, estimated from the remainder. */
6595 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
/*
 * The estimate is corrected only when its low 14 bits are 4 or less.
 * NOTE(review): presumably the only case where the estimate's error could
 * change the rounded result - confirm against estimateDiv128To64.
 */
6596 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6597 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6598 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6599 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6600 --zSig1;
6601 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6602 }
/* A nonzero final remainder becomes a sticky bit. */
6603 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6604 }
/* Undo the 15-bit scaling, keeping the shifted-out bits in zSig2. */
6605 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
ff32e16e 6606 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6607
6608}
6609
6610/*----------------------------------------------------------------------------
6611| Returns the remainder of the quadruple-precision floating-point value `a'
6612| with respect to the corresponding value `b'. The operation is performed
6613| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6614*----------------------------------------------------------------------------*/
6615
/*
 * Computes the remainder of `a' with respect to `b'.  NaN operands are
 * propagated; an infinite `a' or a zero `b' raises float_flag_invalid and
 * returns the default NaN; an infinite `b' or a zero `a' returns `a'.
 * Otherwise the dividend significand is reduced in steps using partial
 * quotient estimates, and the final step picks between the last two
 * candidate remainders.
 */
e5a41ffa 6616float128 float128_rem(float128 a, float128 b, float_status *status)
158142c2 6617{
ed086f3d 6618 flag aSign, zSign;
158142c2 6619 int32 aExp, bExp, expDiff;
bb98fe42
AF
6620 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6621 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6622 int64_t sigMean0;
158142c2
FB
6623 float128 z;
6624
6625 aSig1 = extractFloat128Frac1( a );
6626 aSig0 = extractFloat128Frac0( a );
6627 aExp = extractFloat128Exp( a );
6628 aSign = extractFloat128Sign( a );
6629 bSig1 = extractFloat128Frac1( b );
6630 bSig0 = extractFloat128Frac0( b );
6631 bExp = extractFloat128Exp( b );
158142c2
FB
6632 if ( aExp == 0x7FFF ) {
6633 if ( ( aSig0 | aSig1 )
6634 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6635 return propagateFloat128NaN(a, b, status);
158142c2
FB
6636 }
/* `a' is an infinity. */
6637 goto invalid;
6638 }
6639 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6640 if (bSig0 | bSig1) {
6641 return propagateFloat128NaN(a, b, status);
6642 }
158142c2
FB
/* `b' is an infinity: `a' is returned unchanged. */
6643 return a;
6644 }
6645 if ( bExp == 0 ) {
/* `b' is zero. */
6646 if ( ( bSig0 | bSig1 ) == 0 ) {
6647 invalid:
ff32e16e 6648 float_raise(float_flag_invalid, status);
158142c2
FB
6649 z.low = float128_default_nan_low;
6650 z.high = float128_default_nan_high;
6651 return z;
6652 }
6653 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6654 }
6655 if ( aExp == 0 ) {
6656 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6657 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6658 }
6659 expDiff = aExp - bExp;
/* Exponent of `a' at least two below that of `b': `a' is returned as is. */
6660 if ( expDiff < -1 ) return a;
/* Add hidden bits and scale; `a' gets one bit less when expDiff is -1. */
6661 shortShift128Left(
6662 aSig0 | LIT64( 0x0001000000000000 ),
6663 aSig1,
6664 15 - ( expDiff < 0 ),
6665 &aSig0,
6666 &aSig1
6667 );
6668 shortShift128Left(
6669 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* q starts as 1 if the divisor fits once into the dividend, else 0. */
6670 q = le128( bSig0, bSig1, aSig0, aSig1 );
6671 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6672 expDiff -= 64;
/* Reduce in steps of 61 exponent bits while more than 64 remain. */
6673 while ( 0 < expDiff ) {
6674 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
/* Lower the estimate by 4 (clamped at 0). NOTE(review): presumably keeps the remainder non-negative - confirm. */
6675 q = ( 4 < q ) ? q - 4 : 0;
6676 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6677 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6678 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6679 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6680 expDiff -= 61;
6681 }
/* Last partial step, covering the remaining exponent difference. */
6682 if ( -64 < expDiff ) {
6683 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6684 q = ( 4 < q ) ? q - 4 : 0;
6685 q >>= - expDiff;
6686 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6687 expDiff += 52;
6688 if ( expDiff < 0 ) {
6689 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6690 }
6691 else {
6692 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6693 }
6694 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6695 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6696 }
6697 else {
6698 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6699 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6700 }
/*
 * Subtract the divisor until the remainder goes negative, remembering the
 * previous (non-negative) value in alternateASig0/1.
 */
6701 do {
6702 alternateASig0 = aSig0;
6703 alternateASig1 = aSig1;
6704 ++q;
6705 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6706 } while ( 0 <= (int64_t) aSig0 );
/*
 * The sum of the two candidates decides which one is kept: the previous one
 * if the sum is negative, or if the sum is zero and q is odd.
 */
158142c2 6707 add128(
bb98fe42 6708 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6709 if ( ( sigMean0 < 0 )
6710 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6711 aSig0 = alternateASig0;
6712 aSig1 = alternateASig1;
6713 }
/* A negative remainder is negated and flips the result sign. */
bb98fe42 6714 zSign = ( (int64_t) aSig0 < 0 );
158142c2 6715 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
ff32e16e
PM
6716 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6717 status);
158142c2
FB
6718}
6719
6720/*----------------------------------------------------------------------------
6721| Returns the square root of the quadruple-precision floating-point value `a'.
6722| The operation is performed according to the IEC/IEEE Standard for Binary
6723| Floating-Point Arithmetic.
6724*----------------------------------------------------------------------------*/
6725
e5a41ffa 6726float128 float128_sqrt(float128 a, float_status *status)
158142c2
FB
6727{
6728 flag aSign;
6729 int32 aExp, zExp;
bb98fe42
AF
6730 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6731 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6732 float128 z;
6733
6734 aSig1 = extractFloat128Frac1( a );
6735 aSig0 = extractFloat128Frac0( a );
6736 aExp = extractFloat128Exp( a );
6737 aSign = extractFloat128Sign( a );
6738 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6739 if (aSig0 | aSig1) {
6740 return propagateFloat128NaN(a, a, status);
6741 }
158142c2
FB
6742 if ( ! aSign ) return a;
6743 goto invalid;
6744 }
6745 if ( aSign ) {
6746 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6747 invalid:
ff32e16e 6748 float_raise(float_flag_invalid, status);
158142c2
FB
6749 z.low = float128_default_nan_low;
6750 z.high = float128_default_nan_high;
6751 return z;
6752 }
6753 if ( aExp == 0 ) {
6754 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6755 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6756 }
6757 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6758 aSig0 |= LIT64( 0x0001000000000000 );
6759 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6760 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6761 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6762 doubleZSig0 = zSig0<<1;
6763 mul64To128( zSig0, zSig0, &term0, &term1 );
6764 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6765 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6766 --zSig0;
6767 doubleZSig0 -= 2;
6768 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6769 }
6770 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6771 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6772 if ( zSig1 == 0 ) zSig1 = 1;
6773 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6774 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6775 mul64To128( zSig1, zSig1, &term2, &term3 );
6776 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6777 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6778 --zSig1;
6779 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6780 term3 |= 1;
6781 term2 |= doubleZSig0;
6782 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6783 }
6784 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6785 }
6786 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
ff32e16e 6787 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6788
6789}
6790
6791/*----------------------------------------------------------------------------
6792| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6793| the corresponding value `b', and 0 otherwise. The invalid exception is
6794| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6795| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6796*----------------------------------------------------------------------------*/
6797
e5a41ffa 6798int float128_eq(float128 a, float128 b, float_status *status)
158142c2
FB
6799{
6800
6801 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6802 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6803 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6804 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6805 ) {
ff32e16e 6806 float_raise(float_flag_invalid, status);
158142c2
FB
6807 return 0;
6808 }
6809 return
6810 ( a.low == b.low )
6811 && ( ( a.high == b.high )
6812 || ( ( a.low == 0 )
bb98fe42 6813 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6814 );
6815
6816}
6817
6818/*----------------------------------------------------------------------------
6819| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6820| or equal to the corresponding value `b', and 0 otherwise. The invalid
6821| exception is raised if either operand is a NaN. The comparison is performed
6822| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6823*----------------------------------------------------------------------------*/
6824
e5a41ffa 6825int float128_le(float128 a, float128 b, float_status *status)
158142c2
FB
6826{
6827 flag aSign, bSign;
6828
6829 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6830 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6831 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6832 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6833 ) {
ff32e16e 6834 float_raise(float_flag_invalid, status);
158142c2
FB
6835 return 0;
6836 }
6837 aSign = extractFloat128Sign( a );
6838 bSign = extractFloat128Sign( b );
6839 if ( aSign != bSign ) {
6840 return
6841 aSign
bb98fe42 6842 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6843 == 0 );
6844 }
6845 return
6846 aSign ? le128( b.high, b.low, a.high, a.low )
6847 : le128( a.high, a.low, b.high, b.low );
6848
6849}
6850
6851/*----------------------------------------------------------------------------
6852| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6853| the corresponding value `b', and 0 otherwise. The invalid exception is
6854| raised if either operand is a NaN. The comparison is performed according
6855| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6856*----------------------------------------------------------------------------*/
6857
e5a41ffa 6858int float128_lt(float128 a, float128 b, float_status *status)
158142c2
FB
6859{
6860 flag aSign, bSign;
6861
6862 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6863 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6864 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6865 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6866 ) {
ff32e16e 6867 float_raise(float_flag_invalid, status);
158142c2
FB
6868 return 0;
6869 }
6870 aSign = extractFloat128Sign( a );
6871 bSign = extractFloat128Sign( b );
6872 if ( aSign != bSign ) {
6873 return
6874 aSign
bb98fe42 6875 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6876 != 0 );
6877 }
6878 return
6879 aSign ? lt128( b.high, b.low, a.high, a.low )
6880 : lt128( a.high, a.low, b.high, b.low );
6881
6882}
6883
67b7861d
AJ
6884/*----------------------------------------------------------------------------
6885| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6886| be compared, and 0 otherwise. The invalid exception is raised if either
6887| operand is a NaN. The comparison is performed according to the IEC/IEEE
6888| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6889*----------------------------------------------------------------------------*/
6890
e5a41ffa 6891int float128_unordered(float128 a, float128 b, float_status *status)
67b7861d
AJ
6892{
6893 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6894 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6895 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6896 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6897 ) {
ff32e16e 6898 float_raise(float_flag_invalid, status);
67b7861d
AJ
6899 return 1;
6900 }
6901 return 0;
6902}
6903
158142c2
FB
6904/*----------------------------------------------------------------------------
6905| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6906| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6907| exception. The comparison is performed according to the IEC/IEEE Standard
6908| for Binary Floating-Point Arithmetic.
158142c2
FB
6909*----------------------------------------------------------------------------*/
6910
e5a41ffa 6911int float128_eq_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6912{
6913
6914 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6915 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6916 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6917 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6918 ) {
b689362d
AJ
6919 if ( float128_is_signaling_nan( a )
6920 || float128_is_signaling_nan( b ) ) {
ff32e16e 6921 float_raise(float_flag_invalid, status);
b689362d 6922 }
158142c2
FB
6923 return 0;
6924 }
6925 return
6926 ( a.low == b.low )
6927 && ( ( a.high == b.high )
6928 || ( ( a.low == 0 )
bb98fe42 6929 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6930 );
6931
6932}
6933
6934/*----------------------------------------------------------------------------
6935| Returns 1 if the quadruple-precision floating-point value `a' is less than
6936| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6937| cause an exception. Otherwise, the comparison is performed according to the
6938| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6939*----------------------------------------------------------------------------*/
6940
e5a41ffa 6941int float128_le_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6942{
6943 flag aSign, bSign;
6944
6945 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6946 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6947 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6948 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6949 ) {
6950 if ( float128_is_signaling_nan( a )
6951 || float128_is_signaling_nan( b ) ) {
ff32e16e 6952 float_raise(float_flag_invalid, status);
158142c2
FB
6953 }
6954 return 0;
6955 }
6956 aSign = extractFloat128Sign( a );
6957 bSign = extractFloat128Sign( b );
6958 if ( aSign != bSign ) {
6959 return
6960 aSign
bb98fe42 6961 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6962 == 0 );
6963 }
6964 return
6965 aSign ? le128( b.high, b.low, a.high, a.low )
6966 : le128( a.high, a.low, b.high, b.low );
6967
6968}
6969
6970/*----------------------------------------------------------------------------
6971| Returns 1 if the quadruple-precision floating-point value `a' is less than
6972| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6973| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6974| Standard for Binary Floating-Point Arithmetic.
6975*----------------------------------------------------------------------------*/
6976
e5a41ffa 6977int float128_lt_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6978{
6979 flag aSign, bSign;
6980
6981 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6982 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6983 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6984 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6985 ) {
6986 if ( float128_is_signaling_nan( a )
6987 || float128_is_signaling_nan( b ) ) {
ff32e16e 6988 float_raise(float_flag_invalid, status);
158142c2
FB
6989 }
6990 return 0;
6991 }
6992 aSign = extractFloat128Sign( a );
6993 bSign = extractFloat128Sign( b );
6994 if ( aSign != bSign ) {
6995 return
6996 aSign
bb98fe42 6997 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6998 != 0 );
6999 }
7000 return
7001 aSign ? lt128( b.high, b.low, a.high, a.low )
7002 : lt128( a.high, a.low, b.high, b.low );
7003
7004}
7005
67b7861d
AJ
7006/*----------------------------------------------------------------------------
7007| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7008| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7009| comparison is performed according to the IEC/IEEE Standard for Binary
7010| Floating-Point Arithmetic.
7011*----------------------------------------------------------------------------*/
7012
e5a41ffa 7013int float128_unordered_quiet(float128 a, float128 b, float_status *status)
67b7861d
AJ
7014{
7015 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7016 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7017 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7018 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7019 ) {
7020 if ( float128_is_signaling_nan( a )
7021 || float128_is_signaling_nan( b ) ) {
ff32e16e 7022 float_raise(float_flag_invalid, status);
67b7861d
AJ
7023 }
7024 return 1;
7025 }
7026 return 0;
7027}
7028
1d6bda35 7029/* misc functions */
e5a41ffa 7030float32 uint32_to_float32(uint32_t a, float_status *status)
1d6bda35 7031{
ff32e16e 7032 return int64_to_float32(a, status);
1d6bda35
FB
7033}
7034
e5a41ffa 7035float64 uint32_to_float64(uint32_t a, float_status *status)
1d6bda35 7036{
ff32e16e 7037 return int64_to_float64(a, status);
1d6bda35
FB
7038}
7039
e5a41ffa 7040uint32 float32_to_uint32(float32 a, float_status *status)
1d6bda35
FB
7041{
7042 int64_t v;
9f8d2a09 7043 uint32 res;
34e1c27b 7044 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7045
ff32e16e 7046 v = float32_to_int64(a, status);
1d6bda35
FB
7047 if (v < 0) {
7048 res = 0;
1d6bda35
FB
7049 } else if (v > 0xffffffff) {
7050 res = 0xffffffff;
1d6bda35 7051 } else {
34e1c27b 7052 return v;
1d6bda35 7053 }
34e1c27b 7054 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7055 float_raise(float_flag_invalid, status);
1d6bda35
FB
7056 return res;
7057}
7058
e5a41ffa 7059uint32 float32_to_uint32_round_to_zero(float32 a, float_status *status)
1d6bda35
FB
7060{
7061 int64_t v;
9f8d2a09 7062 uint32 res;
34e1c27b 7063 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7064
ff32e16e 7065 v = float32_to_int64_round_to_zero(a, status);
1d6bda35
FB
7066 if (v < 0) {
7067 res = 0;
1d6bda35
FB
7068 } else if (v > 0xffffffff) {
7069 res = 0xffffffff;
1d6bda35 7070 } else {
34e1c27b 7071 return v;
1d6bda35 7072 }
34e1c27b 7073 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7074 float_raise(float_flag_invalid, status);
1d6bda35
FB
7075 return res;
7076}
7077
e5a41ffa 7078int_fast16_t float32_to_int16(float32 a, float_status *status)
f581bf54
WN
7079{
7080 int32_t v;
7081 int_fast16_t res;
7082 int old_exc_flags = get_float_exception_flags(status);
7083
ff32e16e 7084 v = float32_to_int32(a, status);
f581bf54
WN
7085 if (v < -0x8000) {
7086 res = -0x8000;
7087 } else if (v > 0x7fff) {
7088 res = 0x7fff;
7089 } else {
7090 return v;
7091 }
7092
7093 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7094 float_raise(float_flag_invalid, status);
f581bf54
WN
7095 return res;
7096}
7097
e5a41ffa 7098uint_fast16_t float32_to_uint16(float32 a, float_status *status)
f581bf54
WN
7099{
7100 int32_t v;
7101 uint_fast16_t res;
7102 int old_exc_flags = get_float_exception_flags(status);
7103
ff32e16e 7104 v = float32_to_int32(a, status);
f581bf54
WN
7105 if (v < 0) {
7106 res = 0;
7107 } else if (v > 0xffff) {
7108 res = 0xffff;
7109 } else {
7110 return v;
7111 }
7112
7113 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7114 float_raise(float_flag_invalid, status);
f581bf54
WN
7115 return res;
7116}
7117
e5a41ffa 7118uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
7119{
7120 int64_t v;
5aea4c58 7121 uint_fast16_t res;
34e1c27b 7122 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7123
ff32e16e 7124 v = float32_to_int64_round_to_zero(a, status);
cbcef455
PM
7125 if (v < 0) {
7126 res = 0;
cbcef455
PM
7127 } else if (v > 0xffff) {
7128 res = 0xffff;
cbcef455 7129 } else {
34e1c27b 7130 return v;
cbcef455 7131 }
34e1c27b 7132 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7133 float_raise(float_flag_invalid, status);
cbcef455
PM
7134 return res;
7135}
7136
e5a41ffa 7137uint32 float64_to_uint32(float64 a, float_status *status)
1d6bda35 7138{
5e7f654f 7139 uint64_t v;
9f8d2a09 7140 uint32 res;
5e7f654f 7141 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7142
ff32e16e 7143 v = float64_to_uint64(a, status);
5e7f654f 7144 if (v > 0xffffffff) {
1d6bda35 7145 res = 0xffffffff;
1d6bda35 7146 } else {
5e7f654f 7147 return v;
1d6bda35 7148 }
5e7f654f 7149 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7150 float_raise(float_flag_invalid, status);
1d6bda35
FB
7151 return res;
7152}
7153
e5a41ffa 7154uint32 float64_to_uint32_round_to_zero(float64 a, float_status *status)
1d6bda35 7155{
fd728f2f 7156 uint64_t v;
9f8d2a09 7157 uint32 res;
fd728f2f 7158 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7159
ff32e16e 7160 v = float64_to_uint64_round_to_zero(a, status);
fd728f2f 7161 if (v > 0xffffffff) {
1d6bda35 7162 res = 0xffffffff;
1d6bda35 7163 } else {
fd728f2f 7164 return v;
1d6bda35 7165 }
fd728f2f 7166 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7167 float_raise(float_flag_invalid, status);
1d6bda35
FB
7168 return res;
7169}
7170
e5a41ffa 7171int_fast16_t float64_to_int16(float64 a, float_status *status)
f581bf54
WN
7172{
7173 int64_t v;
7174 int_fast16_t res;
7175 int old_exc_flags = get_float_exception_flags(status);
7176
ff32e16e 7177 v = float64_to_int32(a, status);
f581bf54
WN
7178 if (v < -0x8000) {
7179 res = -0x8000;
7180 } else if (v > 0x7fff) {
7181 res = 0x7fff;
7182 } else {
7183 return v;
7184 }
7185
7186 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7187 float_raise(float_flag_invalid, status);
f581bf54
WN
7188 return res;
7189}
7190
e5a41ffa 7191uint_fast16_t float64_to_uint16(float64 a, float_status *status)
f581bf54
WN
7192{
7193 int64_t v;
7194 uint_fast16_t res;
7195 int old_exc_flags = get_float_exception_flags(status);
7196
ff32e16e 7197 v = float64_to_int32(a, status);
f581bf54
WN
7198 if (v < 0) {
7199 res = 0;
7200 } else if (v > 0xffff) {
7201 res = 0xffff;
7202 } else {
7203 return v;
7204 }
7205
7206 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7207 float_raise(float_flag_invalid, status);
f581bf54
WN
7208 return res;
7209}
7210
e5a41ffa 7211uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
7212{
7213 int64_t v;
5aea4c58 7214 uint_fast16_t res;
34e1c27b 7215 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7216
ff32e16e 7217 v = float64_to_int64_round_to_zero(a, status);
cbcef455
PM
7218 if (v < 0) {
7219 res = 0;
cbcef455
PM
7220 } else if (v > 0xffff) {
7221 res = 0xffff;
cbcef455 7222 } else {
34e1c27b 7223 return v;
cbcef455 7224 }
34e1c27b 7225 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7226 float_raise(float_flag_invalid, status);
cbcef455
PM
7227 return res;
7228}
7229
fb3ea83a
TM
7230/*----------------------------------------------------------------------------
7231| Returns the result of converting the double-precision floating-point value
7232| `a' to the 64-bit unsigned integer format. The conversion is
7233| performed according to the IEC/IEEE Standard for Binary Floating-Point
7234| Arithmetic---which means in particular that the conversion is rounded
7235| according to the current rounding mode. If `a' is a NaN, the largest
7236| positive integer is returned. If the conversion overflows, the
7237| largest unsigned integer is returned. If 'a' is negative, the value is
7238| rounded and zero is returned; negative values that do not round to zero
7239| will raise the inexact exception.
7240*----------------------------------------------------------------------------*/
75d62a58 7241
e5a41ffa 7242uint64_t float64_to_uint64(float64 a, float_status *status)
fb3ea83a
TM
7243{
7244 flag aSign;
7245 int_fast16_t aExp, shiftCount;
7246 uint64_t aSig, aSigExtra;
ff32e16e 7247 a = float64_squash_input_denormal(a, status);
75d62a58 7248
fb3ea83a
TM
7249 aSig = extractFloat64Frac(a);
7250 aExp = extractFloat64Exp(a);
7251 aSign = extractFloat64Sign(a);
7252 if (aSign && (aExp > 1022)) {
ff32e16e 7253 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7254 if (float64_is_any_nan(a)) {
7255 return LIT64(0xFFFFFFFFFFFFFFFF);
7256 } else {
7257 return 0;
7258 }
7259 }
7260 if (aExp) {
7261 aSig |= LIT64(0x0010000000000000);
7262 }
7263 shiftCount = 0x433 - aExp;
7264 if (shiftCount <= 0) {
7265 if (0x43E < aExp) {
ff32e16e 7266 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7267 return LIT64(0xFFFFFFFFFFFFFFFF);
7268 }
7269 aSigExtra = 0;
7270 aSig <<= -shiftCount;
7271 } else {
7272 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7273 }
ff32e16e 7274 return roundAndPackUint64(aSign, aSig, aSigExtra, status);
75d62a58
JM
7275}
7276
e5a41ffa 7277uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
75d62a58 7278{
0a87a310 7279 signed char current_rounding_mode = STATUS(float_rounding_mode);
ff32e16e
PM
7280 set_float_rounding_mode(float_round_to_zero, status);
7281 int64_t v = float64_to_uint64(a, status);
7282 set_float_rounding_mode(current_rounding_mode, status);
0a87a310 7283 return v;
75d62a58
JM
7284}
7285
1d6bda35 7286#define COMPARE(s, nan_exp) \
e5a41ffa
PM
7287static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7288 int is_quiet, float_status *status) \
1d6bda35
FB
7289{ \
7290 flag aSign, bSign; \
bb98fe42 7291 uint ## s ## _t av, bv; \
ff32e16e
PM
7292 a = float ## s ## _squash_input_denormal(a, status); \
7293 b = float ## s ## _squash_input_denormal(b, status); \
1d6bda35
FB
7294 \
7295 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7296 extractFloat ## s ## Frac( a ) ) || \
7297 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7298 extractFloat ## s ## Frac( b ) )) { \
7299 if (!is_quiet || \
7300 float ## s ## _is_signaling_nan( a ) || \
7301 float ## s ## _is_signaling_nan( b ) ) { \
ff32e16e 7302 float_raise(float_flag_invalid, status); \
1d6bda35
FB
7303 } \
7304 return float_relation_unordered; \
7305 } \
7306 aSign = extractFloat ## s ## Sign( a ); \
7307 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 7308 av = float ## s ## _val(a); \
cd8a2533 7309 bv = float ## s ## _val(b); \
1d6bda35 7310 if ( aSign != bSign ) { \
bb98fe42 7311 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
7312 /* zero case */ \
7313 return float_relation_equal; \
7314 } else { \
7315 return 1 - (2 * aSign); \
7316 } \
7317 } else { \
f090c9d4 7318 if (av == bv) { \
1d6bda35
FB
7319 return float_relation_equal; \
7320 } else { \
f090c9d4 7321 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
7322 } \
7323 } \
7324} \
7325 \
e5a41ffa 7326int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
1d6bda35 7327{ \
ff32e16e 7328 return float ## s ## _compare_internal(a, b, 0, status); \
1d6bda35
FB
7329} \
7330 \
e5a41ffa
PM
7331int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7332 float_status *status) \
1d6bda35 7333{ \
ff32e16e 7334 return float ## s ## _compare_internal(a, b, 1, status); \
1d6bda35
FB
7335}
7336
7337COMPARE(32, 0xff)
7338COMPARE(64, 0x7ff)
9ee6e8bb 7339
e5a41ffa
PM
7340static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7341 int is_quiet, float_status *status)
f6714d36
AJ
7342{
7343 flag aSign, bSign;
7344
7345 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7346 ( extractFloatx80Frac( a )<<1 ) ) ||
7347 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7348 ( extractFloatx80Frac( b )<<1 ) )) {
7349 if (!is_quiet ||
7350 floatx80_is_signaling_nan( a ) ||
7351 floatx80_is_signaling_nan( b ) ) {
ff32e16e 7352 float_raise(float_flag_invalid, status);
f6714d36
AJ
7353 }
7354 return float_relation_unordered;
7355 }
7356 aSign = extractFloatx80Sign( a );
7357 bSign = extractFloatx80Sign( b );
7358 if ( aSign != bSign ) {
7359
7360 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7361 ( ( a.low | b.low ) == 0 ) ) {
7362 /* zero case */
7363 return float_relation_equal;
7364 } else {
7365 return 1 - (2 * aSign);
7366 }
7367 } else {
7368 if (a.low == b.low && a.high == b.high) {
7369 return float_relation_equal;
7370 } else {
7371 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7372 }
7373 }
7374}
7375
e5a41ffa 7376int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
f6714d36 7377{
ff32e16e 7378 return floatx80_compare_internal(a, b, 0, status);
f6714d36
AJ
7379}
7380
e5a41ffa 7381int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
f6714d36 7382{
ff32e16e 7383 return floatx80_compare_internal(a, b, 1, status);
f6714d36
AJ
7384}
7385
e5a41ffa
PM
7386static inline int float128_compare_internal(float128 a, float128 b,
7387 int is_quiet, float_status *status)
1f587329
BS
7388{
7389 flag aSign, bSign;
7390
7391 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7392 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7393 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7394 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7395 if (!is_quiet ||
7396 float128_is_signaling_nan( a ) ||
7397 float128_is_signaling_nan( b ) ) {
ff32e16e 7398 float_raise(float_flag_invalid, status);
1f587329
BS
7399 }
7400 return float_relation_unordered;
7401 }
7402 aSign = extractFloat128Sign( a );
7403 bSign = extractFloat128Sign( b );
7404 if ( aSign != bSign ) {
7405 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7406 /* zero case */
7407 return float_relation_equal;
7408 } else {
7409 return 1 - (2 * aSign);
7410 }
7411 } else {
7412 if (a.low == b.low && a.high == b.high) {
7413 return float_relation_equal;
7414 } else {
7415 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7416 }
7417 }
7418}
7419
e5a41ffa 7420int float128_compare(float128 a, float128 b, float_status *status)
1f587329 7421{
ff32e16e 7422 return float128_compare_internal(a, b, 0, status);
1f587329
BS
7423}
7424
e5a41ffa 7425int float128_compare_quiet(float128 a, float128 b, float_status *status)
1f587329 7426{
ff32e16e 7427 return float128_compare_internal(a, b, 1, status);
1f587329
BS
7428}
7429
274f1b04
PM
7430/* min() and max() functions. These can't be implemented as
7431 * 'compare and pick one input' because that would mishandle
7432 * NaNs and +0 vs -0.
e17ab310
WN
7433 *
7434 * minnum() and maxnum() functions. These are similar to the min()
7435 * and max() functions but if one of the arguments is a QNaN and
7436 * the other is numerical then the numerical argument is returned.
7437 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7438 * and maxNum() operations. min() and max() are the typical min/max
7439 * semantics provided by many CPUs which predate that specification.
2d31e060
LA
7440 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
274f1b04 7443 */
e70614ea 7444#define MINMAX(s) \
a49db98d 7445static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
2d31e060 7446 int ismin, int isieee, \
e5a41ffa
PM
7447 int ismag, \
7448 float_status *status) \
274f1b04
PM
7449{ \
7450 flag aSign, bSign; \
2d31e060 7451 uint ## s ## _t av, bv, aav, abv; \
ff32e16e
PM
7452 a = float ## s ## _squash_input_denormal(a, status); \
7453 b = float ## s ## _squash_input_denormal(b, status); \
274f1b04
PM
7454 if (float ## s ## _is_any_nan(a) || \
7455 float ## s ## _is_any_nan(b)) { \
e17ab310
WN
7456 if (isieee) { \
7457 if (float ## s ## _is_quiet_nan(a) && \
7458 !float ## s ##_is_any_nan(b)) { \
7459 return b; \
7460 } else if (float ## s ## _is_quiet_nan(b) && \
7461 !float ## s ## _is_any_nan(a)) { \
7462 return a; \
7463 } \
7464 } \
ff32e16e 7465 return propagateFloat ## s ## NaN(a, b, status); \
274f1b04
PM
7466 } \
7467 aSign = extractFloat ## s ## Sign(a); \
7468 bSign = extractFloat ## s ## Sign(b); \
7469 av = float ## s ## _val(a); \
7470 bv = float ## s ## _val(b); \
2d31e060
LA
7471 if (ismag) { \
7472 aav = float ## s ## _abs(av); \
7473 abv = float ## s ## _abs(bv); \
7474 if (aav != abv) { \
7475 if (ismin) { \
7476 return (aav < abv) ? a : b; \
7477 } else { \
7478 return (aav < abv) ? b : a; \
7479 } \
7480 } \
7481 } \
274f1b04
PM
7482 if (aSign != bSign) { \
7483 if (ismin) { \
7484 return aSign ? a : b; \
7485 } else { \
7486 return aSign ? b : a; \
7487 } \
7488 } else { \
7489 if (ismin) { \
7490 return (aSign ^ (av < bv)) ? a : b; \
7491 } else { \
7492 return (aSign ^ (av < bv)) ? b : a; \
7493 } \
7494 } \
7495} \
7496 \
e5a41ffa
PM
7497float ## s float ## s ## _min(float ## s a, float ## s b, \
7498 float_status *status) \
274f1b04 7499{ \
ff32e16e 7500 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
274f1b04
PM
7501} \
7502 \
e5a41ffa
PM
7503float ## s float ## s ## _max(float ## s a, float ## s b, \
7504 float_status *status) \
274f1b04 7505{ \
ff32e16e 7506 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
e17ab310
WN
7507} \
7508 \
e5a41ffa
PM
7509float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7510 float_status *status) \
e17ab310 7511{ \
ff32e16e 7512 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
e17ab310
WN
7513} \
7514 \
e5a41ffa
PM
7515float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7516 float_status *status) \
e17ab310 7517{ \
ff32e16e 7518 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
2d31e060
LA
7519} \
7520 \
e5a41ffa
PM
7521float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7522 float_status *status) \
2d31e060 7523{ \
ff32e16e 7524 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
2d31e060
LA
7525} \
7526 \
e5a41ffa
PM
7527float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7528 float_status *status) \
2d31e060 7529{ \
ff32e16e 7530 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
274f1b04
PM
7531}
7532
e70614ea
WN
7533MINMAX(32)
7534MINMAX(64)
274f1b04
PM
7535
7536
9ee6e8bb 7537/* Multiply A by 2 raised to the power N. */
e5a41ffa 7538float32 float32_scalbn(float32 a, int n, float_status *status)
9ee6e8bb
PB
7539{
7540 flag aSign;
326b9e98 7541 int16_t aExp;
bb98fe42 7542 uint32_t aSig;
9ee6e8bb 7543
ff32e16e 7544 a = float32_squash_input_denormal(a, status);
9ee6e8bb
PB
7545 aSig = extractFloat32Frac( a );
7546 aExp = extractFloat32Exp( a );
7547 aSign = extractFloat32Sign( a );
7548
7549 if ( aExp == 0xFF ) {
326b9e98 7550 if ( aSig ) {
ff32e16e 7551 return propagateFloat32NaN(a, a, status);
326b9e98 7552 }
9ee6e8bb
PB
7553 return a;
7554 }
3c85c37f 7555 if (aExp != 0) {
69397542 7556 aSig |= 0x00800000;
3c85c37f 7557 } else if (aSig == 0) {
69397542 7558 return a;
3c85c37f
PM
7559 } else {
7560 aExp++;
7561 }
69397542 7562
326b9e98
AJ
7563 if (n > 0x200) {
7564 n = 0x200;
7565 } else if (n < -0x200) {
7566 n = -0x200;
7567 }
7568
69397542
PB
7569 aExp += n - 1;
7570 aSig <<= 7;
ff32e16e 7571 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
9ee6e8bb
PB
7572}
7573
e5a41ffa 7574float64 float64_scalbn(float64 a, int n, float_status *status)
9ee6e8bb
PB
7575{
7576 flag aSign;
326b9e98 7577 int16_t aExp;
bb98fe42 7578 uint64_t aSig;
9ee6e8bb 7579
ff32e16e 7580 a = float64_squash_input_denormal(a, status);
9ee6e8bb
PB
7581 aSig = extractFloat64Frac( a );
7582 aExp = extractFloat64Exp( a );
7583 aSign = extractFloat64Sign( a );
7584
7585 if ( aExp == 0x7FF ) {
326b9e98 7586 if ( aSig ) {
ff32e16e 7587 return propagateFloat64NaN(a, a, status);
326b9e98 7588 }
9ee6e8bb
PB
7589 return a;
7590 }
3c85c37f 7591 if (aExp != 0) {
69397542 7592 aSig |= LIT64( 0x0010000000000000 );
3c85c37f 7593 } else if (aSig == 0) {
69397542 7594 return a;
3c85c37f
PM
7595 } else {
7596 aExp++;
7597 }
69397542 7598
326b9e98
AJ
7599 if (n > 0x1000) {
7600 n = 0x1000;
7601 } else if (n < -0x1000) {
7602 n = -0x1000;
7603 }
7604
69397542
PB
7605 aExp += n - 1;
7606 aSig <<= 10;
ff32e16e 7607 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
9ee6e8bb
PB
7608}
7609
e5a41ffa 7610floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
9ee6e8bb
PB
7611{
7612 flag aSign;
326b9e98 7613 int32_t aExp;
bb98fe42 7614 uint64_t aSig;
9ee6e8bb
PB
7615
7616 aSig = extractFloatx80Frac( a );
7617 aExp = extractFloatx80Exp( a );
7618 aSign = extractFloatx80Sign( a );
7619
326b9e98
AJ
7620 if ( aExp == 0x7FFF ) {
7621 if ( aSig<<1 ) {
ff32e16e 7622 return propagateFloatx80NaN(a, a, status);
326b9e98 7623 }
9ee6e8bb
PB
7624 return a;
7625 }
326b9e98 7626
3c85c37f
PM
7627 if (aExp == 0) {
7628 if (aSig == 0) {
7629 return a;
7630 }
7631 aExp++;
7632 }
69397542 7633
326b9e98
AJ
7634 if (n > 0x10000) {
7635 n = 0x10000;
7636 } else if (n < -0x10000) {
7637 n = -0x10000;
7638 }
7639
9ee6e8bb 7640 aExp += n;
69397542 7641 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
ff32e16e 7642 aSign, aExp, aSig, 0, status);
9ee6e8bb 7643}
9ee6e8bb 7644
e5a41ffa 7645float128 float128_scalbn(float128 a, int n, float_status *status)
9ee6e8bb
PB
7646{
7647 flag aSign;
326b9e98 7648 int32_t aExp;
bb98fe42 7649 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7650
7651 aSig1 = extractFloat128Frac1( a );
7652 aSig0 = extractFloat128Frac0( a );
7653 aExp = extractFloat128Exp( a );
7654 aSign = extractFloat128Sign( a );
7655 if ( aExp == 0x7FFF ) {
326b9e98 7656 if ( aSig0 | aSig1 ) {
ff32e16e 7657 return propagateFloat128NaN(a, a, status);
326b9e98 7658 }
9ee6e8bb
PB
7659 return a;
7660 }
3c85c37f 7661 if (aExp != 0) {
69397542 7662 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7663 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7664 return a;
3c85c37f
PM
7665 } else {
7666 aExp++;
7667 }
69397542 7668
326b9e98
AJ
7669 if (n > 0x10000) {
7670 n = 0x10000;
7671 } else if (n < -0x10000) {
7672 n = -0x10000;
7673 }
7674
69397542
PB
7675 aExp += n - 1;
7676 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
ff32e16e 7677 , status);
9ee6e8bb
PB
7678
7679}