]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
fpu: Remove use of int_fast16_t in conversions to int16
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
16017c48
PM
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
8d725fac 16 */
158142c2 17
a7d1ac78
PM
18/*
19===============================================================================
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
158142c2
FB
22
23Written by John R. Hauser. This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704. Funding was partially provided by the
26National Science Foundation under grant MIP-9311980. The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek. More information
a7d1ac78 30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
158142c2
FB
31arithmetic/SoftFloat.html'.
32
a7d1ac78
PM
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
158142c2
FB
38
39Derivative works are acceptable, even for commercial purposes, so long as
a7d1ac78
PM
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
158142c2 43
a7d1ac78
PM
44===============================================================================
45*/
158142c2 46
16017c48
PM
47/* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78/* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
2ac8bd03
PM
82/* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
d38ea87a 85#include "qemu/osdep.h"
2ac8bd03 86
6b4c305c 87#include "fpu/softfloat.h"
158142c2 88
dc355b76 89/* We only need stdlib for abort() */
dc355b76 90
158142c2
FB
91/*----------------------------------------------------------------------------
92| Primitive arithmetic functions, including multi-word arithmetic, and
93| division and square root approximations. (Can be specialized to target if
94| desired.)
95*----------------------------------------------------------------------------*/
96#include "softfloat-macros.h"
97
98/*----------------------------------------------------------------------------
99| Functions and definitions to determine: (1) whether tininess for underflow
100| is detected before or after rounding by default, (2) what (if anything)
101| happens when exceptions are raised, (3) how signaling NaNs are distinguished
102| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103| are propagated from function inputs to output. These details are target-
104| specific.
105*----------------------------------------------------------------------------*/
106#include "softfloat-specialize.h"
107
bb4d4bb3
PM
108/*----------------------------------------------------------------------------
109| Returns the fraction bits of the half-precision floating-point value `a'.
110*----------------------------------------------------------------------------*/
111
a49db98d 112static inline uint32_t extractFloat16Frac(float16 a)
bb4d4bb3
PM
113{
114 return float16_val(a) & 0x3ff;
115}
116
117/*----------------------------------------------------------------------------
118| Returns the exponent bits of the half-precision floating-point value `a'.
119*----------------------------------------------------------------------------*/
120
a49db98d 121static inline int_fast16_t extractFloat16Exp(float16 a)
bb4d4bb3
PM
122{
123 return (float16_val(a) >> 10) & 0x1f;
124}
125
126/*----------------------------------------------------------------------------
127| Returns the sign bit of the single-precision floating-point value `a'.
128*----------------------------------------------------------------------------*/
129
a49db98d 130static inline flag extractFloat16Sign(float16 a)
bb4d4bb3
PM
131{
132 return float16_val(a)>>15;
133}
134
158142c2
FB
135/*----------------------------------------------------------------------------
136| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137| and 7, and returns the properly rounded 32-bit integer corresponding to the
138| input. If `zSign' is 1, the input is negated before being converted to an
139| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
140| is simply rounded to an integer, with the inexact exception raised if the
141| input cannot be represented exactly as an integer. However, if the fixed-
142| point input is too large, the invalid exception is raised and the largest
143| positive or negative integer is returned.
144*----------------------------------------------------------------------------*/
145
f4014512 146static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
158142c2 147{
8f506c70 148 int8_t roundingMode;
158142c2 149 flag roundNearestEven;
8f506c70 150 int8_t roundIncrement, roundBits;
760e1416 151 int32_t z;
158142c2 152
a2f2d288 153 roundingMode = status->float_rounding_mode;
158142c2 154 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
155 switch (roundingMode) {
156 case float_round_nearest_even:
f9288a76 157 case float_round_ties_away:
dc355b76
PM
158 roundIncrement = 0x40;
159 break;
160 case float_round_to_zero:
161 roundIncrement = 0;
162 break;
163 case float_round_up:
164 roundIncrement = zSign ? 0 : 0x7f;
165 break;
166 case float_round_down:
167 roundIncrement = zSign ? 0x7f : 0;
168 break;
169 default:
170 abort();
158142c2
FB
171 }
172 roundBits = absZ & 0x7F;
173 absZ = ( absZ + roundIncrement )>>7;
174 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
175 z = absZ;
176 if ( zSign ) z = - z;
177 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
ff32e16e 178 float_raise(float_flag_invalid, status);
bb98fe42 179 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2 180 }
a2f2d288
PM
181 if (roundBits) {
182 status->float_exception_flags |= float_flag_inexact;
183 }
158142c2
FB
184 return z;
185
186}
187
188/*----------------------------------------------------------------------------
189| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190| `absZ1', with binary point between bits 63 and 64 (between the input words),
191| and returns the properly rounded 64-bit integer corresponding to the input.
192| If `zSign' is 1, the input is negated before being converted to an integer.
193| Ordinarily, the fixed-point input is simply rounded to an integer, with
194| the inexact exception raised if the input cannot be represented exactly as
195| an integer. However, if the fixed-point input is too large, the invalid
196| exception is raised and the largest positive or negative integer is
197| returned.
198*----------------------------------------------------------------------------*/
199
f42c2224 200static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
e5a41ffa 201 float_status *status)
158142c2 202{
8f506c70 203 int8_t roundingMode;
158142c2 204 flag roundNearestEven, increment;
760e1416 205 int64_t z;
158142c2 206
a2f2d288 207 roundingMode = status->float_rounding_mode;
158142c2 208 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
209 switch (roundingMode) {
210 case float_round_nearest_even:
f9288a76 211 case float_round_ties_away:
dc355b76
PM
212 increment = ((int64_t) absZ1 < 0);
213 break;
214 case float_round_to_zero:
215 increment = 0;
216 break;
217 case float_round_up:
218 increment = !zSign && absZ1;
219 break;
220 case float_round_down:
221 increment = zSign && absZ1;
222 break;
223 default:
224 abort();
158142c2
FB
225 }
226 if ( increment ) {
227 ++absZ0;
228 if ( absZ0 == 0 ) goto overflow;
bb98fe42 229 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
230 }
231 z = absZ0;
232 if ( zSign ) z = - z;
233 if ( z && ( ( z < 0 ) ^ zSign ) ) {
234 overflow:
ff32e16e 235 float_raise(float_flag_invalid, status);
158142c2 236 return
bb98fe42 237 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
238 : LIT64( 0x7FFFFFFFFFFFFFFF );
239 }
a2f2d288
PM
240 if (absZ1) {
241 status->float_exception_flags |= float_flag_inexact;
242 }
158142c2
FB
243 return z;
244
245}
246
fb3ea83a
TM
247/*----------------------------------------------------------------------------
248| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249| `absZ1', with binary point between bits 63 and 64 (between the input words),
250| and returns the properly rounded 64-bit unsigned integer corresponding to the
251| input. Ordinarily, the fixed-point input is simply rounded to an integer,
252| with the inexact exception raised if the input cannot be represented exactly
253| as an integer. However, if the fixed-point input is too large, the invalid
254| exception is raised and the largest unsigned integer is returned.
255*----------------------------------------------------------------------------*/
256
f42c2224 257static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
e5a41ffa 258 uint64_t absZ1, float_status *status)
fb3ea83a 259{
8f506c70 260 int8_t roundingMode;
fb3ea83a
TM
261 flag roundNearestEven, increment;
262
a2f2d288 263 roundingMode = status->float_rounding_mode;
fb3ea83a 264 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
265 switch (roundingMode) {
266 case float_round_nearest_even:
f9288a76 267 case float_round_ties_away:
dc355b76
PM
268 increment = ((int64_t)absZ1 < 0);
269 break;
270 case float_round_to_zero:
271 increment = 0;
272 break;
273 case float_round_up:
274 increment = !zSign && absZ1;
275 break;
276 case float_round_down:
277 increment = zSign && absZ1;
278 break;
279 default:
280 abort();
fb3ea83a
TM
281 }
282 if (increment) {
283 ++absZ0;
284 if (absZ0 == 0) {
ff32e16e 285 float_raise(float_flag_invalid, status);
fb3ea83a
TM
286 return LIT64(0xFFFFFFFFFFFFFFFF);
287 }
288 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
289 }
290
291 if (zSign && absZ0) {
ff32e16e 292 float_raise(float_flag_invalid, status);
fb3ea83a
TM
293 return 0;
294 }
295
296 if (absZ1) {
a2f2d288 297 status->float_exception_flags |= float_flag_inexact;
fb3ea83a
TM
298 }
299 return absZ0;
300}
301
158142c2
FB
302/*----------------------------------------------------------------------------
303| Returns the fraction bits of the single-precision floating-point value `a'.
304*----------------------------------------------------------------------------*/
305
a49db98d 306static inline uint32_t extractFloat32Frac( float32 a )
158142c2
FB
307{
308
f090c9d4 309 return float32_val(a) & 0x007FFFFF;
158142c2
FB
310
311}
312
313/*----------------------------------------------------------------------------
314| Returns the exponent bits of the single-precision floating-point value `a'.
315*----------------------------------------------------------------------------*/
316
a49db98d 317static inline int_fast16_t extractFloat32Exp(float32 a)
158142c2
FB
318{
319
f090c9d4 320 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
321
322}
323
324/*----------------------------------------------------------------------------
325| Returns the sign bit of the single-precision floating-point value `a'.
326*----------------------------------------------------------------------------*/
327
a49db98d 328static inline flag extractFloat32Sign( float32 a )
158142c2
FB
329{
330
f090c9d4 331 return float32_val(a)>>31;
158142c2
FB
332
333}
334
37d18660
PM
335/*----------------------------------------------------------------------------
336| If `a' is denormal and we are in flush-to-zero mode then set the
337| input-denormal exception and return zero. Otherwise just return the value.
338*----------------------------------------------------------------------------*/
e5a41ffa 339float32 float32_squash_input_denormal(float32 a, float_status *status)
37d18660 340{
a2f2d288 341 if (status->flush_inputs_to_zero) {
37d18660 342 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
ff32e16e 343 float_raise(float_flag_input_denormal, status);
37d18660
PM
344 return make_float32(float32_val(a) & 0x80000000);
345 }
346 }
347 return a;
348}
349
158142c2
FB
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision significand `aSig' by shifting it
| left until its leading one sits 8 bits below the top of the word (bit 23).
| The shifted significand is written to `*zSigPtr' and the matching exponent,
| 1 minus the shift distance, to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
{
    const int8_t lshift = countLeadingZeros32(aSig) - 8;

    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
367
368/*----------------------------------------------------------------------------
369| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370| single-precision floating-point value, returning the result. After being
371| shifted into the proper positions, the three fields are simply added
372| together to form the result. This means that any integer portion of `zSig'
373| will be added into the exponent. Since a properly normalized significand
374| will have an integer portion equal to 1, the `zExp' input should be 1 less
375| than the desired result exponent whenever `zSig' is a complete, normalized
376| significand.
377*----------------------------------------------------------------------------*/
378
a49db98d 379static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
158142c2
FB
380{
381
f090c9d4 382 return make_float32(
bb98fe42 383 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
384
385}
386
387/*----------------------------------------------------------------------------
388| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389| and significand `zSig', and returns the proper single-precision floating-
390| point value corresponding to the abstract input. Ordinarily, the abstract
391| value is simply rounded and packed into the single-precision format, with
392| the inexact exception raised if the abstract input cannot be represented
393| exactly. However, if the abstract value is too large, the overflow and
394| inexact exceptions are raised and an infinity or maximal finite value is
395| returned. If the abstract value is too small, the input value is rounded to
396| a subnormal number, and the underflow and inexact exceptions are raised if
397| the abstract input cannot be represented exactly as a subnormal single-
398| precision floating-point number.
399| The input significand `zSig' has its binary point between bits 30
400| and 29, which is 7 bits to the left of the usual location. This shifted
401| significand must be normalized or smaller. If `zSig' is not normalized,
402| `zExp' must be 0; in that case, the result returned is a subnormal number,
403| and it must not require rounding. In the usual case that `zSig' is
404| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405| The handling of underflow and overflow follows the IEC/IEEE Standard for
406| Binary Floating-Point Arithmetic.
407*----------------------------------------------------------------------------*/
408
e5a41ffa
PM
409static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
410 float_status *status)
158142c2 411{
8f506c70 412 int8_t roundingMode;
158142c2 413 flag roundNearestEven;
8f506c70 414 int8_t roundIncrement, roundBits;
158142c2
FB
415 flag isTiny;
416
a2f2d288 417 roundingMode = status->float_rounding_mode;
158142c2 418 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
419 switch (roundingMode) {
420 case float_round_nearest_even:
f9288a76 421 case float_round_ties_away:
dc355b76
PM
422 roundIncrement = 0x40;
423 break;
424 case float_round_to_zero:
425 roundIncrement = 0;
426 break;
427 case float_round_up:
428 roundIncrement = zSign ? 0 : 0x7f;
429 break;
430 case float_round_down:
431 roundIncrement = zSign ? 0x7f : 0;
432 break;
433 default:
434 abort();
435 break;
158142c2
FB
436 }
437 roundBits = zSig & 0x7F;
bb98fe42 438 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
439 if ( ( 0xFD < zExp )
440 || ( ( zExp == 0xFD )
bb98fe42 441 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 442 ) {
ff32e16e 443 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 444 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
445 }
446 if ( zExp < 0 ) {
a2f2d288 447 if (status->flush_to_zero) {
ff32e16e 448 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
449 return packFloat32(zSign, 0, 0);
450 }
158142c2 451 isTiny =
a2f2d288
PM
452 (status->float_detect_tininess
453 == float_tininess_before_rounding)
158142c2
FB
454 || ( zExp < -1 )
455 || ( zSig + roundIncrement < 0x80000000 );
456 shift32RightJamming( zSig, - zExp, &zSig );
457 zExp = 0;
458 roundBits = zSig & 0x7F;
ff32e16e
PM
459 if (isTiny && roundBits) {
460 float_raise(float_flag_underflow, status);
461 }
158142c2
FB
462 }
463 }
a2f2d288
PM
464 if (roundBits) {
465 status->float_exception_flags |= float_flag_inexact;
466 }
158142c2
FB
467 zSig = ( zSig + roundIncrement )>>7;
468 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
469 if ( zSig == 0 ) zExp = 0;
470 return packFloat32( zSign, zExp, zSig );
471
472}
473
474/*----------------------------------------------------------------------------
475| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476| and significand `zSig', and returns the proper single-precision floating-
477| point value corresponding to the abstract input. This routine is just like
478| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480| floating-point exponent.
481*----------------------------------------------------------------------------*/
482
483static float32
e5a41ffa
PM
484 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
485 float_status *status)
158142c2 486{
8f506c70 487 int8_t shiftCount;
158142c2
FB
488
489 shiftCount = countLeadingZeros32( zSig ) - 1;
ff32e16e
PM
490 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
491 status);
158142c2
FB
492
493}
494
495/*----------------------------------------------------------------------------
496| Returns the fraction bits of the double-precision floating-point value `a'.
497*----------------------------------------------------------------------------*/
498
a49db98d 499static inline uint64_t extractFloat64Frac( float64 a )
158142c2
FB
500{
501
f090c9d4 502 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
503
504}
505
506/*----------------------------------------------------------------------------
507| Returns the exponent bits of the double-precision floating-point value `a'.
508*----------------------------------------------------------------------------*/
509
a49db98d 510static inline int_fast16_t extractFloat64Exp(float64 a)
158142c2
FB
511{
512
f090c9d4 513 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
514
515}
516
517/*----------------------------------------------------------------------------
518| Returns the sign bit of the double-precision floating-point value `a'.
519*----------------------------------------------------------------------------*/
520
a49db98d 521static inline flag extractFloat64Sign( float64 a )
158142c2
FB
522{
523
f090c9d4 524 return float64_val(a)>>63;
158142c2
FB
525
526}
527
37d18660
PM
528/*----------------------------------------------------------------------------
529| If `a' is denormal and we are in flush-to-zero mode then set the
530| input-denormal exception and return zero. Otherwise just return the value.
531*----------------------------------------------------------------------------*/
e5a41ffa 532float64 float64_squash_input_denormal(float64 a, float_status *status)
37d18660 533{
a2f2d288 534 if (status->flush_inputs_to_zero) {
37d18660 535 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
ff32e16e 536 float_raise(float_flag_input_denormal, status);
37d18660
PM
537 return make_float64(float64_val(a) & (1ULL << 63));
538 }
539 }
540 return a;
541}
542
158142c2
FB
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision significand `aSig' by shifting it
| left until its leading one sits 11 bits below the top of the word (bit 52).
| The shifted significand is written to `*zSigPtr' and the matching exponent,
| 1 minus the shift distance, to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
{
    const int8_t lshift = countLeadingZeros64(aSig) - 11;

    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
560
561/*----------------------------------------------------------------------------
562| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563| double-precision floating-point value, returning the result. After being
564| shifted into the proper positions, the three fields are simply added
565| together to form the result. This means that any integer portion of `zSig'
566| will be added into the exponent. Since a properly normalized significand
567| will have an integer portion equal to 1, the `zExp' input should be 1 less
568| than the desired result exponent whenever `zSig' is a complete, normalized
569| significand.
570*----------------------------------------------------------------------------*/
571
a49db98d 572static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
158142c2
FB
573{
574
f090c9d4 575 return make_float64(
bb98fe42 576 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
577
578}
579
580/*----------------------------------------------------------------------------
581| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582| and significand `zSig', and returns the proper double-precision floating-
583| point value corresponding to the abstract input. Ordinarily, the abstract
584| value is simply rounded and packed into the double-precision format, with
585| the inexact exception raised if the abstract input cannot be represented
586| exactly. However, if the abstract value is too large, the overflow and
587| inexact exceptions are raised and an infinity or maximal finite value is
a7d1ac78
PM
588| returned. If the abstract value is too small, the input value is rounded to
589| a subnormal number, and the underflow and inexact exceptions are raised if
590| the abstract input cannot be represented exactly as a subnormal double-
158142c2
FB
591| precision floating-point number.
592| The input significand `zSig' has its binary point between bits 62
593| and 61, which is 10 bits to the left of the usual location. This shifted
594| significand must be normalized or smaller. If `zSig' is not normalized,
595| `zExp' must be 0; in that case, the result returned is a subnormal number,
596| and it must not require rounding. In the usual case that `zSig' is
597| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598| The handling of underflow and overflow follows the IEC/IEEE Standard for
599| Binary Floating-Point Arithmetic.
600*----------------------------------------------------------------------------*/
601
e5a41ffa
PM
602static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
603 float_status *status)
158142c2 604{
8f506c70 605 int8_t roundingMode;
158142c2 606 flag roundNearestEven;
94a49d86 607 int_fast16_t roundIncrement, roundBits;
158142c2
FB
608 flag isTiny;
609
a2f2d288 610 roundingMode = status->float_rounding_mode;
158142c2 611 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
612 switch (roundingMode) {
613 case float_round_nearest_even:
f9288a76 614 case float_round_ties_away:
dc355b76
PM
615 roundIncrement = 0x200;
616 break;
617 case float_round_to_zero:
618 roundIncrement = 0;
619 break;
620 case float_round_up:
621 roundIncrement = zSign ? 0 : 0x3ff;
622 break;
623 case float_round_down:
624 roundIncrement = zSign ? 0x3ff : 0;
625 break;
626 default:
627 abort();
158142c2
FB
628 }
629 roundBits = zSig & 0x3FF;
bb98fe42 630 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
631 if ( ( 0x7FD < zExp )
632 || ( ( zExp == 0x7FD )
bb98fe42 633 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 634 ) {
ff32e16e 635 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 636 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
637 }
638 if ( zExp < 0 ) {
a2f2d288 639 if (status->flush_to_zero) {
ff32e16e 640 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
641 return packFloat64(zSign, 0, 0);
642 }
158142c2 643 isTiny =
a2f2d288
PM
644 (status->float_detect_tininess
645 == float_tininess_before_rounding)
158142c2
FB
646 || ( zExp < -1 )
647 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
648 shift64RightJamming( zSig, - zExp, &zSig );
649 zExp = 0;
650 roundBits = zSig & 0x3FF;
ff32e16e
PM
651 if (isTiny && roundBits) {
652 float_raise(float_flag_underflow, status);
653 }
158142c2
FB
654 }
655 }
a2f2d288
PM
656 if (roundBits) {
657 status->float_exception_flags |= float_flag_inexact;
658 }
158142c2
FB
659 zSig = ( zSig + roundIncrement )>>10;
660 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
661 if ( zSig == 0 ) zExp = 0;
662 return packFloat64( zSign, zExp, zSig );
663
664}
665
666/*----------------------------------------------------------------------------
667| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
668| and significand `zSig', and returns the proper double-precision floating-
669| point value corresponding to the abstract input. This routine is just like
670| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
671| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
672| floating-point exponent.
673*----------------------------------------------------------------------------*/
674
675static float64
e5a41ffa
PM
676 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
677 float_status *status)
158142c2 678{
8f506c70 679 int8_t shiftCount;
158142c2
FB
680
681 shiftCount = countLeadingZeros64( zSig ) - 1;
ff32e16e
PM
682 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
683 status);
158142c2
FB
684
685}
686
158142c2
FB
687/*----------------------------------------------------------------------------
688| Returns the fraction bits of the extended double-precision floating-point
689| value `a'.
690*----------------------------------------------------------------------------*/
691
a49db98d 692static inline uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
693{
694
695 return a.low;
696
697}
698
699/*----------------------------------------------------------------------------
700| Returns the exponent bits of the extended double-precision floating-point
701| value `a'.
702*----------------------------------------------------------------------------*/
703
f4014512 704static inline int32_t extractFloatx80Exp( floatx80 a )
158142c2
FB
705{
706
707 return a.high & 0x7FFF;
708
709}
710
711/*----------------------------------------------------------------------------
712| Returns the sign bit of the extended double-precision floating-point value
713| `a'.
714*----------------------------------------------------------------------------*/
715
a49db98d 716static inline flag extractFloatx80Sign( floatx80 a )
158142c2
FB
717{
718
719 return a.high>>15;
720
721}
722
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision significand `aSig' by
| shifting it left until its leading one reaches bit 63.  The shifted
| significand is written to `*zSigPtr' and the matching exponent, 1 minus the
| shift distance, to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr)
{
    const int8_t lshift = countLeadingZeros64(aSig);

    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
740
741/*----------------------------------------------------------------------------
742| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
743| extended double-precision floating-point value, returning the result.
744*----------------------------------------------------------------------------*/
745
f4014512 746static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
158142c2
FB
747{
748 floatx80 z;
749
750 z.low = zSig;
bb98fe42 751 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
752 return z;
753
754}
755
756/*----------------------------------------------------------------------------
757| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
758| and extended significand formed by the concatenation of `zSig0' and `zSig1',
759| and returns the proper extended double-precision floating-point value
760| corresponding to the abstract input. Ordinarily, the abstract value is
761| rounded and packed into the extended double-precision format, with the
762| inexact exception raised if the abstract input cannot be represented
763| exactly. However, if the abstract value is too large, the overflow and
764| inexact exceptions are raised and an infinity or maximal finite value is
765| returned. If the abstract value is too small, the input value is rounded to
766| a subnormal number, and the underflow and inexact exceptions are raised if
767| the abstract input cannot be represented exactly as a subnormal extended
768| double-precision floating-point number.
769| If `roundingPrecision' is 32 or 64, the result is rounded to the same
770| number of bits as single or double precision, respectively. Otherwise, the
771| result is rounded to the full precision of the extended double-precision
772| format.
773| The input significand must be normalized or smaller. If the input
774| significand is not normalized, `zExp' must be 0; in that case, the result
775| returned is a subnormal number, and it must not require rounding. The
776| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
777| Floating-Point Arithmetic.
778*----------------------------------------------------------------------------*/
779
8f506c70 780static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
f4014512 781 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
e5a41ffa 782 float_status *status)
158142c2 783{
8f506c70 784 int8_t roundingMode;
158142c2 785 flag roundNearestEven, increment, isTiny;
f42c2224 786 int64_t roundIncrement, roundMask, roundBits;
158142c2 787
a2f2d288 788 roundingMode = status->float_rounding_mode;
158142c2
FB
789 roundNearestEven = ( roundingMode == float_round_nearest_even );
 /* Only 64 and 32 select the reduced-precision path below; 80 and every
  * other value round the full 64-bit significand at `precision80'. */
790 if ( roundingPrecision == 80 ) goto precision80;
791 if ( roundingPrecision == 64 ) {
 /* roundMask covers the low 11 bits of zSig0 that are discarded;
  * roundIncrement is half of (roundMask + 1), the round-to-nearest bias. */
792 roundIncrement = LIT64( 0x0000000000000400 );
793 roundMask = LIT64( 0x00000000000007FF );
794 }
795 else if ( roundingPrecision == 32 ) {
 /* Same scheme, with the low 40 bits of zSig0 discarded. */
796 roundIncrement = LIT64( 0x0000008000000000 );
797 roundMask = LIT64( 0x000000FFFFFFFFFF );
798 }
799 else {
800 goto precision80;
801 }
 /* Fold any nonzero zSig1 into the least-significant bit of zSig0 so the
  * extra word is still seen as "inexact" by the rounding below. */
802 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
 /* Nearest modes keep the half-unit bias; directed modes replace it with
  * either 0 (truncate) or the full mask (round away from zero). */
803 switch (roundingMode) {
804 case float_round_nearest_even:
f9288a76 805 case float_round_ties_away:
dc355b76
PM
806 break;
807 case float_round_to_zero:
808 roundIncrement = 0;
809 break;
810 case float_round_up:
811 roundIncrement = zSign ? 0 : roundMask;
812 break;
813 case float_round_down:
814 roundIncrement = zSign ? roundMask : 0;
815 break;
816 default:
817 abort();
158142c2
FB
818 }
819 roundBits = zSig0 & roundMask;
 /* Unsigned compare: true when zExp <= 0 (possible underflow) or
  * zExp >= 0x7FFE (possible overflow). */
bb98fe42 820 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
 /* Overflow if the exponent is already too large, or if it is the largest
  * finite exponent and adding the increment wraps zSig0 (carry out). */
821 if ( ( 0x7FFE < zExp )
822 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
823 ) {
 /* Jumps into the precision80 branch below; roundMask still holds the
  * reduced-precision mask there. */
824 goto overflow;
825 }
826 if ( zExp <= 0 ) {
 /* Flush-to-zero: return a signed zero and raise output_denormal. */
a2f2d288 827 if (status->flush_to_zero) {
ff32e16e 828 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
829 return packFloatx80(zSign, 0, 0);
830 }
 /* Tininess is decided before the significand is shifted. */
158142c2 831 isTiny =
a2f2d288
PM
832 (status->float_detect_tininess
833 == float_tininess_before_rounding)
158142c2
FB
834 || ( zExp < 0 )
835 || ( zSig0 <= zSig0 + roundIncrement );
 /* Shift right by 1 - zExp so the value is expressed with exponent 0;
  * shift64RightJamming is defined elsewhere (presumably keeps a sticky
  * bit for the bits shifted out -- confirm against its definition). */
836 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
837 zExp = 0;
838 roundBits = zSig0 & roundMask;
 /* Underflow is raised only when the result is both tiny and inexact. */
ff32e16e
PM
839 if (isTiny && roundBits) {
840 float_raise(float_flag_underflow, status);
841 }
a2f2d288
PM
842 if (roundBits) {
843 status->float_exception_flags |= float_flag_inexact;
844 }
158142c2 845 zSig0 += roundIncrement;
 /* Bit 63 set after rounding: the result gets exponent 1 instead of 0. */
bb98fe42 846 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
 /* On an exact tie under nearest-even, widen the mask by one bit so the
  * lowest retained bit is cleared as well. */
847 roundIncrement = roundMask + 1;
848 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
849 roundMask |= roundIncrement;
850 }
851 zSig0 &= ~ roundMask;
852 return packFloatx80( zSign, zExp, zSig0 );
853 }
854 }
 /* Normal-range result at reduced precision. */
a2f2d288
PM
855 if (roundBits) {
856 status->float_exception_flags |= float_flag_inexact;
857 }
158142c2
FB
858 zSig0 += roundIncrement;
 /* The addition wrapped: bump the exponent and set the significand to
  * just its top bit. */
859 if ( zSig0 < roundIncrement ) {
860 ++zExp;
861 zSig0 = LIT64( 0x8000000000000000 );
862 }
 /* Same tie-to-even mask widening as in the subnormal path above. */
863 roundIncrement = roundMask + 1;
864 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
865 roundMask |= roundIncrement;
866 }
867 zSig0 &= ~ roundMask;
 /* A zero significand is packed with a zero exponent. */
868 if ( zSig0 == 0 ) zExp = 0;
869 return packFloatx80( zSign, zExp, zSig0 );
 /* Full-precision path: zSig0 is kept whole and zSig1 decides the
  * rounding. */
870 precision80:
dc355b76
PM
 /* Nearest modes increment when the top bit of zSig1 is set; directed
  * modes increment on any nonzero zSig1 in the matching sign direction. */
871 switch (roundingMode) {
872 case float_round_nearest_even:
f9288a76 873 case float_round_ties_away:
dc355b76
PM
874 increment = ((int64_t)zSig1 < 0);
875 break;
876 case float_round_to_zero:
877 increment = 0;
878 break;
879 case float_round_up:
880 increment = !zSign && zSig1;
881 break;
882 case float_round_down:
883 increment = zSign && zSig1;
884 break;
885 default:
886 abort();
158142c2 887 }
 /* Same unsigned range test as above: zExp <= 0 or zExp >= 0x7FFE. */
bb98fe42 888 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
889 if ( ( 0x7FFE < zExp )
890 || ( ( zExp == 0x7FFE )
891 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
892 && increment
893 )
894 ) {
 /* With roundMask cleared, ~roundMask below is an all-ones significand. */
895 roundMask = 0;
 /* Also reached by `goto overflow' from the reduced-precision path. */
896 overflow:
ff32e16e 897 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
 /* Modes that round toward zero for this sign return the value with
  * exponent 0x7FFE and significand ~roundMask; the rest return exponent
  * 0x7FFF with only the top significand bit set. */
898 if ( ( roundingMode == float_round_to_zero )
899 || ( zSign && ( roundingMode == float_round_up ) )
900 || ( ! zSign && ( roundingMode == float_round_down ) )
901 ) {
902 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
903 }
904 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
905 }
906 if ( zExp <= 0 ) {
 /* NOTE(review): unlike the reduced-precision path, this branch does not
  * test status->flush_to_zero. */
907 isTiny =
a2f2d288
PM
908 (status->float_detect_tininess
909 == float_tininess_before_rounding)
158142c2
FB
910 || ( zExp < 0 )
911 || ! increment
912 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 /* Shift the zSig0:zSig1 pair right by 1 - zExp so the value is
  * expressed with exponent 0. */
913 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
914 zExp = 0;
ff32e16e
PM
915 if (isTiny && zSig1) {
916 float_raise(float_flag_underflow, status);
917 }
a2f2d288
PM
918 if (zSig1) {
919 status->float_exception_flags |= float_flag_inexact;
920 }
dc355b76
PM
 /* zSig1 changed in the shift, so the increment decision is redone. */
921 switch (roundingMode) {
922 case float_round_nearest_even:
f9288a76 923 case float_round_ties_away:
dc355b76
PM
924 increment = ((int64_t)zSig1 < 0);
925 break;
926 case float_round_to_zero:
927 increment = 0;
928 break;
929 case float_round_up:
930 increment = !zSign && zSig1;
931 break;
932 case float_round_down:
933 increment = zSign && zSig1;
934 break;
935 default:
936 abort();
158142c2
FB
937 }
938 if ( increment ) {
939 ++zSig0;
 /* Clears bit 0 of zSig0 when zSig1 has no bits below its top bit and
  * the mode is nearest-even (exact tie rounds to even). */
940 zSig0 &=
bb98fe42
AF
941 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 /* Bit 63 set after the increment: the result gets exponent 1. */
942 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
943 }
944 return packFloatx80( zSign, zExp, zSig0 );
945 }
946 }
 /* Normal-range result at full precision. */
a2f2d288
PM
947 if (zSig1) {
948 status->float_exception_flags |= float_flag_inexact;
949 }
158142c2
FB
950 if ( increment ) {
951 ++zSig0;
 /* The increment wrapped zSig0 to zero: bump the exponent and set the
  * significand to just its top bit. */
952 if ( zSig0 == 0 ) {
953 ++zExp;
954 zSig0 = LIT64( 0x8000000000000000 );
955 }
956 else {
 /* Exact tie under nearest-even: clear bit 0. */
bb98fe42 957 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
958 }
959 }
960 else {
 /* A zero significand is packed with a zero exponent. */
961 if ( zSig0 == 0 ) zExp = 0;
962 }
963 return packFloatx80( zSign, zExp, zSig0 );
964
965}
966
967/*----------------------------------------------------------------------------
968| Takes an abstract floating-point value having sign `zSign', exponent
969| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
970| and returns the proper extended double-precision floating-point value
971| corresponding to the abstract input. This routine is just like
972| `roundAndPackFloatx80' except that the input significand does not have to be
973| normalized.
974*----------------------------------------------------------------------------*/
975
8f506c70 976static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
f4014512 977 flag zSign, int32_t zExp,
e5a41ffa
PM
978 uint64_t zSig0, uint64_t zSig1,
979 float_status *status)
158142c2 980{
8f506c70 981 int8_t shiftCount;
158142c2
FB
982
983 if ( zSig0 == 0 ) {
984 zSig0 = zSig1;
985 zSig1 = 0;
986 zExp -= 64;
987 }
988 shiftCount = countLeadingZeros64( zSig0 );
989 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
990 zExp -= shiftCount;
ff32e16e
PM
991 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
992 zSig0, zSig1, status);
158142c2
FB
993
994}
995
158142c2
FB
996/*----------------------------------------------------------------------------
997| Returns the least-significant 64 fraction bits of the quadruple-precision
998| floating-point value `a'.
999*----------------------------------------------------------------------------*/
1000
a49db98d 1001static inline uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
1002{
1003
1004 return a.low;
1005
1006}
1007
1008/*----------------------------------------------------------------------------
1009| Returns the most-significant 48 fraction bits of the quadruple-precision
1010| floating-point value `a'.
1011*----------------------------------------------------------------------------*/
1012
a49db98d 1013static inline uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
1014{
1015
1016 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1017
1018}
1019
1020/*----------------------------------------------------------------------------
1021| Returns the exponent bits of the quadruple-precision floating-point value
1022| `a'.
1023*----------------------------------------------------------------------------*/
1024
f4014512 1025static inline int32_t extractFloat128Exp( float128 a )
158142c2
FB
1026{
1027
1028 return ( a.high>>48 ) & 0x7FFF;
1029
1030}
1031
1032/*----------------------------------------------------------------------------
1033| Returns the sign bit of the quadruple-precision floating-point value `a'.
1034*----------------------------------------------------------------------------*/
1035
a49db98d 1036static inline flag extractFloat128Sign( float128 a )
158142c2
FB
1037{
1038
1039 return a.high>>63;
1040
1041}
1042
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision significand formed by the
| concatenation of `aSig0' and `aSig1'.  The significand is shifted so that
| its leading set bit lands at bit 48 of the high word (15 leading zeros
| remain).  The high word of the result is stored through `zSig0Ptr', the
| low word through `zSig1Ptr', and the matching exponent through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr,
                                       uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading bit is in the high word: one short 128-bit left shift. */
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word is empty: the low word supplies every significand bit. */
    shift = countLeadingZeros64(aSig1) - 15;
    if (shift >= 0) {
        /* Everything fits into the high word of the result. */
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Fewer than 15 leading zeros: split aSig1 across both words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
1083
1084/*----------------------------------------------------------------------------
1085| Packs the sign `zSign', the exponent `zExp', and the significand formed
1086| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087| floating-point value, returning the result. After being shifted into the
1088| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089| added together to form the most significant 32 bits of the result. This
1090| means that any integer portion of `zSig0' will be added into the exponent.
1091| Since a properly normalized significand will have an integer portion equal
1092| to 1, the `zExp' input should be 1 less than the desired result exponent
1093| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1094| significand.
1095*----------------------------------------------------------------------------*/
1096
a49db98d 1097static inline float128
f4014512 1098 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
1099{
1100 float128 z;
1101
1102 z.low = zSig1;
bb98fe42 1103 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
1104 return z;
1105
1106}
1107
1108/*----------------------------------------------------------------------------
1109| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110| and extended significand formed by the concatenation of `zSig0', `zSig1',
1111| and `zSig2', and returns the proper quadruple-precision floating-point value
1112| corresponding to the abstract input. Ordinarily, the abstract value is
1113| simply rounded and packed into the quadruple-precision format, with the
1114| inexact exception raised if the abstract input cannot be represented
1115| exactly. However, if the abstract value is too large, the overflow and
1116| inexact exceptions are raised and an infinity or maximal finite value is
1117| returned. If the abstract value is too small, the input value is rounded to
1118| a subnormal number, and the underflow and inexact exceptions are raised if
1119| the abstract input cannot be represented exactly as a subnormal quadruple-
1120| precision floating-point number.
1121| The input significand must be normalized or smaller. If the input
1122| significand is not normalized, `zExp' must be 0; in that case, the result
1123| returned is a subnormal number, and it must not require rounding. In the
1124| usual case that the input significand is normalized, `zExp' must be 1 less
1125| than the ``true'' floating-point exponent. The handling of underflow and
1126| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127*----------------------------------------------------------------------------*/
1128
f4014512 1129static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
1130 uint64_t zSig0, uint64_t zSig1,
1131 uint64_t zSig2, float_status *status)
158142c2 1132{
8f506c70 1133 int8_t roundingMode;
158142c2
FB
1134 flag roundNearestEven, increment, isTiny;
1135
a2f2d288 1136 roundingMode = status->float_rounding_mode;
158142c2 1137 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
 /* zSig2 holds the bits below the retained significand.  Nearest modes
  * increment when its top bit is set; directed modes increment on any
  * nonzero zSig2 in the matching sign direction. */
1138 switch (roundingMode) {
1139 case float_round_nearest_even:
f9288a76 1140 case float_round_ties_away:
dc355b76
PM
1141 increment = ((int64_t)zSig2 < 0);
1142 break;
1143 case float_round_to_zero:
1144 increment = 0;
1145 break;
1146 case float_round_up:
1147 increment = !zSign && zSig2;
1148 break;
1149 case float_round_down:
1150 increment = zSign && zSig2;
1151 break;
1152 default:
1153 abort();
158142c2 1154 }
 /* Unsigned compare: true when zExp >= 0x7FFD (possible overflow) or
  * zExp < 0 (possible underflow). */
bb98fe42 1155 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
 /* Overflow if the exponent is already too large, or if it is 0x7FFD,
  * the significand is all ones and rounding will increment it. */
1156 if ( ( 0x7FFD < zExp )
1157 || ( ( zExp == 0x7FFD )
1158 && eq128(
1159 LIT64( 0x0001FFFFFFFFFFFF ),
1160 LIT64( 0xFFFFFFFFFFFFFFFF ),
1161 zSig0,
1162 zSig1
1163 )
1164 && increment
1165 )
1166 ) {
ff32e16e 1167 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
 /* Modes that round toward zero for this sign return exponent 0x7FFE
  * with an all-ones fraction; the rest return exponent 0x7FFF with a
  * zero fraction. */
1168 if ( ( roundingMode == float_round_to_zero )
1169 || ( zSign && ( roundingMode == float_round_up ) )
1170 || ( ! zSign && ( roundingMode == float_round_down ) )
1171 ) {
1172 return
1173 packFloat128(
1174 zSign,
1175 0x7FFE,
1176 LIT64( 0x0000FFFFFFFFFFFF ),
1177 LIT64( 0xFFFFFFFFFFFFFFFF )
1178 );
1179 }
1180 return packFloat128( zSign, 0x7FFF, 0, 0 );
1181 }
1182 if ( zExp < 0 ) {
 /* Flush-to-zero: return a signed zero and raise output_denormal. */
a2f2d288 1183 if (status->flush_to_zero) {
ff32e16e 1184 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1185 return packFloat128(zSign, 0, 0, 0);
1186 }
 /* Tininess is decided before the significand is shifted. */
158142c2 1187 isTiny =
a2f2d288
PM
1188 (status->float_detect_tininess
1189 == float_tininess_before_rounding)
158142c2
FB
1190 || ( zExp < -1 )
1191 || ! increment
1192 || lt128(
1193 zSig0,
1194 zSig1,
1195 LIT64( 0x0001FFFFFFFFFFFF ),
1196 LIT64( 0xFFFFFFFFFFFFFFFF )
1197 );
 /* Shift the three-word significand right by -zExp so the value is
  * expressed with exponent 0. */
1198 shift128ExtraRightJamming(
1199 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1200 zExp = 0;
 /* Underflow is raised only when the result is both tiny and inexact. */
ff32e16e
PM
1201 if (isTiny && zSig2) {
1202 float_raise(float_flag_underflow, status);
1203 }
dc355b76
PM
 /* zSig2 changed in the shift, so the increment decision is redone. */
1204 switch (roundingMode) {
1205 case float_round_nearest_even:
f9288a76 1206 case float_round_ties_away:
dc355b76
PM
1207 increment = ((int64_t)zSig2 < 0);
1208 break;
1209 case float_round_to_zero:
1210 increment = 0;
1211 break;
1212 case float_round_up:
1213 increment = !zSign && zSig2;
1214 break;
1215 case float_round_down:
1216 increment = zSign && zSig2;
1217 break;
1218 default:
1219 abort();
158142c2
FB
1220 }
1221 }
1222 }
 /* Any nonzero discarded bits make the result inexact. */
a2f2d288
PM
1223 if (zSig2) {
1224 status->float_exception_flags |= float_flag_inexact;
1225 }
158142c2
FB
1226 if ( increment ) {
 /* Add 1 to the 128-bit significand zSig0:zSig1. */
1227 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 /* Clears bit 0 of zSig1 when zSig2 has no bits below its top bit and
  * the mode is nearest-even (exact tie rounds to even). */
1228 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1229 }
1230 else {
 /* A zero significand is packed with a zero exponent. */
1231 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1232 }
 /* packFloat128 adds its fields, so an integer bit in zSig0 carries into
  * the exponent. */
1233 return packFloat128( zSign, zExp, zSig0, zSig1 );
1234
1235}
1236
1237/*----------------------------------------------------------------------------
1238| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1239| and significand formed by the concatenation of `zSig0' and `zSig1', and
1240| returns the proper quadruple-precision floating-point value corresponding
1241| to the abstract input. This routine is just like `roundAndPackFloat128'
1242| except that the input significand has fewer bits and does not have to be
1243| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1244| point exponent.
1245*----------------------------------------------------------------------------*/
1246
f4014512 1247static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
1248 uint64_t zSig0, uint64_t zSig1,
1249 float_status *status)
158142c2 1250{
8f506c70 1251 int8_t shiftCount;
bb98fe42 1252 uint64_t zSig2;
158142c2
FB
1253
1254 if ( zSig0 == 0 ) {
1255 zSig0 = zSig1;
1256 zSig1 = 0;
1257 zExp -= 64;
1258 }
1259 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1260 if ( 0 <= shiftCount ) {
1261 zSig2 = 0;
1262 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1263 }
1264 else {
1265 shift128ExtraRightJamming(
1266 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1267 }
1268 zExp -= shiftCount;
ff32e16e 1269 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
1270
1271}
1272
158142c2
FB
1273/*----------------------------------------------------------------------------
1274| Returns the result of converting the 32-bit two's complement integer `a'
1275| to the single-precision floating-point format. The conversion is performed
1276| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277*----------------------------------------------------------------------------*/
1278
e5a41ffa 1279float32 int32_to_float32(int32_t a, float_status *status)
158142c2
FB
1280{
1281 flag zSign;
1282
f090c9d4 1283 if ( a == 0 ) return float32_zero;
bb98fe42 1284 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2 1285 zSign = ( a < 0 );
ff32e16e 1286 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
158142c2
FB
1287}
1288
1289/*----------------------------------------------------------------------------
1290| Returns the result of converting the 32-bit two's complement integer `a'
1291| to the double-precision floating-point format. The conversion is performed
1292| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293*----------------------------------------------------------------------------*/
1294
e5a41ffa 1295float64 int32_to_float64(int32_t a, float_status *status)
158142c2
FB
1296{
1297 flag zSign;
3a87d009 1298 uint32_t absA;
8f506c70 1299 int8_t shiftCount;
bb98fe42 1300 uint64_t zSig;
158142c2 1301
f090c9d4 1302 if ( a == 0 ) return float64_zero;
158142c2
FB
1303 zSign = ( a < 0 );
1304 absA = zSign ? - a : a;
1305 shiftCount = countLeadingZeros32( absA ) + 21;
1306 zSig = absA;
1307 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1308
1309}
1310
158142c2
FB
1311/*----------------------------------------------------------------------------
1312| Returns the result of converting the 32-bit two's complement integer `a'
1313| to the extended double-precision floating-point format. The conversion
1314| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1315| Arithmetic.
1316*----------------------------------------------------------------------------*/
1317
e5a41ffa 1318floatx80 int32_to_floatx80(int32_t a, float_status *status)
158142c2
FB
1319{
1320 flag zSign;
3a87d009 1321 uint32_t absA;
8f506c70 1322 int8_t shiftCount;
bb98fe42 1323 uint64_t zSig;
158142c2
FB
1324
1325 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1326 zSign = ( a < 0 );
1327 absA = zSign ? - a : a;
1328 shiftCount = countLeadingZeros32( absA ) + 32;
1329 zSig = absA;
1330 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1331
1332}
1333
158142c2
FB
1334/*----------------------------------------------------------------------------
1335| Returns the result of converting the 32-bit two's complement integer `a' to
1336| the quadruple-precision floating-point format. The conversion is performed
1337| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338*----------------------------------------------------------------------------*/
1339
e5a41ffa 1340float128 int32_to_float128(int32_t a, float_status *status)
158142c2
FB
1341{
1342 flag zSign;
3a87d009 1343 uint32_t absA;
8f506c70 1344 int8_t shiftCount;
bb98fe42 1345 uint64_t zSig0;
158142c2
FB
1346
1347 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1348 zSign = ( a < 0 );
1349 absA = zSign ? - a : a;
1350 shiftCount = countLeadingZeros32( absA ) + 17;
1351 zSig0 = absA;
1352 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1353
1354}
1355
158142c2
FB
1356/*----------------------------------------------------------------------------
1357| Returns the result of converting the 64-bit two's complement integer `a'
1358| to the single-precision floating-point format. The conversion is performed
1359| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1360*----------------------------------------------------------------------------*/
1361
e5a41ffa 1362float32 int64_to_float32(int64_t a, float_status *status)
158142c2
FB
1363{
1364 flag zSign;
182f42fd 1365 uint64_t absA;
8f506c70 1366 int8_t shiftCount;
158142c2 1367
f090c9d4 1368 if ( a == 0 ) return float32_zero;
158142c2
FB
1369 zSign = ( a < 0 );
1370 absA = zSign ? - a : a;
1371 shiftCount = countLeadingZeros64( absA ) - 40;
1372 if ( 0 <= shiftCount ) {
1373 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1374 }
1375 else {
1376 shiftCount += 7;
1377 if ( shiftCount < 0 ) {
1378 shift64RightJamming( absA, - shiftCount, &absA );
1379 }
1380 else {
1381 absA <<= shiftCount;
1382 }
ff32e16e 1383 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
158142c2
FB
1384 }
1385
1386}
1387
1388/*----------------------------------------------------------------------------
1389| Returns the result of converting the 64-bit two's complement integer `a'
1390| to the double-precision floating-point format. The conversion is performed
1391| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1392*----------------------------------------------------------------------------*/
1393
e5a41ffa 1394float64 int64_to_float64(int64_t a, float_status *status)
158142c2
FB
1395{
1396 flag zSign;
1397
f090c9d4 1398 if ( a == 0 ) return float64_zero;
bb98fe42 1399 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1400 return packFloat64( 1, 0x43E, 0 );
1401 }
1402 zSign = ( a < 0 );
ff32e16e 1403 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
158142c2
FB
1404}
1405
158142c2
FB
1406/*----------------------------------------------------------------------------
1407| Returns the result of converting the 64-bit two's complement integer `a'
1408| to the extended double-precision floating-point format. The conversion
1409| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1410| Arithmetic.
1411*----------------------------------------------------------------------------*/
1412
e5a41ffa 1413floatx80 int64_to_floatx80(int64_t a, float_status *status)
158142c2
FB
1414{
1415 flag zSign;
182f42fd 1416 uint64_t absA;
8f506c70 1417 int8_t shiftCount;
158142c2
FB
1418
1419 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1420 zSign = ( a < 0 );
1421 absA = zSign ? - a : a;
1422 shiftCount = countLeadingZeros64( absA );
1423 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1424
1425}
1426
158142c2
FB
1427/*----------------------------------------------------------------------------
1428| Returns the result of converting the 64-bit two's complement integer `a' to
1429| the quadruple-precision floating-point format. The conversion is performed
1430| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1431*----------------------------------------------------------------------------*/
1432
e5a41ffa 1433float128 int64_to_float128(int64_t a, float_status *status)
158142c2
FB
1434{
1435 flag zSign;
182f42fd 1436 uint64_t absA;
8f506c70 1437 int8_t shiftCount;
f4014512 1438 int32_t zExp;
bb98fe42 1439 uint64_t zSig0, zSig1;
158142c2
FB
1440
1441 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1442 zSign = ( a < 0 );
1443 absA = zSign ? - a : a;
1444 shiftCount = countLeadingZeros64( absA ) + 49;
1445 zExp = 0x406E - shiftCount;
1446 if ( 64 <= shiftCount ) {
1447 zSig1 = 0;
1448 zSig0 = absA;
1449 shiftCount -= 64;
1450 }
1451 else {
1452 zSig1 = absA;
1453 zSig0 = 0;
1454 }
1455 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1456 return packFloat128( zSign, zExp, zSig0, zSig1 );
1457
1458}
1459
6bb8e0f1
PM
1460/*----------------------------------------------------------------------------
1461| Returns the result of converting the 64-bit unsigned integer `a'
1462| to the single-precision floating-point format. The conversion is performed
1463| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1464*----------------------------------------------------------------------------*/
1465
e5a41ffa 1466float32 uint64_to_float32(uint64_t a, float_status *status)
6bb8e0f1
PM
1467{
1468 int shiftcount;
1469
1470 if (a == 0) {
1471 return float32_zero;
1472 }
1473
1474 /* Determine (left) shift needed to put first set bit into bit posn 23
1475 * (since packFloat32() expects the binary point between bits 23 and 22);
1476 * this is the fast case for smallish numbers.
1477 */
1478 shiftcount = countLeadingZeros64(a) - 40;
1479 if (shiftcount >= 0) {
1480 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1481 }
1482 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1483 * expects the binary point between bits 30 and 29, hence the + 7.
1484 */
1485 shiftcount += 7;
1486 if (shiftcount < 0) {
1487 shift64RightJamming(a, -shiftcount, &a);
1488 } else {
1489 a <<= shiftcount;
1490 }
1491
ff32e16e 1492 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
6bb8e0f1
PM
1493}
1494
1495/*----------------------------------------------------------------------------
1496| Returns the result of converting the 64-bit unsigned integer `a'
1497| to the double-precision floating-point format. The conversion is performed
1498| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499*----------------------------------------------------------------------------*/
1500
e5a41ffa 1501float64 uint64_to_float64(uint64_t a, float_status *status)
6bb8e0f1
PM
1502{
1503 int exp = 0x43C;
1504 int shiftcount;
1505
1506 if (a == 0) {
1507 return float64_zero;
1508 }
1509
1510 shiftcount = countLeadingZeros64(a) - 1;
1511 if (shiftcount < 0) {
1512 shift64RightJamming(a, -shiftcount, &a);
1513 } else {
1514 a <<= shiftcount;
1515 }
ff32e16e 1516 return roundAndPackFloat64(0, exp - shiftcount, a, status);
6bb8e0f1
PM
1517}
1518
1519/*----------------------------------------------------------------------------
1520| Returns the result of converting the 64-bit unsigned integer `a'
1521| to the quadruple-precision floating-point format. The conversion is performed
1522| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523*----------------------------------------------------------------------------*/
1524
e5a41ffa 1525float128 uint64_to_float128(uint64_t a, float_status *status)
1e397ead
RH
1526{
1527 if (a == 0) {
1528 return float128_zero;
1529 }
ff32e16e 1530 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1e397ead
RH
1531}
1532
158142c2
FB
1533/*----------------------------------------------------------------------------
1534| Returns the result of converting the single-precision floating-point value
1535| `a' to the 32-bit two's complement integer format. The conversion is
1536| performed according to the IEC/IEEE Standard for Binary Floating-Point
1537| Arithmetic---which means in particular that the conversion is rounded
1538| according to the current rounding mode. If `a' is a NaN, the largest
1539| positive integer is returned. Otherwise, if the conversion overflows, the
1540| largest integer with the same sign as `a' is returned.
1541*----------------------------------------------------------------------------*/
1542
f4014512 1543int32_t float32_to_int32(float32 a, float_status *status)
158142c2
FB
1544{
1545 flag aSign;
94a49d86 1546 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1547 uint32_t aSig;
1548 uint64_t aSig64;
158142c2 1549
ff32e16e 1550 a = float32_squash_input_denormal(a, status);
158142c2
FB
1551 aSig = extractFloat32Frac( a );
1552 aExp = extractFloat32Exp( a );
1553 aSign = extractFloat32Sign( a );
1554 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1555 if ( aExp ) aSig |= 0x00800000;
1556 shiftCount = 0xAF - aExp;
1557 aSig64 = aSig;
1558 aSig64 <<= 32;
1559 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
ff32e16e 1560 return roundAndPackInt32(aSign, aSig64, status);
158142c2
FB
1561
1562}
1563
1564/*----------------------------------------------------------------------------
1565| Returns the result of converting the single-precision floating-point value
1566| `a' to the 32-bit two's complement integer format. The conversion is
1567| performed according to the IEC/IEEE Standard for Binary Floating-Point
1568| Arithmetic, except that the conversion is always rounded toward zero.
1569| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1570| the conversion overflows, the largest integer with the same sign as `a' is
1571| returned.
1572*----------------------------------------------------------------------------*/
1573
f4014512 1574int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
158142c2
FB
1575{
1576 flag aSign;
94a49d86 1577 int_fast16_t aExp, shiftCount;
bb98fe42 1578 uint32_t aSig;
b3a6a2e0 1579 int32_t z;
ff32e16e 1580 a = float32_squash_input_denormal(a, status);
158142c2
FB
1581
1582 aSig = extractFloat32Frac( a );
1583 aExp = extractFloat32Exp( a );
1584 aSign = extractFloat32Sign( a );
1585 shiftCount = aExp - 0x9E;
1586 if ( 0 <= shiftCount ) {
f090c9d4 1587 if ( float32_val(a) != 0xCF000000 ) {
ff32e16e 1588 float_raise(float_flag_invalid, status);
158142c2
FB
1589 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1590 }
bb98fe42 1591 return (int32_t) 0x80000000;
158142c2
FB
1592 }
1593 else if ( aExp <= 0x7E ) {
a2f2d288
PM
1594 if (aExp | aSig) {
1595 status->float_exception_flags |= float_flag_inexact;
1596 }
158142c2
FB
1597 return 0;
1598 }
1599 aSig = ( aSig | 0x00800000 )<<8;
1600 z = aSig>>( - shiftCount );
bb98fe42 1601 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
a2f2d288 1602 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
1603 }
1604 if ( aSign ) z = - z;
1605 return z;
1606
1607}
1608
cbcef455
PM
1609/*----------------------------------------------------------------------------
1610| Returns the result of converting the single-precision floating-point value
1611| `a' to the 16-bit two's complement integer format. The conversion is
1612| performed according to the IEC/IEEE Standard for Binary Floating-Point
1613| Arithmetic, except that the conversion is always rounded toward zero.
1614| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1615| the conversion overflows, the largest integer with the same sign as `a' is
1616| returned.
1617*----------------------------------------------------------------------------*/
1618
0bb721d7 1619int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
1620{
1621 flag aSign;
94a49d86 1622 int_fast16_t aExp, shiftCount;
bb98fe42 1623 uint32_t aSig;
f4014512 1624 int32_t z;
cbcef455
PM
1625
1626 aSig = extractFloat32Frac( a );
1627 aExp = extractFloat32Exp( a );
1628 aSign = extractFloat32Sign( a );
1629 shiftCount = aExp - 0x8E;
1630 if ( 0 <= shiftCount ) {
1631 if ( float32_val(a) != 0xC7000000 ) {
ff32e16e 1632 float_raise(float_flag_invalid, status);
cbcef455
PM
1633 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1634 return 0x7FFF;
1635 }
1636 }
bb98fe42 1637 return (int32_t) 0xffff8000;
cbcef455
PM
1638 }
1639 else if ( aExp <= 0x7E ) {
1640 if ( aExp | aSig ) {
a2f2d288 1641 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
1642 }
1643 return 0;
1644 }
1645 shiftCount -= 0x10;
1646 aSig = ( aSig | 0x00800000 )<<8;
1647 z = aSig>>( - shiftCount );
bb98fe42 1648 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
a2f2d288 1649 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
1650 }
1651 if ( aSign ) {
1652 z = - z;
1653 }
1654 return z;
1655
1656}
1657
158142c2
FB
1658/*----------------------------------------------------------------------------
1659| Returns the result of converting the single-precision floating-point value
1660| `a' to the 64-bit two's complement integer format. The conversion is
1661| performed according to the IEC/IEEE Standard for Binary Floating-Point
1662| Arithmetic---which means in particular that the conversion is rounded
1663| according to the current rounding mode. If `a' is a NaN, the largest
1664| positive integer is returned. Otherwise, if the conversion overflows, the
1665| largest integer with the same sign as `a' is returned.
1666*----------------------------------------------------------------------------*/
1667
f42c2224 1668int64_t float32_to_int64(float32 a, float_status *status)
158142c2
FB
1669{
1670 flag aSign;
94a49d86 1671 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1672 uint32_t aSig;
1673 uint64_t aSig64, aSigExtra;
ff32e16e 1674 a = float32_squash_input_denormal(a, status);
158142c2
FB
1675
1676 aSig = extractFloat32Frac( a );
1677 aExp = extractFloat32Exp( a );
1678 aSign = extractFloat32Sign( a );
1679 shiftCount = 0xBE - aExp;
1680 if ( shiftCount < 0 ) {
ff32e16e 1681 float_raise(float_flag_invalid, status);
158142c2
FB
1682 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1683 return LIT64( 0x7FFFFFFFFFFFFFFF );
1684 }
bb98fe42 1685 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1686 }
1687 if ( aExp ) aSig |= 0x00800000;
1688 aSig64 = aSig;
1689 aSig64 <<= 40;
1690 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
ff32e16e 1691 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
158142c2
FB
1692
1693}
1694
2f18bbf9
TM
1695/*----------------------------------------------------------------------------
1696| Returns the result of converting the single-precision floating-point value
1697| `a' to the 64-bit unsigned integer format. The conversion is
1698| performed according to the IEC/IEEE Standard for Binary Floating-Point
1699| Arithmetic---which means in particular that the conversion is rounded
1700| according to the current rounding mode. If `a' is a NaN, the largest
1701| unsigned integer is returned. Otherwise, if the conversion overflows, the
1702| largest unsigned integer is returned. If the 'a' is negative, the result
1703| is rounded and zero is returned; values that do not round to zero will
1704| raise the inexact exception flag.
1705*----------------------------------------------------------------------------*/
1706
182f42fd 1707uint64_t float32_to_uint64(float32 a, float_status *status)
2f18bbf9
TM
1708{
1709 flag aSign;
1710 int_fast16_t aExp, shiftCount;
1711 uint32_t aSig;
1712 uint64_t aSig64, aSigExtra;
ff32e16e 1713 a = float32_squash_input_denormal(a, status);
2f18bbf9
TM
1714
1715 aSig = extractFloat32Frac(a);
1716 aExp = extractFloat32Exp(a);
1717 aSign = extractFloat32Sign(a);
1718 if ((aSign) && (aExp > 126)) {
ff32e16e 1719 float_raise(float_flag_invalid, status);
2f18bbf9
TM
1720 if (float32_is_any_nan(a)) {
1721 return LIT64(0xFFFFFFFFFFFFFFFF);
1722 } else {
1723 return 0;
1724 }
1725 }
1726 shiftCount = 0xBE - aExp;
1727 if (aExp) {
1728 aSig |= 0x00800000;
1729 }
1730 if (shiftCount < 0) {
ff32e16e 1731 float_raise(float_flag_invalid, status);
2f18bbf9
TM
1732 return LIT64(0xFFFFFFFFFFFFFFFF);
1733 }
1734
1735 aSig64 = aSig;
1736 aSig64 <<= 40;
1737 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
ff32e16e 1738 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
2f18bbf9
TM
1739}
1740
a13d4489
TM
1741/*----------------------------------------------------------------------------
1742| Returns the result of converting the single-precision floating-point value
1743| `a' to the 64-bit unsigned integer format. The conversion is
1744| performed according to the IEC/IEEE Standard for Binary Floating-Point
1745| Arithmetic, except that the conversion is always rounded toward zero. If
1746| `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1747| conversion overflows, the largest unsigned integer is returned. If the
1748| 'a' is negative, the result is rounded and zero is returned; values that do
1749| not round to zero will raise the inexact flag.
1750*----------------------------------------------------------------------------*/
1751
182f42fd 1752uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
a13d4489 1753{
a2f2d288 1754 signed char current_rounding_mode = status->float_rounding_mode;
ff32e16e
PM
1755 set_float_rounding_mode(float_round_to_zero, status);
1756 int64_t v = float32_to_uint64(a, status);
1757 set_float_rounding_mode(current_rounding_mode, status);
a13d4489
TM
1758 return v;
1759}
1760
158142c2
FB
1761/*----------------------------------------------------------------------------
1762| Returns the result of converting the single-precision floating-point value
1763| `a' to the 64-bit two's complement integer format. The conversion is
1764| performed according to the IEC/IEEE Standard for Binary Floating-Point
1765| Arithmetic, except that the conversion is always rounded toward zero. If
1766| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1767| conversion overflows, the largest integer with the same sign as `a' is
1768| returned.
1769*----------------------------------------------------------------------------*/
1770
f42c2224 1771int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
158142c2
FB
1772{
1773 flag aSign;
94a49d86 1774 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1775 uint32_t aSig;
1776 uint64_t aSig64;
f42c2224 1777 int64_t z;
ff32e16e 1778 a = float32_squash_input_denormal(a, status);
158142c2
FB
1779
1780 aSig = extractFloat32Frac( a );
1781 aExp = extractFloat32Exp( a );
1782 aSign = extractFloat32Sign( a );
1783 shiftCount = aExp - 0xBE;
1784 if ( 0 <= shiftCount ) {
f090c9d4 1785 if ( float32_val(a) != 0xDF000000 ) {
ff32e16e 1786 float_raise(float_flag_invalid, status);
158142c2
FB
1787 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1788 return LIT64( 0x7FFFFFFFFFFFFFFF );
1789 }
1790 }
bb98fe42 1791 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1792 }
1793 else if ( aExp <= 0x7E ) {
a2f2d288
PM
1794 if (aExp | aSig) {
1795 status->float_exception_flags |= float_flag_inexact;
1796 }
158142c2
FB
1797 return 0;
1798 }
1799 aSig64 = aSig | 0x00800000;
1800 aSig64 <<= 40;
1801 z = aSig64>>( - shiftCount );
bb98fe42 1802 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
a2f2d288 1803 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
1804 }
1805 if ( aSign ) z = - z;
1806 return z;
1807
1808}
1809
1810/*----------------------------------------------------------------------------
1811| Returns the result of converting the single-precision floating-point value
1812| `a' to the double-precision floating-point format. The conversion is
1813| performed according to the IEC/IEEE Standard for Binary Floating-Point
1814| Arithmetic.
1815*----------------------------------------------------------------------------*/
1816
e5a41ffa 1817float64 float32_to_float64(float32 a, float_status *status)
158142c2
FB
1818{
1819 flag aSign;
94a49d86 1820 int_fast16_t aExp;
bb98fe42 1821 uint32_t aSig;
ff32e16e 1822 a = float32_squash_input_denormal(a, status);
158142c2
FB
1823
1824 aSig = extractFloat32Frac( a );
1825 aExp = extractFloat32Exp( a );
1826 aSign = extractFloat32Sign( a );
1827 if ( aExp == 0xFF ) {
ff32e16e
PM
1828 if (aSig) {
1829 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1830 }
158142c2
FB
1831 return packFloat64( aSign, 0x7FF, 0 );
1832 }
1833 if ( aExp == 0 ) {
1834 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1835 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1836 --aExp;
1837 }
bb98fe42 1838 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1839
1840}
1841
158142c2
FB
1842/*----------------------------------------------------------------------------
1843| Returns the result of converting the single-precision floating-point value
1844| `a' to the extended double-precision floating-point format. The conversion
1845| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1846| Arithmetic.
1847*----------------------------------------------------------------------------*/
1848
e5a41ffa 1849floatx80 float32_to_floatx80(float32 a, float_status *status)
158142c2
FB
1850{
1851 flag aSign;
94a49d86 1852 int_fast16_t aExp;
bb98fe42 1853 uint32_t aSig;
158142c2 1854
ff32e16e 1855 a = float32_squash_input_denormal(a, status);
158142c2
FB
1856 aSig = extractFloat32Frac( a );
1857 aExp = extractFloat32Exp( a );
1858 aSign = extractFloat32Sign( a );
1859 if ( aExp == 0xFF ) {
ff32e16e
PM
1860 if (aSig) {
1861 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1862 }
158142c2
FB
1863 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1864 }
1865 if ( aExp == 0 ) {
1866 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1867 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1868 }
1869 aSig |= 0x00800000;
bb98fe42 1870 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1871
1872}
1873
158142c2
FB
1874/*----------------------------------------------------------------------------
1875| Returns the result of converting the single-precision floating-point value
1876| `a' to the double-precision floating-point format. The conversion is
1877| performed according to the IEC/IEEE Standard for Binary Floating-Point
1878| Arithmetic.
1879*----------------------------------------------------------------------------*/
1880
e5a41ffa 1881float128 float32_to_float128(float32 a, float_status *status)
158142c2
FB
1882{
1883 flag aSign;
94a49d86 1884 int_fast16_t aExp;
bb98fe42 1885 uint32_t aSig;
158142c2 1886
ff32e16e 1887 a = float32_squash_input_denormal(a, status);
158142c2
FB
1888 aSig = extractFloat32Frac( a );
1889 aExp = extractFloat32Exp( a );
1890 aSign = extractFloat32Sign( a );
1891 if ( aExp == 0xFF ) {
ff32e16e
PM
1892 if (aSig) {
1893 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1894 }
158142c2
FB
1895 return packFloat128( aSign, 0x7FFF, 0, 0 );
1896 }
1897 if ( aExp == 0 ) {
1898 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1899 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1900 --aExp;
1901 }
bb98fe42 1902 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1903
1904}
1905
158142c2
FB
1906/*----------------------------------------------------------------------------
1907| Rounds the single-precision floating-point value `a' to an integer, and
1908| returns the result as a single-precision floating-point value. The
1909| operation is performed according to the IEC/IEEE Standard for Binary
1910| Floating-Point Arithmetic.
1911*----------------------------------------------------------------------------*/
1912
e5a41ffa 1913float32 float32_round_to_int(float32 a, float_status *status)
158142c2
FB
1914{
1915 flag aSign;
94a49d86 1916 int_fast16_t aExp;
bb98fe42 1917 uint32_t lastBitMask, roundBitsMask;
bb98fe42 1918 uint32_t z;
ff32e16e 1919 a = float32_squash_input_denormal(a, status);
158142c2
FB
1920
1921 aExp = extractFloat32Exp( a );
1922 if ( 0x96 <= aExp ) {
1923 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
ff32e16e 1924 return propagateFloat32NaN(a, a, status);
158142c2
FB
1925 }
1926 return a;
1927 }
1928 if ( aExp <= 0x7E ) {
bb98fe42 1929 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
a2f2d288 1930 status->float_exception_flags |= float_flag_inexact;
158142c2 1931 aSign = extractFloat32Sign( a );
a2f2d288 1932 switch (status->float_rounding_mode) {
158142c2
FB
1933 case float_round_nearest_even:
1934 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1935 return packFloat32( aSign, 0x7F, 0 );
1936 }
1937 break;
f9288a76
PM
1938 case float_round_ties_away:
1939 if (aExp == 0x7E) {
1940 return packFloat32(aSign, 0x7F, 0);
1941 }
1942 break;
158142c2 1943 case float_round_down:
f090c9d4 1944 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1945 case float_round_up:
f090c9d4 1946 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1947 }
1948 return packFloat32( aSign, 0, 0 );
1949 }
1950 lastBitMask = 1;
1951 lastBitMask <<= 0x96 - aExp;
1952 roundBitsMask = lastBitMask - 1;
f090c9d4 1953 z = float32_val(a);
a2f2d288 1954 switch (status->float_rounding_mode) {
dc355b76 1955 case float_round_nearest_even:
158142c2 1956 z += lastBitMask>>1;
dc355b76
PM
1957 if ((z & roundBitsMask) == 0) {
1958 z &= ~lastBitMask;
1959 }
1960 break;
f9288a76
PM
1961 case float_round_ties_away:
1962 z += lastBitMask >> 1;
1963 break;
dc355b76
PM
1964 case float_round_to_zero:
1965 break;
1966 case float_round_up:
1967 if (!extractFloat32Sign(make_float32(z))) {
1968 z += roundBitsMask;
1969 }
1970 break;
1971 case float_round_down:
1972 if (extractFloat32Sign(make_float32(z))) {
158142c2
FB
1973 z += roundBitsMask;
1974 }
dc355b76
PM
1975 break;
1976 default:
1977 abort();
158142c2
FB
1978 }
1979 z &= ~ roundBitsMask;
a2f2d288
PM
1980 if (z != float32_val(a)) {
1981 status->float_exception_flags |= float_flag_inexact;
1982 }
f090c9d4 1983 return make_float32(z);
158142c2
FB
1984
1985}
1986
1987/*----------------------------------------------------------------------------
1988| Returns the result of adding the absolute values of the single-precision
1989| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1990| before being returned. `zSign' is ignored if the result is a NaN.
1991| The addition is performed according to the IEC/IEEE Standard for Binary
1992| Floating-Point Arithmetic.
1993*----------------------------------------------------------------------------*/
1994
e5a41ffa
PM
1995static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
1996 float_status *status)
158142c2 1997{
94a49d86 1998 int_fast16_t aExp, bExp, zExp;
bb98fe42 1999 uint32_t aSig, bSig, zSig;
94a49d86 2000 int_fast16_t expDiff;
158142c2
FB
2001
2002 aSig = extractFloat32Frac( a );
2003 aExp = extractFloat32Exp( a );
2004 bSig = extractFloat32Frac( b );
2005 bExp = extractFloat32Exp( b );
2006 expDiff = aExp - bExp;
2007 aSig <<= 6;
2008 bSig <<= 6;
2009 if ( 0 < expDiff ) {
2010 if ( aExp == 0xFF ) {
ff32e16e
PM
2011 if (aSig) {
2012 return propagateFloat32NaN(a, b, status);
2013 }
158142c2
FB
2014 return a;
2015 }
2016 if ( bExp == 0 ) {
2017 --expDiff;
2018 }
2019 else {
2020 bSig |= 0x20000000;
2021 }
2022 shift32RightJamming( bSig, expDiff, &bSig );
2023 zExp = aExp;
2024 }
2025 else if ( expDiff < 0 ) {
2026 if ( bExp == 0xFF ) {
ff32e16e
PM
2027 if (bSig) {
2028 return propagateFloat32NaN(a, b, status);
2029 }
158142c2
FB
2030 return packFloat32( zSign, 0xFF, 0 );
2031 }
2032 if ( aExp == 0 ) {
2033 ++expDiff;
2034 }
2035 else {
2036 aSig |= 0x20000000;
2037 }
2038 shift32RightJamming( aSig, - expDiff, &aSig );
2039 zExp = bExp;
2040 }
2041 else {
2042 if ( aExp == 0xFF ) {
ff32e16e
PM
2043 if (aSig | bSig) {
2044 return propagateFloat32NaN(a, b, status);
2045 }
158142c2
FB
2046 return a;
2047 }
fe76d976 2048 if ( aExp == 0 ) {
a2f2d288 2049 if (status->flush_to_zero) {
e6afc87f 2050 if (aSig | bSig) {
ff32e16e 2051 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2052 }
2053 return packFloat32(zSign, 0, 0);
2054 }
fe76d976
PB
2055 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2056 }
158142c2
FB
2057 zSig = 0x40000000 + aSig + bSig;
2058 zExp = aExp;
2059 goto roundAndPack;
2060 }
2061 aSig |= 0x20000000;
2062 zSig = ( aSig + bSig )<<1;
2063 --zExp;
bb98fe42 2064 if ( (int32_t) zSig < 0 ) {
158142c2
FB
2065 zSig = aSig + bSig;
2066 ++zExp;
2067 }
2068 roundAndPack:
ff32e16e 2069 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2070
2071}
2072
2073/*----------------------------------------------------------------------------
2074| Returns the result of subtracting the absolute values of the single-
2075| precision floating-point values `a' and `b'. If `zSign' is 1, the
2076| difference is negated before being returned. `zSign' is ignored if the
2077| result is a NaN. The subtraction is performed according to the IEC/IEEE
2078| Standard for Binary Floating-Point Arithmetic.
2079*----------------------------------------------------------------------------*/
2080
e5a41ffa
PM
2081static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2082 float_status *status)
158142c2 2083{
94a49d86 2084 int_fast16_t aExp, bExp, zExp;
bb98fe42 2085 uint32_t aSig, bSig, zSig;
94a49d86 2086 int_fast16_t expDiff;
158142c2
FB
2087
2088 aSig = extractFloat32Frac( a );
2089 aExp = extractFloat32Exp( a );
2090 bSig = extractFloat32Frac( b );
2091 bExp = extractFloat32Exp( b );
2092 expDiff = aExp - bExp;
2093 aSig <<= 7;
2094 bSig <<= 7;
2095 if ( 0 < expDiff ) goto aExpBigger;
2096 if ( expDiff < 0 ) goto bExpBigger;
2097 if ( aExp == 0xFF ) {
ff32e16e
PM
2098 if (aSig | bSig) {
2099 return propagateFloat32NaN(a, b, status);
2100 }
2101 float_raise(float_flag_invalid, status);
158142c2
FB
2102 return float32_default_nan;
2103 }
2104 if ( aExp == 0 ) {
2105 aExp = 1;
2106 bExp = 1;
2107 }
2108 if ( bSig < aSig ) goto aBigger;
2109 if ( aSig < bSig ) goto bBigger;
a2f2d288 2110 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
2111 bExpBigger:
2112 if ( bExp == 0xFF ) {
ff32e16e
PM
2113 if (bSig) {
2114 return propagateFloat32NaN(a, b, status);
2115 }
158142c2
FB
2116 return packFloat32( zSign ^ 1, 0xFF, 0 );
2117 }
2118 if ( aExp == 0 ) {
2119 ++expDiff;
2120 }
2121 else {
2122 aSig |= 0x40000000;
2123 }
2124 shift32RightJamming( aSig, - expDiff, &aSig );
2125 bSig |= 0x40000000;
2126 bBigger:
2127 zSig = bSig - aSig;
2128 zExp = bExp;
2129 zSign ^= 1;
2130 goto normalizeRoundAndPack;
2131 aExpBigger:
2132 if ( aExp == 0xFF ) {
ff32e16e
PM
2133 if (aSig) {
2134 return propagateFloat32NaN(a, b, status);
2135 }
158142c2
FB
2136 return a;
2137 }
2138 if ( bExp == 0 ) {
2139 --expDiff;
2140 }
2141 else {
2142 bSig |= 0x40000000;
2143 }
2144 shift32RightJamming( bSig, expDiff, &bSig );
2145 aSig |= 0x40000000;
2146 aBigger:
2147 zSig = aSig - bSig;
2148 zExp = aExp;
2149 normalizeRoundAndPack:
2150 --zExp;
ff32e16e 2151 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2152
2153}
2154
2155/*----------------------------------------------------------------------------
2156| Returns the result of adding the single-precision floating-point values `a'
2157| and `b'. The operation is performed according to the IEC/IEEE Standard for
2158| Binary Floating-Point Arithmetic.
2159*----------------------------------------------------------------------------*/
2160
e5a41ffa 2161float32 float32_add(float32 a, float32 b, float_status *status)
158142c2
FB
2162{
2163 flag aSign, bSign;
ff32e16e
PM
2164 a = float32_squash_input_denormal(a, status);
2165 b = float32_squash_input_denormal(b, status);
158142c2
FB
2166
2167 aSign = extractFloat32Sign( a );
2168 bSign = extractFloat32Sign( b );
2169 if ( aSign == bSign ) {
ff32e16e 2170 return addFloat32Sigs(a, b, aSign, status);
158142c2
FB
2171 }
2172 else {
ff32e16e 2173 return subFloat32Sigs(a, b, aSign, status);
158142c2
FB
2174 }
2175
2176}
2177
2178/*----------------------------------------------------------------------------
2179| Returns the result of subtracting the single-precision floating-point values
2180| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2181| for Binary Floating-Point Arithmetic.
2182*----------------------------------------------------------------------------*/
2183
e5a41ffa 2184float32 float32_sub(float32 a, float32 b, float_status *status)
158142c2
FB
2185{
2186 flag aSign, bSign;
ff32e16e
PM
2187 a = float32_squash_input_denormal(a, status);
2188 b = float32_squash_input_denormal(b, status);
158142c2
FB
2189
2190 aSign = extractFloat32Sign( a );
2191 bSign = extractFloat32Sign( b );
2192 if ( aSign == bSign ) {
ff32e16e 2193 return subFloat32Sigs(a, b, aSign, status);
158142c2
FB
2194 }
2195 else {
ff32e16e 2196 return addFloat32Sigs(a, b, aSign, status);
158142c2
FB
2197 }
2198
2199}
2200
2201/*----------------------------------------------------------------------------
2202| Returns the result of multiplying the single-precision floating-point values
2203| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2204| for Binary Floating-Point Arithmetic.
2205*----------------------------------------------------------------------------*/
2206
e5a41ffa 2207float32 float32_mul(float32 a, float32 b, float_status *status)
158142c2
FB
2208{
2209 flag aSign, bSign, zSign;
94a49d86 2210 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
2211 uint32_t aSig, bSig;
2212 uint64_t zSig64;
2213 uint32_t zSig;
158142c2 2214
ff32e16e
PM
2215 a = float32_squash_input_denormal(a, status);
2216 b = float32_squash_input_denormal(b, status);
37d18660 2217
158142c2
FB
2218 aSig = extractFloat32Frac( a );
2219 aExp = extractFloat32Exp( a );
2220 aSign = extractFloat32Sign( a );
2221 bSig = extractFloat32Frac( b );
2222 bExp = extractFloat32Exp( b );
2223 bSign = extractFloat32Sign( b );
2224 zSign = aSign ^ bSign;
2225 if ( aExp == 0xFF ) {
2226 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 2227 return propagateFloat32NaN(a, b, status);
158142c2
FB
2228 }
2229 if ( ( bExp | bSig ) == 0 ) {
ff32e16e 2230 float_raise(float_flag_invalid, status);
158142c2
FB
2231 return float32_default_nan;
2232 }
2233 return packFloat32( zSign, 0xFF, 0 );
2234 }
2235 if ( bExp == 0xFF ) {
ff32e16e
PM
2236 if (bSig) {
2237 return propagateFloat32NaN(a, b, status);
2238 }
158142c2 2239 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 2240 float_raise(float_flag_invalid, status);
158142c2
FB
2241 return float32_default_nan;
2242 }
2243 return packFloat32( zSign, 0xFF, 0 );
2244 }
2245 if ( aExp == 0 ) {
2246 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2247 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2248 }
2249 if ( bExp == 0 ) {
2250 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2251 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2252 }
2253 zExp = aExp + bExp - 0x7F;
2254 aSig = ( aSig | 0x00800000 )<<7;
2255 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 2256 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 2257 zSig = zSig64;
bb98fe42 2258 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
2259 zSig <<= 1;
2260 --zExp;
2261 }
ff32e16e 2262 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2263
2264}
2265
2266/*----------------------------------------------------------------------------
2267| Returns the result of dividing the single-precision floating-point value `a'
2268| by the corresponding value `b'. The operation is performed according to the
2269| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270*----------------------------------------------------------------------------*/
2271
e5a41ffa 2272float32 float32_div(float32 a, float32 b, float_status *status)
158142c2
FB
2273{
2274 flag aSign, bSign, zSign;
94a49d86 2275 int_fast16_t aExp, bExp, zExp;
bb98fe42 2276 uint32_t aSig, bSig, zSig;
ff32e16e
PM
2277 a = float32_squash_input_denormal(a, status);
2278 b = float32_squash_input_denormal(b, status);
158142c2
FB
2279
2280 aSig = extractFloat32Frac( a );
2281 aExp = extractFloat32Exp( a );
2282 aSign = extractFloat32Sign( a );
2283 bSig = extractFloat32Frac( b );
2284 bExp = extractFloat32Exp( b );
2285 bSign = extractFloat32Sign( b );
2286 zSign = aSign ^ bSign;
2287 if ( aExp == 0xFF ) {
ff32e16e
PM
2288 if (aSig) {
2289 return propagateFloat32NaN(a, b, status);
2290 }
158142c2 2291 if ( bExp == 0xFF ) {
ff32e16e
PM
2292 if (bSig) {
2293 return propagateFloat32NaN(a, b, status);
2294 }
2295 float_raise(float_flag_invalid, status);
158142c2
FB
2296 return float32_default_nan;
2297 }
2298 return packFloat32( zSign, 0xFF, 0 );
2299 }
2300 if ( bExp == 0xFF ) {
ff32e16e
PM
2301 if (bSig) {
2302 return propagateFloat32NaN(a, b, status);
2303 }
158142c2
FB
2304 return packFloat32( zSign, 0, 0 );
2305 }
2306 if ( bExp == 0 ) {
2307 if ( bSig == 0 ) {
2308 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 2309 float_raise(float_flag_invalid, status);
158142c2
FB
2310 return float32_default_nan;
2311 }
ff32e16e 2312 float_raise(float_flag_divbyzero, status);
158142c2
FB
2313 return packFloat32( zSign, 0xFF, 0 );
2314 }
2315 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2316 }
2317 if ( aExp == 0 ) {
2318 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2319 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2320 }
2321 zExp = aExp - bExp + 0x7D;
2322 aSig = ( aSig | 0x00800000 )<<7;
2323 bSig = ( bSig | 0x00800000 )<<8;
2324 if ( bSig <= ( aSig + aSig ) ) {
2325 aSig >>= 1;
2326 ++zExp;
2327 }
bb98fe42 2328 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2329 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2330 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2 2331 }
ff32e16e 2332 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2333
2334}
2335
2336/*----------------------------------------------------------------------------
2337| Returns the remainder of the single-precision floating-point value `a'
2338| with respect to the corresponding value `b'. The operation is performed
2339| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2340*----------------------------------------------------------------------------*/
2341
e5a41ffa 2342float32 float32_rem(float32 a, float32 b, float_status *status)
158142c2 2343{
ed086f3d 2344 flag aSign, zSign;
94a49d86 2345 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
2346 uint32_t aSig, bSig;
2347 uint32_t q;
2348 uint64_t aSig64, bSig64, q64;
2349 uint32_t alternateASig;
2350 int32_t sigMean;
ff32e16e
PM
2351 a = float32_squash_input_denormal(a, status);
2352 b = float32_squash_input_denormal(b, status);
158142c2
FB
2353
2354 aSig = extractFloat32Frac( a );
2355 aExp = extractFloat32Exp( a );
2356 aSign = extractFloat32Sign( a );
2357 bSig = extractFloat32Frac( b );
2358 bExp = extractFloat32Exp( b );
158142c2
FB
2359 if ( aExp == 0xFF ) {
2360 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 2361 return propagateFloat32NaN(a, b, status);
158142c2 2362 }
ff32e16e 2363 float_raise(float_flag_invalid, status);
158142c2
FB
2364 return float32_default_nan;
2365 }
2366 if ( bExp == 0xFF ) {
ff32e16e
PM
2367 if (bSig) {
2368 return propagateFloat32NaN(a, b, status);
2369 }
158142c2
FB
2370 return a;
2371 }
2372 if ( bExp == 0 ) {
2373 if ( bSig == 0 ) {
ff32e16e 2374 float_raise(float_flag_invalid, status);
158142c2
FB
2375 return float32_default_nan;
2376 }
2377 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2378 }
2379 if ( aExp == 0 ) {
2380 if ( aSig == 0 ) return a;
2381 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2382 }
2383 expDiff = aExp - bExp;
2384 aSig |= 0x00800000;
2385 bSig |= 0x00800000;
2386 if ( expDiff < 32 ) {
2387 aSig <<= 8;
2388 bSig <<= 8;
2389 if ( expDiff < 0 ) {
2390 if ( expDiff < -1 ) return a;
2391 aSig >>= 1;
2392 }
2393 q = ( bSig <= aSig );
2394 if ( q ) aSig -= bSig;
2395 if ( 0 < expDiff ) {
bb98fe42 2396 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2397 q >>= 32 - expDiff;
2398 bSig >>= 2;
2399 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2400 }
2401 else {
2402 aSig >>= 2;
2403 bSig >>= 2;
2404 }
2405 }
2406 else {
2407 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2408 aSig64 = ( (uint64_t) aSig )<<40;
2409 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2410 expDiff -= 64;
2411 while ( 0 < expDiff ) {
2412 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2413 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2414 aSig64 = - ( ( bSig * q64 )<<38 );
2415 expDiff -= 62;
2416 }
2417 expDiff += 64;
2418 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2419 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2420 q = q64>>( 64 - expDiff );
2421 bSig <<= 6;
2422 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2423 }
2424 do {
2425 alternateASig = aSig;
2426 ++q;
2427 aSig -= bSig;
bb98fe42 2428 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2429 sigMean = aSig + alternateASig;
2430 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2431 aSig = alternateASig;
2432 }
bb98fe42 2433 zSign = ( (int32_t) aSig < 0 );
158142c2 2434 if ( zSign ) aSig = - aSig;
ff32e16e 2435 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
2436}
2437
369be8f6
PM
2438/*----------------------------------------------------------------------------
2439| Returns the result of multiplying the single-precision floating-point values
2440| `a' and `b' then adding 'c', with no intermediate rounding step after the
2441| multiplication. The operation is performed according to the IEC/IEEE
2442| Standard for Binary Floating-Point Arithmetic 754-2008.
2443| The flags argument allows the caller to select negation of the
2444| addend, the intermediate product, or the final result. (The difference
2445| between this and having the caller do a separate negation is that negating
2446| externally will flip the sign bit on NaNs.)
2447*----------------------------------------------------------------------------*/
2448
e5a41ffa
PM
2449float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2450 float_status *status)
369be8f6
PM
2451{
2452 flag aSign, bSign, cSign, zSign;
94a49d86 2453 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2454 uint32_t aSig, bSig, cSig;
2455 flag pInf, pZero, pSign;
2456 uint64_t pSig64, cSig64, zSig64;
2457 uint32_t pSig;
2458 int shiftcount;
2459 flag signflip, infzero;
2460
ff32e16e
PM
2461 a = float32_squash_input_denormal(a, status);
2462 b = float32_squash_input_denormal(b, status);
2463 c = float32_squash_input_denormal(c, status);
369be8f6
PM
2464 aSig = extractFloat32Frac(a);
2465 aExp = extractFloat32Exp(a);
2466 aSign = extractFloat32Sign(a);
2467 bSig = extractFloat32Frac(b);
2468 bExp = extractFloat32Exp(b);
2469 bSign = extractFloat32Sign(b);
2470 cSig = extractFloat32Frac(c);
2471 cExp = extractFloat32Exp(c);
2472 cSign = extractFloat32Sign(c);
2473
2474 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2475 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2476
2477 /* It is implementation-defined whether the cases of (0,inf,qnan)
2478 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2479 * they return if they do), so we have to hand this information
2480 * off to the target-specific pick-a-NaN routine.
2481 */
2482 if (((aExp == 0xff) && aSig) ||
2483 ((bExp == 0xff) && bSig) ||
2484 ((cExp == 0xff) && cSig)) {
ff32e16e 2485 return propagateFloat32MulAddNaN(a, b, c, infzero, status);
369be8f6
PM
2486 }
2487
2488 if (infzero) {
ff32e16e 2489 float_raise(float_flag_invalid, status);
369be8f6
PM
2490 return float32_default_nan;
2491 }
2492
2493 if (flags & float_muladd_negate_c) {
2494 cSign ^= 1;
2495 }
2496
2497 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2498
2499 /* Work out the sign and type of the product */
2500 pSign = aSign ^ bSign;
2501 if (flags & float_muladd_negate_product) {
2502 pSign ^= 1;
2503 }
2504 pInf = (aExp == 0xff) || (bExp == 0xff);
2505 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2506
2507 if (cExp == 0xff) {
2508 if (pInf && (pSign ^ cSign)) {
2509 /* addition of opposite-signed infinities => InvalidOperation */
ff32e16e 2510 float_raise(float_flag_invalid, status);
369be8f6
PM
2511 return float32_default_nan;
2512 }
2513 /* Otherwise generate an infinity of the same sign */
2514 return packFloat32(cSign ^ signflip, 0xff, 0);
2515 }
2516
2517 if (pInf) {
2518 return packFloat32(pSign ^ signflip, 0xff, 0);
2519 }
2520
2521 if (pZero) {
2522 if (cExp == 0) {
2523 if (cSig == 0) {
2524 /* Adding two exact zeroes */
2525 if (pSign == cSign) {
2526 zSign = pSign;
a2f2d288 2527 } else if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
2528 zSign = 1;
2529 } else {
2530 zSign = 0;
2531 }
2532 return packFloat32(zSign ^ signflip, 0, 0);
2533 }
2534 /* Exact zero plus a denorm */
a2f2d288 2535 if (status->flush_to_zero) {
ff32e16e 2536 float_raise(float_flag_output_denormal, status);
369be8f6
PM
2537 return packFloat32(cSign ^ signflip, 0, 0);
2538 }
2539 }
2540 /* Zero plus something non-zero : just return the something */
67d43538
PM
2541 if (flags & float_muladd_halve_result) {
2542 if (cExp == 0) {
2543 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2544 }
2545 /* Subtract one to halve, and one again because roundAndPackFloat32
2546 * wants one less than the true exponent.
2547 */
2548 cExp -= 2;
2549 cSig = (cSig | 0x00800000) << 7;
ff32e16e 2550 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
67d43538 2551 }
a6e7c184 2552 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2553 }
2554
2555 if (aExp == 0) {
2556 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2557 }
2558 if (bExp == 0) {
2559 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2560 }
2561
2562 /* Calculate the actual result a * b + c */
2563
2564 /* Multiply first; this is easy. */
2565 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2566 * because we want the true exponent, not the "one-less-than"
2567 * flavour that roundAndPackFloat32() takes.
2568 */
2569 pExp = aExp + bExp - 0x7e;
2570 aSig = (aSig | 0x00800000) << 7;
2571 bSig = (bSig | 0x00800000) << 8;
2572 pSig64 = (uint64_t)aSig * bSig;
2573 if ((int64_t)(pSig64 << 1) >= 0) {
2574 pSig64 <<= 1;
2575 pExp--;
2576 }
2577
2578 zSign = pSign ^ signflip;
2579
2580 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2581 * position 62.
2582 */
2583 if (cExp == 0) {
2584 if (!cSig) {
2585 /* Throw out the special case of c being an exact zero now */
2586 shift64RightJamming(pSig64, 32, &pSig64);
2587 pSig = pSig64;
67d43538
PM
2588 if (flags & float_muladd_halve_result) {
2589 pExp--;
2590 }
369be8f6 2591 return roundAndPackFloat32(zSign, pExp - 1,
ff32e16e 2592 pSig, status);
369be8f6
PM
2593 }
2594 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2595 }
2596
2597 cSig64 = (uint64_t)cSig << (62 - 23);
2598 cSig64 |= LIT64(0x4000000000000000);
2599 expDiff = pExp - cExp;
2600
2601 if (pSign == cSign) {
2602 /* Addition */
2603 if (expDiff > 0) {
2604 /* scale c to match p */
2605 shift64RightJamming(cSig64, expDiff, &cSig64);
2606 zExp = pExp;
2607 } else if (expDiff < 0) {
2608 /* scale p to match c */
2609 shift64RightJamming(pSig64, -expDiff, &pSig64);
2610 zExp = cExp;
2611 } else {
2612 /* no scaling needed */
2613 zExp = cExp;
2614 }
2615 /* Add significands and make sure explicit bit ends up in posn 62 */
2616 zSig64 = pSig64 + cSig64;
2617 if ((int64_t)zSig64 < 0) {
2618 shift64RightJamming(zSig64, 1, &zSig64);
2619 } else {
2620 zExp--;
2621 }
2622 } else {
2623 /* Subtraction */
2624 if (expDiff > 0) {
2625 shift64RightJamming(cSig64, expDiff, &cSig64);
2626 zSig64 = pSig64 - cSig64;
2627 zExp = pExp;
2628 } else if (expDiff < 0) {
2629 shift64RightJamming(pSig64, -expDiff, &pSig64);
2630 zSig64 = cSig64 - pSig64;
2631 zExp = cExp;
2632 zSign ^= 1;
2633 } else {
2634 zExp = pExp;
2635 if (cSig64 < pSig64) {
2636 zSig64 = pSig64 - cSig64;
2637 } else if (pSig64 < cSig64) {
2638 zSig64 = cSig64 - pSig64;
2639 zSign ^= 1;
2640 } else {
2641 /* Exact zero */
2642 zSign = signflip;
a2f2d288 2643 if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
2644 zSign ^= 1;
2645 }
2646 return packFloat32(zSign, 0, 0);
2647 }
2648 }
2649 --zExp;
2650 /* Normalize to put the explicit bit back into bit 62. */
2651 shiftcount = countLeadingZeros64(zSig64) - 1;
2652 zSig64 <<= shiftcount;
2653 zExp -= shiftcount;
2654 }
67d43538
PM
2655 if (flags & float_muladd_halve_result) {
2656 zExp--;
2657 }
2658
369be8f6 2659 shift64RightJamming(zSig64, 32, &zSig64);
ff32e16e 2660 return roundAndPackFloat32(zSign, zExp, zSig64, status);
369be8f6
PM
2661}
2662
2663
158142c2
FB
2664/*----------------------------------------------------------------------------
2665| Returns the square root of the single-precision floating-point value `a'.
2666| The operation is performed according to the IEC/IEEE Standard for Binary
2667| Floating-Point Arithmetic.
2668*----------------------------------------------------------------------------*/
2669
e5a41ffa 2670float32 float32_sqrt(float32 a, float_status *status)
158142c2
FB
2671{
2672 flag aSign;
94a49d86 2673 int_fast16_t aExp, zExp;
bb98fe42
AF
2674 uint32_t aSig, zSig;
2675 uint64_t rem, term;
ff32e16e 2676 a = float32_squash_input_denormal(a, status);
158142c2
FB
2677
2678 aSig = extractFloat32Frac( a );
2679 aExp = extractFloat32Exp( a );
2680 aSign = extractFloat32Sign( a );
2681 if ( aExp == 0xFF ) {
ff32e16e
PM
2682 if (aSig) {
2683 return propagateFloat32NaN(a, float32_zero, status);
2684 }
158142c2 2685 if ( ! aSign ) return a;
ff32e16e 2686 float_raise(float_flag_invalid, status);
158142c2
FB
2687 return float32_default_nan;
2688 }
2689 if ( aSign ) {
2690 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 2691 float_raise(float_flag_invalid, status);
158142c2
FB
2692 return float32_default_nan;
2693 }
2694 if ( aExp == 0 ) {
f090c9d4 2695 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2696 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2697 }
2698 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2699 aSig = ( aSig | 0x00800000 )<<8;
2700 zSig = estimateSqrt32( aExp, aSig ) + 2;
2701 if ( ( zSig & 0x7F ) <= 5 ) {
2702 if ( zSig < 2 ) {
2703 zSig = 0x7FFFFFFF;
2704 goto roundAndPack;
2705 }
2706 aSig >>= aExp & 1;
bb98fe42
AF
2707 term = ( (uint64_t) zSig ) * zSig;
2708 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2709 while ( (int64_t) rem < 0 ) {
158142c2 2710 --zSig;
bb98fe42 2711 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2712 }
2713 zSig |= ( rem != 0 );
2714 }
2715 shift32RightJamming( zSig, 1, &zSig );
2716 roundAndPack:
ff32e16e 2717 return roundAndPackFloat32(0, zExp, zSig, status);
158142c2
FB
2718
2719}
2720
8229c991
AJ
2721/*----------------------------------------------------------------------------
2722| Returns the binary exponential of the single-precision floating-point value
2723| `a'. The operation is performed according to the IEC/IEEE Standard for
2724| Binary Floating-Point Arithmetic.
2725|
2726| Uses the following identities:
2727|
2728| 1. -------------------------------------------------------------------------
2729| x x*ln(2)
2730| 2 = e
2731|
2732| 2. -------------------------------------------------------------------------
2733| 2 3 4 5 n
2734| x x x x x x x
2735| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2736| 1! 2! 3! 4! 5! n!
2737*----------------------------------------------------------------------------*/
2738
2739static const float64 float32_exp2_coefficients[15] =
2740{
d5138cf4
PM
2741 const_float64( 0x3ff0000000000000ll ), /* 1 */
2742 const_float64( 0x3fe0000000000000ll ), /* 2 */
2743 const_float64( 0x3fc5555555555555ll ), /* 3 */
2744 const_float64( 0x3fa5555555555555ll ), /* 4 */
2745 const_float64( 0x3f81111111111111ll ), /* 5 */
2746 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2747 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2748 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2749 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2750 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2751 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2752 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2753 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2754 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2755 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2756};
2757
e5a41ffa 2758float32 float32_exp2(float32 a, float_status *status)
8229c991
AJ
2759{
2760 flag aSign;
94a49d86 2761 int_fast16_t aExp;
bb98fe42 2762 uint32_t aSig;
8229c991
AJ
2763 float64 r, x, xn;
2764 int i;
ff32e16e 2765 a = float32_squash_input_denormal(a, status);
8229c991
AJ
2766
2767 aSig = extractFloat32Frac( a );
2768 aExp = extractFloat32Exp( a );
2769 aSign = extractFloat32Sign( a );
2770
2771 if ( aExp == 0xFF) {
ff32e16e
PM
2772 if (aSig) {
2773 return propagateFloat32NaN(a, float32_zero, status);
2774 }
8229c991
AJ
2775 return (aSign) ? float32_zero : a;
2776 }
2777 if (aExp == 0) {
2778 if (aSig == 0) return float32_one;
2779 }
2780
ff32e16e 2781 float_raise(float_flag_inexact, status);
8229c991
AJ
2782
2783 /* ******************************* */
2784 /* using float64 for approximation */
2785 /* ******************************* */
ff32e16e
PM
2786 x = float32_to_float64(a, status);
2787 x = float64_mul(x, float64_ln2, status);
8229c991
AJ
2788
2789 xn = x;
2790 r = float64_one;
2791 for (i = 0 ; i < 15 ; i++) {
2792 float64 f;
2793
ff32e16e
PM
2794 f = float64_mul(xn, float32_exp2_coefficients[i], status);
2795 r = float64_add(r, f, status);
8229c991 2796
ff32e16e 2797 xn = float64_mul(xn, x, status);
8229c991
AJ
2798 }
2799
2800 return float64_to_float32(r, status);
2801}
2802
374dfc33
AJ
2803/*----------------------------------------------------------------------------
2804| Returns the binary log of the single-precision floating-point value `a'.
2805| The operation is performed according to the IEC/IEEE Standard for Binary
2806| Floating-Point Arithmetic.
2807*----------------------------------------------------------------------------*/
e5a41ffa 2808float32 float32_log2(float32 a, float_status *status)
374dfc33
AJ
2809{
2810 flag aSign, zSign;
94a49d86 2811 int_fast16_t aExp;
bb98fe42 2812 uint32_t aSig, zSig, i;
374dfc33 2813
ff32e16e 2814 a = float32_squash_input_denormal(a, status);
374dfc33
AJ
2815 aSig = extractFloat32Frac( a );
2816 aExp = extractFloat32Exp( a );
2817 aSign = extractFloat32Sign( a );
2818
2819 if ( aExp == 0 ) {
2820 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2821 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2822 }
2823 if ( aSign ) {
ff32e16e 2824 float_raise(float_flag_invalid, status);
374dfc33
AJ
2825 return float32_default_nan;
2826 }
2827 if ( aExp == 0xFF ) {
ff32e16e
PM
2828 if (aSig) {
2829 return propagateFloat32NaN(a, float32_zero, status);
2830 }
374dfc33
AJ
2831 return a;
2832 }
2833
2834 aExp -= 0x7F;
2835 aSig |= 0x00800000;
2836 zSign = aExp < 0;
2837 zSig = aExp << 23;
2838
2839 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2840 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2841 if ( aSig & 0x01000000 ) {
2842 aSig >>= 1;
2843 zSig |= i;
2844 }
2845 }
2846
2847 if ( zSign )
2848 zSig = -zSig;
2849
ff32e16e 2850 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
374dfc33
AJ
2851}
2852
158142c2
FB
2853/*----------------------------------------------------------------------------
2854| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2855| the corresponding value `b', and 0 otherwise. The invalid exception is
2856| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2857| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2858*----------------------------------------------------------------------------*/
2859
e5a41ffa 2860int float32_eq(float32 a, float32 b, float_status *status)
158142c2 2861{
b689362d 2862 uint32_t av, bv;
ff32e16e
PM
2863 a = float32_squash_input_denormal(a, status);
2864 b = float32_squash_input_denormal(b, status);
158142c2
FB
2865
2866 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2867 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2868 ) {
ff32e16e 2869 float_raise(float_flag_invalid, status);
158142c2
FB
2870 return 0;
2871 }
b689362d
AJ
2872 av = float32_val(a);
2873 bv = float32_val(b);
2874 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2875}
2876
2877/*----------------------------------------------------------------------------
2878| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2879| or equal to the corresponding value `b', and 0 otherwise. The invalid
2880| exception is raised if either operand is a NaN. The comparison is performed
2881| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2882*----------------------------------------------------------------------------*/
2883
e5a41ffa 2884int float32_le(float32 a, float32 b, float_status *status)
158142c2
FB
2885{
2886 flag aSign, bSign;
bb98fe42 2887 uint32_t av, bv;
ff32e16e
PM
2888 a = float32_squash_input_denormal(a, status);
2889 b = float32_squash_input_denormal(b, status);
158142c2
FB
2890
2891 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2892 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2893 ) {
ff32e16e 2894 float_raise(float_flag_invalid, status);
158142c2
FB
2895 return 0;
2896 }
2897 aSign = extractFloat32Sign( a );
2898 bSign = extractFloat32Sign( b );
f090c9d4
PB
2899 av = float32_val(a);
2900 bv = float32_val(b);
bb98fe42 2901 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2902 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2903
2904}
2905
2906/*----------------------------------------------------------------------------
2907| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2908| the corresponding value `b', and 0 otherwise. The invalid exception is
2909| raised if either operand is a NaN. The comparison is performed according
2910| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2911*----------------------------------------------------------------------------*/
2912
e5a41ffa 2913int float32_lt(float32 a, float32 b, float_status *status)
158142c2
FB
2914{
2915 flag aSign, bSign;
bb98fe42 2916 uint32_t av, bv;
ff32e16e
PM
2917 a = float32_squash_input_denormal(a, status);
2918 b = float32_squash_input_denormal(b, status);
158142c2
FB
2919
2920 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2921 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2922 ) {
ff32e16e 2923 float_raise(float_flag_invalid, status);
158142c2
FB
2924 return 0;
2925 }
2926 aSign = extractFloat32Sign( a );
2927 bSign = extractFloat32Sign( b );
f090c9d4
PB
2928 av = float32_val(a);
2929 bv = float32_val(b);
bb98fe42 2930 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2931 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2932
2933}
2934
67b7861d
AJ
2935/*----------------------------------------------------------------------------
2936| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2937| be compared, and 0 otherwise. The invalid exception is raised if either
2938| operand is a NaN. The comparison is performed according to the IEC/IEEE
2939| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2940*----------------------------------------------------------------------------*/
2941
e5a41ffa 2942int float32_unordered(float32 a, float32 b, float_status *status)
67b7861d 2943{
ff32e16e
PM
2944 a = float32_squash_input_denormal(a, status);
2945 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
2946
2947 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2948 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2949 ) {
ff32e16e 2950 float_raise(float_flag_invalid, status);
67b7861d
AJ
2951 return 1;
2952 }
2953 return 0;
2954}
b689362d 2955
158142c2
FB
2956/*----------------------------------------------------------------------------
2957| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2958| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2959| exception. The comparison is performed according to the IEC/IEEE Standard
2960| for Binary Floating-Point Arithmetic.
158142c2
FB
2961*----------------------------------------------------------------------------*/
2962
e5a41ffa 2963int float32_eq_quiet(float32 a, float32 b, float_status *status)
158142c2 2964{
ff32e16e
PM
2965 a = float32_squash_input_denormal(a, status);
2966 b = float32_squash_input_denormal(b, status);
158142c2
FB
2967
2968 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2969 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2970 ) {
b689362d 2971 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 2972 float_raise(float_flag_invalid, status);
b689362d 2973 }
158142c2
FB
2974 return 0;
2975 }
b689362d
AJ
2976 return ( float32_val(a) == float32_val(b) ) ||
2977 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2978}
2979
2980/*----------------------------------------------------------------------------
2981| Returns 1 if the single-precision floating-point value `a' is less than or
2982| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2983| cause an exception. Otherwise, the comparison is performed according to the
2984| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2985*----------------------------------------------------------------------------*/
2986
e5a41ffa 2987int float32_le_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
2988{
2989 flag aSign, bSign;
bb98fe42 2990 uint32_t av, bv;
ff32e16e
PM
2991 a = float32_squash_input_denormal(a, status);
2992 b = float32_squash_input_denormal(b, status);
158142c2
FB
2993
2994 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2995 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2996 ) {
2997 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 2998 float_raise(float_flag_invalid, status);
158142c2
FB
2999 }
3000 return 0;
3001 }
3002 aSign = extractFloat32Sign( a );
3003 bSign = extractFloat32Sign( b );
f090c9d4
PB
3004 av = float32_val(a);
3005 bv = float32_val(b);
bb98fe42 3006 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3007 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3008
3009}
3010
3011/*----------------------------------------------------------------------------
3012| Returns 1 if the single-precision floating-point value `a' is less than
3013| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3014| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3015| Standard for Binary Floating-Point Arithmetic.
3016*----------------------------------------------------------------------------*/
3017
e5a41ffa 3018int float32_lt_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
3019{
3020 flag aSign, bSign;
bb98fe42 3021 uint32_t av, bv;
ff32e16e
PM
3022 a = float32_squash_input_denormal(a, status);
3023 b = float32_squash_input_denormal(b, status);
158142c2
FB
3024
3025 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3026 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3027 ) {
3028 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 3029 float_raise(float_flag_invalid, status);
158142c2
FB
3030 }
3031 return 0;
3032 }
3033 aSign = extractFloat32Sign( a );
3034 bSign = extractFloat32Sign( b );
f090c9d4
PB
3035 av = float32_val(a);
3036 bv = float32_val(b);
bb98fe42 3037 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3038 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3039
3040}
3041
67b7861d
AJ
3042/*----------------------------------------------------------------------------
3043| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3044| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3045| comparison is performed according to the IEC/IEEE Standard for Binary
3046| Floating-Point Arithmetic.
3047*----------------------------------------------------------------------------*/
3048
e5a41ffa 3049int float32_unordered_quiet(float32 a, float32 b, float_status *status)
67b7861d 3050{
ff32e16e
PM
3051 a = float32_squash_input_denormal(a, status);
3052 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
3053
3054 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3055 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3056 ) {
3057 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
ff32e16e 3058 float_raise(float_flag_invalid, status);
67b7861d
AJ
3059 }
3060 return 1;
3061 }
3062 return 0;
3063}
3064
158142c2
FB
3065/*----------------------------------------------------------------------------
3066| Returns the result of converting the double-precision floating-point value
3067| `a' to the 32-bit two's complement integer format. The conversion is
3068| performed according to the IEC/IEEE Standard for Binary Floating-Point
3069| Arithmetic---which means in particular that the conversion is rounded
3070| according to the current rounding mode. If `a' is a NaN, the largest
3071| positive integer is returned. Otherwise, if the conversion overflows, the
3072| largest integer with the same sign as `a' is returned.
3073*----------------------------------------------------------------------------*/
3074
f4014512 3075int32_t float64_to_int32(float64 a, float_status *status)
158142c2
FB
3076{
3077 flag aSign;
94a49d86 3078 int_fast16_t aExp, shiftCount;
bb98fe42 3079 uint64_t aSig;
ff32e16e 3080 a = float64_squash_input_denormal(a, status);
158142c2
FB
3081
3082 aSig = extractFloat64Frac( a );
3083 aExp = extractFloat64Exp( a );
3084 aSign = extractFloat64Sign( a );
3085 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3086 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3087 shiftCount = 0x42C - aExp;
3088 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 3089 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
3090
3091}
3092
3093/*----------------------------------------------------------------------------
3094| Returns the result of converting the double-precision floating-point value
3095| `a' to the 32-bit two's complement integer format. The conversion is
3096| performed according to the IEC/IEEE Standard for Binary Floating-Point
3097| Arithmetic, except that the conversion is always rounded toward zero.
3098| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3099| the conversion overflows, the largest integer with the same sign as `a' is
3100| returned.
3101*----------------------------------------------------------------------------*/
3102
f4014512 3103int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
158142c2
FB
3104{
3105 flag aSign;
94a49d86 3106 int_fast16_t aExp, shiftCount;
bb98fe42 3107 uint64_t aSig, savedASig;
b3a6a2e0 3108 int32_t z;
ff32e16e 3109 a = float64_squash_input_denormal(a, status);
158142c2
FB
3110
3111 aSig = extractFloat64Frac( a );
3112 aExp = extractFloat64Exp( a );
3113 aSign = extractFloat64Sign( a );
3114 if ( 0x41E < aExp ) {
3115 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3116 goto invalid;
3117 }
3118 else if ( aExp < 0x3FF ) {
a2f2d288
PM
3119 if (aExp || aSig) {
3120 status->float_exception_flags |= float_flag_inexact;
3121 }
158142c2
FB
3122 return 0;
3123 }
3124 aSig |= LIT64( 0x0010000000000000 );
3125 shiftCount = 0x433 - aExp;
3126 savedASig = aSig;
3127 aSig >>= shiftCount;
3128 z = aSig;
3129 if ( aSign ) z = - z;
3130 if ( ( z < 0 ) ^ aSign ) {
3131 invalid:
ff32e16e 3132 float_raise(float_flag_invalid, status);
bb98fe42 3133 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
3134 }
3135 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 3136 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
3137 }
3138 return z;
3139
3140}
3141
cbcef455
PM
3142/*----------------------------------------------------------------------------
3143| Returns the result of converting the double-precision floating-point value
3144| `a' to the 16-bit two's complement integer format. The conversion is
3145| performed according to the IEC/IEEE Standard for Binary Floating-Point
3146| Arithmetic, except that the conversion is always rounded toward zero.
3147| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3148| the conversion overflows, the largest integer with the same sign as `a' is
3149| returned.
3150*----------------------------------------------------------------------------*/
3151
0bb721d7 3152int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
3153{
3154 flag aSign;
94a49d86 3155 int_fast16_t aExp, shiftCount;
bb98fe42 3156 uint64_t aSig, savedASig;
f4014512 3157 int32_t z;
cbcef455
PM
3158
3159 aSig = extractFloat64Frac( a );
3160 aExp = extractFloat64Exp( a );
3161 aSign = extractFloat64Sign( a );
3162 if ( 0x40E < aExp ) {
3163 if ( ( aExp == 0x7FF ) && aSig ) {
3164 aSign = 0;
3165 }
3166 goto invalid;
3167 }
3168 else if ( aExp < 0x3FF ) {
3169 if ( aExp || aSig ) {
a2f2d288 3170 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
3171 }
3172 return 0;
3173 }
3174 aSig |= LIT64( 0x0010000000000000 );
3175 shiftCount = 0x433 - aExp;
3176 savedASig = aSig;
3177 aSig >>= shiftCount;
3178 z = aSig;
3179 if ( aSign ) {
3180 z = - z;
3181 }
3182 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3183 invalid:
ff32e16e 3184 float_raise(float_flag_invalid, status);
bb98fe42 3185 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
3186 }
3187 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 3188 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
3189 }
3190 return z;
3191}
3192
158142c2
FB
3193/*----------------------------------------------------------------------------
3194| Returns the result of converting the double-precision floating-point value
3195| `a' to the 64-bit two's complement integer format. The conversion is
3196| performed according to the IEC/IEEE Standard for Binary Floating-Point
3197| Arithmetic---which means in particular that the conversion is rounded
3198| according to the current rounding mode. If `a' is a NaN, the largest
3199| positive integer is returned. Otherwise, if the conversion overflows, the
3200| largest integer with the same sign as `a' is returned.
3201*----------------------------------------------------------------------------*/
3202
f42c2224 3203int64_t float64_to_int64(float64 a, float_status *status)
158142c2
FB
3204{
3205 flag aSign;
94a49d86 3206 int_fast16_t aExp, shiftCount;
bb98fe42 3207 uint64_t aSig, aSigExtra;
ff32e16e 3208 a = float64_squash_input_denormal(a, status);
158142c2
FB
3209
3210 aSig = extractFloat64Frac( a );
3211 aExp = extractFloat64Exp( a );
3212 aSign = extractFloat64Sign( a );
3213 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3214 shiftCount = 0x433 - aExp;
3215 if ( shiftCount <= 0 ) {
3216 if ( 0x43E < aExp ) {
ff32e16e 3217 float_raise(float_flag_invalid, status);
158142c2
FB
3218 if ( ! aSign
3219 || ( ( aExp == 0x7FF )
3220 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3221 ) {
3222 return LIT64( 0x7FFFFFFFFFFFFFFF );
3223 }
bb98fe42 3224 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3225 }
3226 aSigExtra = 0;
3227 aSig <<= - shiftCount;
3228 }
3229 else {
3230 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3231 }
ff32e16e 3232 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
3233
3234}
3235
3236/*----------------------------------------------------------------------------
3237| Returns the result of converting the double-precision floating-point value
3238| `a' to the 64-bit two's complement integer format. The conversion is
3239| performed according to the IEC/IEEE Standard for Binary Floating-Point
3240| Arithmetic, except that the conversion is always rounded toward zero.
3241| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3242| the conversion overflows, the largest integer with the same sign as `a' is
3243| returned.
3244*----------------------------------------------------------------------------*/
3245
f42c2224 3246int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
158142c2
FB
3247{
3248 flag aSign;
94a49d86 3249 int_fast16_t aExp, shiftCount;
bb98fe42 3250 uint64_t aSig;
f42c2224 3251 int64_t z;
ff32e16e 3252 a = float64_squash_input_denormal(a, status);
158142c2
FB
3253
3254 aSig = extractFloat64Frac( a );
3255 aExp = extractFloat64Exp( a );
3256 aSign = extractFloat64Sign( a );
3257 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3258 shiftCount = aExp - 0x433;
3259 if ( 0 <= shiftCount ) {
3260 if ( 0x43E <= aExp ) {
f090c9d4 3261 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
ff32e16e 3262 float_raise(float_flag_invalid, status);
158142c2
FB
3263 if ( ! aSign
3264 || ( ( aExp == 0x7FF )
3265 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3266 ) {
3267 return LIT64( 0x7FFFFFFFFFFFFFFF );
3268 }
3269 }
bb98fe42 3270 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3271 }
3272 z = aSig<<shiftCount;
3273 }
3274 else {
3275 if ( aExp < 0x3FE ) {
a2f2d288
PM
3276 if (aExp | aSig) {
3277 status->float_exception_flags |= float_flag_inexact;
3278 }
158142c2
FB
3279 return 0;
3280 }
3281 z = aSig>>( - shiftCount );
bb98fe42 3282 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 3283 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
3284 }
3285 }
3286 if ( aSign ) z = - z;
3287 return z;
3288
3289}
3290
3291/*----------------------------------------------------------------------------
3292| Returns the result of converting the double-precision floating-point value
3293| `a' to the single-precision floating-point format. The conversion is
3294| performed according to the IEC/IEEE Standard for Binary Floating-Point
3295| Arithmetic.
3296*----------------------------------------------------------------------------*/
3297
e5a41ffa 3298float32 float64_to_float32(float64 a, float_status *status)
158142c2
FB
3299{
3300 flag aSign;
94a49d86 3301 int_fast16_t aExp;
bb98fe42
AF
3302 uint64_t aSig;
3303 uint32_t zSig;
ff32e16e 3304 a = float64_squash_input_denormal(a, status);
158142c2
FB
3305
3306 aSig = extractFloat64Frac( a );
3307 aExp = extractFloat64Exp( a );
3308 aSign = extractFloat64Sign( a );
3309 if ( aExp == 0x7FF ) {
ff32e16e
PM
3310 if (aSig) {
3311 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3312 }
158142c2
FB
3313 return packFloat32( aSign, 0xFF, 0 );
3314 }
3315 shift64RightJamming( aSig, 22, &aSig );
3316 zSig = aSig;
3317 if ( aExp || zSig ) {
3318 zSig |= 0x40000000;
3319 aExp -= 0x381;
3320 }
ff32e16e 3321 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
3322
3323}
3324
60011498
PB
3325
3326/*----------------------------------------------------------------------------
3327| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3328| half-precision floating-point value, returning the result. After being
3329| shifted into the proper positions, the three fields are simply added
3330| together to form the result. This means that any integer portion of `zSig'
3331| will be added into the exponent. Since a properly normalized significand
3332| will have an integer portion equal to 1, the `zExp' input should be 1 less
3333| than the desired result exponent whenever `zSig' is a complete, normalized
3334| significand.
3335*----------------------------------------------------------------------------*/
94a49d86 3336static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
60011498 3337{
bb4d4bb3 3338 return make_float16(
bb98fe42 3339 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3340}
3341
c4a1c5e7
PM
3342/*----------------------------------------------------------------------------
3343| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3344| and significand `zSig', and returns the proper half-precision floating-
3345| point value corresponding to the abstract input. Ordinarily, the abstract
3346| value is simply rounded and packed into the half-precision format, with
3347| the inexact exception raised if the abstract input cannot be represented
3348| exactly. However, if the abstract value is too large, the overflow and
3349| inexact exceptions are raised and an infinity or maximal finite value is
3350| returned. If the abstract value is too small, the input value is rounded to
3351| a subnormal number, and the underflow and inexact exceptions are raised if
3352| the abstract input cannot be represented exactly as a subnormal half-
3353| precision floating-point number.
3354| The `ieee' flag indicates whether to use IEEE standard half precision, or
3355| ARM-style "alternative representation", which omits the NaN and Inf
3356| encodings in order to raise the maximum representable exponent by one.
3357| The input significand `zSig' has its binary point between bits 22
3358| and 23, which is 13 bits to the left of the usual location. This shifted
3359| significand must be normalized or smaller. If `zSig' is not normalized,
3360| `zExp' must be 0; in that case, the result returned is a subnormal number,
3361| and it must not require rounding. In the usual case that `zSig' is
3362| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3363| Note the slightly odd position of the binary point in zSig compared with the
3364| other roundAndPackFloat functions. This should probably be fixed if we
3365| need to implement more float16 routines than just conversion.
3366| The handling of underflow and overflow follows the IEC/IEEE Standard for
3367| Binary Floating-Point Arithmetic.
3368*----------------------------------------------------------------------------*/
3369
7ceac86f 3370static float16 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
e5a41ffa
PM
3371 uint32_t zSig, flag ieee,
3372 float_status *status)
c4a1c5e7
PM
3373{
3374 int maxexp = ieee ? 29 : 30;
3375 uint32_t mask;
3376 uint32_t increment;
c4a1c5e7
PM
3377 bool rounding_bumps_exp;
3378 bool is_tiny = false;
3379
3380 /* Calculate the mask of bits of the mantissa which are not
3381 * representable in half-precision and will be lost.
3382 */
3383 if (zExp < 1) {
3384 /* Will be denormal in halfprec */
3385 mask = 0x00ffffff;
3386 if (zExp >= -11) {
3387 mask >>= 11 + zExp;
3388 }
3389 } else {
3390 /* Normal number in halfprec */
3391 mask = 0x00001fff;
3392 }
3393
a2f2d288 3394 switch (status->float_rounding_mode) {
c4a1c5e7
PM
3395 case float_round_nearest_even:
3396 increment = (mask + 1) >> 1;
3397 if ((zSig & mask) == increment) {
3398 increment = zSig & (increment << 1);
3399 }
3400 break;
f9288a76
PM
3401 case float_round_ties_away:
3402 increment = (mask + 1) >> 1;
3403 break;
c4a1c5e7
PM
3404 case float_round_up:
3405 increment = zSign ? 0 : mask;
3406 break;
3407 case float_round_down:
3408 increment = zSign ? mask : 0;
3409 break;
3410 default: /* round_to_zero */
3411 increment = 0;
3412 break;
3413 }
3414
3415 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3416
3417 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3418 if (ieee) {
ff32e16e 3419 float_raise(float_flag_overflow | float_flag_inexact, status);
c4a1c5e7
PM
3420 return packFloat16(zSign, 0x1f, 0);
3421 } else {
ff32e16e 3422 float_raise(float_flag_invalid, status);
c4a1c5e7
PM
3423 return packFloat16(zSign, 0x1f, 0x3ff);
3424 }
3425 }
3426
3427 if (zExp < 0) {
3428 /* Note that flush-to-zero does not affect half-precision results */
3429 is_tiny =
a2f2d288 3430 (status->float_detect_tininess == float_tininess_before_rounding)
c4a1c5e7
PM
3431 || (zExp < -1)
3432 || (!rounding_bumps_exp);
3433 }
3434 if (zSig & mask) {
ff32e16e 3435 float_raise(float_flag_inexact, status);
c4a1c5e7 3436 if (is_tiny) {
ff32e16e 3437 float_raise(float_flag_underflow, status);
c4a1c5e7
PM
3438 }
3439 }
3440
3441 zSig += increment;
3442 if (rounding_bumps_exp) {
3443 zSig >>= 1;
3444 zExp++;
3445 }
3446
3447 if (zExp < -10) {
3448 return packFloat16(zSign, 0, 0);
3449 }
3450 if (zExp < 0) {
3451 zSig >>= -zExp;
3452 zExp = 0;
3453 }
3454 return packFloat16(zSign, zExp, zSig >> 13);
3455}
3456
/*----------------------------------------------------------------------------
| Normalizes the half-precision subnormal significand `aSig'.  The number of
| leading zeros of `aSig' (as a 32-bit value) minus 21 gives the left shift
| applied; the shifted significand is stored through `zSigPtr' and the
| matching exponent, 1 minus the shift, through `zExpPtr'.
*----------------------------------------------------------------------------*/
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t shift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
3464
60011498
PB
3465/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3466 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3 3467
e5a41ffa 3468float32 float16_to_float32(float16 a, flag ieee, float_status *status)
60011498
PB
3469{
3470 flag aSign;
94a49d86 3471 int_fast16_t aExp;
bb98fe42 3472 uint32_t aSig;
60011498 3473
bb4d4bb3
PM
3474 aSign = extractFloat16Sign(a);
3475 aExp = extractFloat16Exp(a);
3476 aSig = extractFloat16Frac(a);
60011498
PB
3477
3478 if (aExp == 0x1f && ieee) {
3479 if (aSig) {
ff32e16e 3480 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
60011498 3481 }
4be8eeac 3482 return packFloat32(aSign, 0xff, 0);
60011498
PB
3483 }
3484 if (aExp == 0) {
60011498
PB
3485 if (aSig == 0) {
3486 return packFloat32(aSign, 0, 0);
3487 }
3488
c4a1c5e7
PM
3489 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3490 aExp--;
60011498
PB
3491 }
3492 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3493}
3494
e5a41ffa 3495float16 float32_to_float16(float32 a, flag ieee, float_status *status)
60011498
PB
3496{
3497 flag aSign;
94a49d86 3498 int_fast16_t aExp;
bb98fe42 3499 uint32_t aSig;
38970efa 3500
ff32e16e 3501 a = float32_squash_input_denormal(a, status);
60011498
PB
3502
3503 aSig = extractFloat32Frac( a );
3504 aExp = extractFloat32Exp( a );
3505 aSign = extractFloat32Sign( a );
3506 if ( aExp == 0xFF ) {
3507 if (aSig) {
600e30d2 3508 /* Input is a NaN */
600e30d2 3509 if (!ieee) {
ff32e16e 3510 float_raise(float_flag_invalid, status);
600e30d2
PM
3511 return packFloat16(aSign, 0, 0);
3512 }
38970efa 3513 return commonNaNToFloat16(
ff32e16e 3514 float32ToCommonNaN(a, status), status);
60011498 3515 }
600e30d2
PM
3516 /* Infinity */
3517 if (!ieee) {
ff32e16e 3518 float_raise(float_flag_invalid, status);
600e30d2
PM
3519 return packFloat16(aSign, 0x1f, 0x3ff);
3520 }
3521 return packFloat16(aSign, 0x1f, 0);
60011498 3522 }
600e30d2 3523 if (aExp == 0 && aSig == 0) {
60011498
PB
3524 return packFloat16(aSign, 0, 0);
3525 }
38970efa
PM
3526 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3527 * even if the input is denormal; however this is harmless because
3528 * the largest possible single-precision denormal is still smaller
3529 * than the smallest representable half-precision denormal, and so we
3530 * will end up ignoring aSig and returning via the "always return zero"
3531 * codepath.
3532 */
60011498 3533 aSig |= 0x00800000;
c4a1c5e7 3534 aExp -= 0x71;
60011498 3535
ff32e16e 3536 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
60011498
PB
3537}
3538
e5a41ffa 3539float64 float16_to_float64(float16 a, flag ieee, float_status *status)
14c9a07e
PM
3540{
3541 flag aSign;
3542 int_fast16_t aExp;
3543 uint32_t aSig;
3544
3545 aSign = extractFloat16Sign(a);
3546 aExp = extractFloat16Exp(a);
3547 aSig = extractFloat16Frac(a);
3548
3549 if (aExp == 0x1f && ieee) {
3550 if (aSig) {
3551 return commonNaNToFloat64(
ff32e16e 3552 float16ToCommonNaN(a, status), status);
14c9a07e
PM
3553 }
3554 return packFloat64(aSign, 0x7ff, 0);
3555 }
3556 if (aExp == 0) {
3557 if (aSig == 0) {
3558 return packFloat64(aSign, 0, 0);
3559 }
3560
3561 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3562 aExp--;
3563 }
3564 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3565}
3566
e5a41ffa 3567float16 float64_to_float16(float64 a, flag ieee, float_status *status)
14c9a07e
PM
3568{
3569 flag aSign;
3570 int_fast16_t aExp;
3571 uint64_t aSig;
3572 uint32_t zSig;
3573
ff32e16e 3574 a = float64_squash_input_denormal(a, status);
14c9a07e
PM
3575
3576 aSig = extractFloat64Frac(a);
3577 aExp = extractFloat64Exp(a);
3578 aSign = extractFloat64Sign(a);
3579 if (aExp == 0x7FF) {
3580 if (aSig) {
3581 /* Input is a NaN */
3582 if (!ieee) {
ff32e16e 3583 float_raise(float_flag_invalid, status);
14c9a07e
PM
3584 return packFloat16(aSign, 0, 0);
3585 }
3586 return commonNaNToFloat16(
ff32e16e 3587 float64ToCommonNaN(a, status), status);
14c9a07e
PM
3588 }
3589 /* Infinity */
3590 if (!ieee) {
ff32e16e 3591 float_raise(float_flag_invalid, status);
14c9a07e
PM
3592 return packFloat16(aSign, 0x1f, 0x3ff);
3593 }
3594 return packFloat16(aSign, 0x1f, 0);
3595 }
3596 shift64RightJamming(aSig, 29, &aSig);
3597 zSig = aSig;
3598 if (aExp == 0 && zSig == 0) {
3599 return packFloat16(aSign, 0, 0);
3600 }
3601 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3602 * even if the input is denormal; however this is harmless because
3603 * the largest possible single-precision denormal is still smaller
3604 * than the smallest representable half-precision denormal, and so we
3605 * will end up ignoring aSig and returning via the "always return zero"
3606 * codepath.
3607 */
3608 zSig |= 0x00800000;
3609 aExp -= 0x3F1;
3610
ff32e16e 3611 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
14c9a07e
PM
3612}
3613
158142c2
FB
3614/*----------------------------------------------------------------------------
3615| Returns the result of converting the double-precision floating-point value
3616| `a' to the extended double-precision floating-point format. The conversion
3617| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3618| Arithmetic.
3619*----------------------------------------------------------------------------*/
3620
e5a41ffa 3621floatx80 float64_to_floatx80(float64 a, float_status *status)
158142c2
FB
3622{
3623 flag aSign;
94a49d86 3624 int_fast16_t aExp;
bb98fe42 3625 uint64_t aSig;
158142c2 3626
ff32e16e 3627 a = float64_squash_input_denormal(a, status);
158142c2
FB
3628 aSig = extractFloat64Frac( a );
3629 aExp = extractFloat64Exp( a );
3630 aSign = extractFloat64Sign( a );
3631 if ( aExp == 0x7FF ) {
ff32e16e
PM
3632 if (aSig) {
3633 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3634 }
158142c2
FB
3635 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3636 }
3637 if ( aExp == 0 ) {
3638 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3639 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3640 }
3641 return
3642 packFloatx80(
3643 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3644
3645}
3646
158142c2
FB
3647/*----------------------------------------------------------------------------
3648| Returns the result of converting the double-precision floating-point value
3649| `a' to the quadruple-precision floating-point format. The conversion is
3650| performed according to the IEC/IEEE Standard for Binary Floating-Point
3651| Arithmetic.
3652*----------------------------------------------------------------------------*/
3653
e5a41ffa 3654float128 float64_to_float128(float64 a, float_status *status)
158142c2
FB
3655{
3656 flag aSign;
94a49d86 3657 int_fast16_t aExp;
bb98fe42 3658 uint64_t aSig, zSig0, zSig1;
158142c2 3659
ff32e16e 3660 a = float64_squash_input_denormal(a, status);
158142c2
FB
3661 aSig = extractFloat64Frac( a );
3662 aExp = extractFloat64Exp( a );
3663 aSign = extractFloat64Sign( a );
3664 if ( aExp == 0x7FF ) {
ff32e16e
PM
3665 if (aSig) {
3666 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3667 }
158142c2
FB
3668 return packFloat128( aSign, 0x7FFF, 0, 0 );
3669 }
3670 if ( aExp == 0 ) {
3671 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3672 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3673 --aExp;
3674 }
3675 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3676 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3677
3678}
3679
158142c2
FB
3680/*----------------------------------------------------------------------------
3681| Rounds the double-precision floating-point value `a' to an integer, and
3682| returns the result as a double-precision floating-point value. The
3683| operation is performed according to the IEC/IEEE Standard for Binary
3684| Floating-Point Arithmetic.
3685*----------------------------------------------------------------------------*/
3686
e5a41ffa 3687float64 float64_round_to_int(float64 a, float_status *status)
158142c2
FB
3688{
3689 flag aSign;
94a49d86 3690 int_fast16_t aExp;
bb98fe42 3691 uint64_t lastBitMask, roundBitsMask;
bb98fe42 3692 uint64_t z;
ff32e16e 3693 a = float64_squash_input_denormal(a, status);
158142c2
FB
3694
3695 aExp = extractFloat64Exp( a );
3696 if ( 0x433 <= aExp ) {
3697 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
ff32e16e 3698 return propagateFloat64NaN(a, a, status);
158142c2
FB
3699 }
3700 return a;
3701 }
3702 if ( aExp < 0x3FF ) {
bb98fe42 3703 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
a2f2d288 3704 status->float_exception_flags |= float_flag_inexact;
158142c2 3705 aSign = extractFloat64Sign( a );
a2f2d288 3706 switch (status->float_rounding_mode) {
158142c2
FB
3707 case float_round_nearest_even:
3708 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3709 return packFloat64( aSign, 0x3FF, 0 );
3710 }
3711 break;
f9288a76
PM
3712 case float_round_ties_away:
3713 if (aExp == 0x3FE) {
3714 return packFloat64(aSign, 0x3ff, 0);
3715 }
3716 break;
158142c2 3717 case float_round_down:
f090c9d4 3718 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3719 case float_round_up:
f090c9d4
PB
3720 return make_float64(
3721 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3722 }
3723 return packFloat64( aSign, 0, 0 );
3724 }
3725 lastBitMask = 1;
3726 lastBitMask <<= 0x433 - aExp;
3727 roundBitsMask = lastBitMask - 1;
f090c9d4 3728 z = float64_val(a);
a2f2d288 3729 switch (status->float_rounding_mode) {
dc355b76
PM
3730 case float_round_nearest_even:
3731 z += lastBitMask >> 1;
3732 if ((z & roundBitsMask) == 0) {
3733 z &= ~lastBitMask;
3734 }
3735 break;
f9288a76
PM
3736 case float_round_ties_away:
3737 z += lastBitMask >> 1;
3738 break;
dc355b76
PM
3739 case float_round_to_zero:
3740 break;
3741 case float_round_up:
3742 if (!extractFloat64Sign(make_float64(z))) {
3743 z += roundBitsMask;
3744 }
3745 break;
3746 case float_round_down:
3747 if (extractFloat64Sign(make_float64(z))) {
158142c2
FB
3748 z += roundBitsMask;
3749 }
dc355b76
PM
3750 break;
3751 default:
3752 abort();
158142c2
FB
3753 }
3754 z &= ~ roundBitsMask;
a2f2d288
PM
3755 if (z != float64_val(a)) {
3756 status->float_exception_flags |= float_flag_inexact;
3757 }
f090c9d4 3758 return make_float64(z);
158142c2
FB
3759
3760}
3761
e5a41ffa 3762float64 float64_trunc_to_int(float64 a, float_status *status)
e6e5906b
PB
3763{
3764 int oldmode;
3765 float64 res;
a2f2d288
PM
3766 oldmode = status->float_rounding_mode;
3767 status->float_rounding_mode = float_round_to_zero;
ff32e16e 3768 res = float64_round_to_int(a, status);
a2f2d288 3769 status->float_rounding_mode = oldmode;
e6e5906b
PB
3770 return res;
3771}
3772
158142c2
FB
3773/*----------------------------------------------------------------------------
3774| Returns the result of adding the absolute values of the double-precision
3775| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3776| before being returned. `zSign' is ignored if the result is a NaN.
3777| The addition is performed according to the IEC/IEEE Standard for Binary
3778| Floating-Point Arithmetic.
3779*----------------------------------------------------------------------------*/
3780
e5a41ffa
PM
3781static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3782 float_status *status)
158142c2 3783{
94a49d86 3784 int_fast16_t aExp, bExp, zExp;
bb98fe42 3785 uint64_t aSig, bSig, zSig;
94a49d86 3786 int_fast16_t expDiff;
158142c2
FB
3787
3788 aSig = extractFloat64Frac( a );
3789 aExp = extractFloat64Exp( a );
3790 bSig = extractFloat64Frac( b );
3791 bExp = extractFloat64Exp( b );
3792 expDiff = aExp - bExp;
3793 aSig <<= 9;
3794 bSig <<= 9;
3795 if ( 0 < expDiff ) {
3796 if ( aExp == 0x7FF ) {
ff32e16e
PM
3797 if (aSig) {
3798 return propagateFloat64NaN(a, b, status);
3799 }
158142c2
FB
3800 return a;
3801 }
3802 if ( bExp == 0 ) {
3803 --expDiff;
3804 }
3805 else {
3806 bSig |= LIT64( 0x2000000000000000 );
3807 }
3808 shift64RightJamming( bSig, expDiff, &bSig );
3809 zExp = aExp;
3810 }
3811 else if ( expDiff < 0 ) {
3812 if ( bExp == 0x7FF ) {
ff32e16e
PM
3813 if (bSig) {
3814 return propagateFloat64NaN(a, b, status);
3815 }
158142c2
FB
3816 return packFloat64( zSign, 0x7FF, 0 );
3817 }
3818 if ( aExp == 0 ) {
3819 ++expDiff;
3820 }
3821 else {
3822 aSig |= LIT64( 0x2000000000000000 );
3823 }
3824 shift64RightJamming( aSig, - expDiff, &aSig );
3825 zExp = bExp;
3826 }
3827 else {
3828 if ( aExp == 0x7FF ) {
ff32e16e
PM
3829 if (aSig | bSig) {
3830 return propagateFloat64NaN(a, b, status);
3831 }
158142c2
FB
3832 return a;
3833 }
fe76d976 3834 if ( aExp == 0 ) {
a2f2d288 3835 if (status->flush_to_zero) {
e6afc87f 3836 if (aSig | bSig) {
ff32e16e 3837 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
3838 }
3839 return packFloat64(zSign, 0, 0);
3840 }
fe76d976
PB
3841 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3842 }
158142c2
FB
3843 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3844 zExp = aExp;
3845 goto roundAndPack;
3846 }
3847 aSig |= LIT64( 0x2000000000000000 );
3848 zSig = ( aSig + bSig )<<1;
3849 --zExp;
bb98fe42 3850 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3851 zSig = aSig + bSig;
3852 ++zExp;
3853 }
3854 roundAndPack:
ff32e16e 3855 return roundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
3856
3857}
3858
3859/*----------------------------------------------------------------------------
3860| Returns the result of subtracting the absolute values of the double-
3861| precision floating-point values `a' and `b'. If `zSign' is 1, the
3862| difference is negated before being returned. `zSign' is ignored if the
3863| result is a NaN. The subtraction is performed according to the IEC/IEEE
3864| Standard for Binary Floating-Point Arithmetic.
3865*----------------------------------------------------------------------------*/
3866
e5a41ffa
PM
3867static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3868 float_status *status)
158142c2 3869{
94a49d86 3870 int_fast16_t aExp, bExp, zExp;
bb98fe42 3871 uint64_t aSig, bSig, zSig;
94a49d86 3872 int_fast16_t expDiff;
158142c2
FB
3873
3874 aSig = extractFloat64Frac( a );
3875 aExp = extractFloat64Exp( a );
3876 bSig = extractFloat64Frac( b );
3877 bExp = extractFloat64Exp( b );
3878 expDiff = aExp - bExp;
3879 aSig <<= 10;
3880 bSig <<= 10;
3881 if ( 0 < expDiff ) goto aExpBigger;
3882 if ( expDiff < 0 ) goto bExpBigger;
3883 if ( aExp == 0x7FF ) {
ff32e16e
PM
3884 if (aSig | bSig) {
3885 return propagateFloat64NaN(a, b, status);
3886 }
3887 float_raise(float_flag_invalid, status);
158142c2
FB
3888 return float64_default_nan;
3889 }
3890 if ( aExp == 0 ) {
3891 aExp = 1;
3892 bExp = 1;
3893 }
3894 if ( bSig < aSig ) goto aBigger;
3895 if ( aSig < bSig ) goto bBigger;
a2f2d288 3896 return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
3897 bExpBigger:
3898 if ( bExp == 0x7FF ) {
ff32e16e
PM
3899 if (bSig) {
3900 return propagateFloat64NaN(a, b, status);
3901 }
158142c2
FB
3902 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3903 }
3904 if ( aExp == 0 ) {
3905 ++expDiff;
3906 }
3907 else {
3908 aSig |= LIT64( 0x4000000000000000 );
3909 }
3910 shift64RightJamming( aSig, - expDiff, &aSig );
3911 bSig |= LIT64( 0x4000000000000000 );
3912 bBigger:
3913 zSig = bSig - aSig;
3914 zExp = bExp;
3915 zSign ^= 1;
3916 goto normalizeRoundAndPack;
3917 aExpBigger:
3918 if ( aExp == 0x7FF ) {
ff32e16e
PM
3919 if (aSig) {
3920 return propagateFloat64NaN(a, b, status);
3921 }
158142c2
FB
3922 return a;
3923 }
3924 if ( bExp == 0 ) {
3925 --expDiff;
3926 }
3927 else {
3928 bSig |= LIT64( 0x4000000000000000 );
3929 }
3930 shift64RightJamming( bSig, expDiff, &bSig );
3931 aSig |= LIT64( 0x4000000000000000 );
3932 aBigger:
3933 zSig = aSig - bSig;
3934 zExp = aExp;
3935 normalizeRoundAndPack:
3936 --zExp;
ff32e16e 3937 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
3938
3939}
3940
3941/*----------------------------------------------------------------------------
3942| Returns the result of adding the double-precision floating-point values `a'
3943| and `b'. The operation is performed according to the IEC/IEEE Standard for
3944| Binary Floating-Point Arithmetic.
3945*----------------------------------------------------------------------------*/
3946
e5a41ffa 3947float64 float64_add(float64 a, float64 b, float_status *status)
158142c2
FB
3948{
3949 flag aSign, bSign;
ff32e16e
PM
3950 a = float64_squash_input_denormal(a, status);
3951 b = float64_squash_input_denormal(b, status);
158142c2
FB
3952
3953 aSign = extractFloat64Sign( a );
3954 bSign = extractFloat64Sign( b );
3955 if ( aSign == bSign ) {
ff32e16e 3956 return addFloat64Sigs(a, b, aSign, status);
158142c2
FB
3957 }
3958 else {
ff32e16e 3959 return subFloat64Sigs(a, b, aSign, status);
158142c2
FB
3960 }
3961
3962}
3963
3964/*----------------------------------------------------------------------------
3965| Returns the result of subtracting the double-precision floating-point values
3966| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3967| for Binary Floating-Point Arithmetic.
3968*----------------------------------------------------------------------------*/
3969
e5a41ffa 3970float64 float64_sub(float64 a, float64 b, float_status *status)
158142c2
FB
3971{
3972 flag aSign, bSign;
ff32e16e
PM
3973 a = float64_squash_input_denormal(a, status);
3974 b = float64_squash_input_denormal(b, status);
158142c2
FB
3975
3976 aSign = extractFloat64Sign( a );
3977 bSign = extractFloat64Sign( b );
3978 if ( aSign == bSign ) {
ff32e16e 3979 return subFloat64Sigs(a, b, aSign, status);
158142c2
FB
3980 }
3981 else {
ff32e16e 3982 return addFloat64Sigs(a, b, aSign, status);
158142c2
FB
3983 }
3984
3985}
3986
3987/*----------------------------------------------------------------------------
3988| Returns the result of multiplying the double-precision floating-point values
3989| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3990| for Binary Floating-Point Arithmetic.
3991*----------------------------------------------------------------------------*/
3992
e5a41ffa 3993float64 float64_mul(float64 a, float64 b, float_status *status)
158142c2
FB
3994{
3995 flag aSign, bSign, zSign;
94a49d86 3996 int_fast16_t aExp, bExp, zExp;
bb98fe42 3997 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3998
ff32e16e
PM
3999 a = float64_squash_input_denormal(a, status);
4000 b = float64_squash_input_denormal(b, status);
37d18660 4001
158142c2
FB
4002 aSig = extractFloat64Frac( a );
4003 aExp = extractFloat64Exp( a );
4004 aSign = extractFloat64Sign( a );
4005 bSig = extractFloat64Frac( b );
4006 bExp = extractFloat64Exp( b );
4007 bSign = extractFloat64Sign( b );
4008 zSign = aSign ^ bSign;
4009 if ( aExp == 0x7FF ) {
4010 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 4011 return propagateFloat64NaN(a, b, status);
158142c2
FB
4012 }
4013 if ( ( bExp | bSig ) == 0 ) {
ff32e16e 4014 float_raise(float_flag_invalid, status);
158142c2
FB
4015 return float64_default_nan;
4016 }
4017 return packFloat64( zSign, 0x7FF, 0 );
4018 }
4019 if ( bExp == 0x7FF ) {
ff32e16e
PM
4020 if (bSig) {
4021 return propagateFloat64NaN(a, b, status);
4022 }
158142c2 4023 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 4024 float_raise(float_flag_invalid, status);
158142c2
FB
4025 return float64_default_nan;
4026 }
4027 return packFloat64( zSign, 0x7FF, 0 );
4028 }
4029 if ( aExp == 0 ) {
4030 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4031 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4032 }
4033 if ( bExp == 0 ) {
4034 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4035 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4036 }
4037 zExp = aExp + bExp - 0x3FF;
4038 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4039 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4040 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4041 zSig0 |= ( zSig1 != 0 );
bb98fe42 4042 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
4043 zSig0 <<= 1;
4044 --zExp;
4045 }
ff32e16e 4046 return roundAndPackFloat64(zSign, zExp, zSig0, status);
158142c2
FB
4047
4048}
4049
4050/*----------------------------------------------------------------------------
4051| Returns the result of dividing the double-precision floating-point value `a'
4052| by the corresponding value `b'. The operation is performed according to
4053| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4054*----------------------------------------------------------------------------*/
4055
e5a41ffa 4056float64 float64_div(float64 a, float64 b, float_status *status)
158142c2
FB
4057{
4058 flag aSign, bSign, zSign;
94a49d86 4059 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
4060 uint64_t aSig, bSig, zSig;
4061 uint64_t rem0, rem1;
4062 uint64_t term0, term1;
ff32e16e
PM
4063 a = float64_squash_input_denormal(a, status);
4064 b = float64_squash_input_denormal(b, status);
158142c2
FB
4065
4066 aSig = extractFloat64Frac( a );
4067 aExp = extractFloat64Exp( a );
4068 aSign = extractFloat64Sign( a );
4069 bSig = extractFloat64Frac( b );
4070 bExp = extractFloat64Exp( b );
4071 bSign = extractFloat64Sign( b );
4072 zSign = aSign ^ bSign;
4073 if ( aExp == 0x7FF ) {
ff32e16e
PM
4074 if (aSig) {
4075 return propagateFloat64NaN(a, b, status);
4076 }
158142c2 4077 if ( bExp == 0x7FF ) {
ff32e16e
PM
4078 if (bSig) {
4079 return propagateFloat64NaN(a, b, status);
4080 }
4081 float_raise(float_flag_invalid, status);
158142c2
FB
4082 return float64_default_nan;
4083 }
4084 return packFloat64( zSign, 0x7FF, 0 );
4085 }
4086 if ( bExp == 0x7FF ) {
ff32e16e
PM
4087 if (bSig) {
4088 return propagateFloat64NaN(a, b, status);
4089 }
158142c2
FB
4090 return packFloat64( zSign, 0, 0 );
4091 }
4092 if ( bExp == 0 ) {
4093 if ( bSig == 0 ) {
4094 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 4095 float_raise(float_flag_invalid, status);
158142c2
FB
4096 return float64_default_nan;
4097 }
ff32e16e 4098 float_raise(float_flag_divbyzero, status);
158142c2
FB
4099 return packFloat64( zSign, 0x7FF, 0 );
4100 }
4101 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4102 }
4103 if ( aExp == 0 ) {
4104 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4105 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4106 }
4107 zExp = aExp - bExp + 0x3FD;
4108 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4109 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4110 if ( bSig <= ( aSig + aSig ) ) {
4111 aSig >>= 1;
4112 ++zExp;
4113 }
4114 zSig = estimateDiv128To64( aSig, 0, bSig );
4115 if ( ( zSig & 0x1FF ) <= 2 ) {
4116 mul64To128( bSig, zSig, &term0, &term1 );
4117 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4118 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4119 --zSig;
4120 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4121 }
4122 zSig |= ( rem1 != 0 );
4123 }
ff32e16e 4124 return roundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
4125
4126}
4127
4128/*----------------------------------------------------------------------------
4129| Returns the remainder of the double-precision floating-point value `a'
4130| with respect to the corresponding value `b'. The operation is performed
4131| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4132*----------------------------------------------------------------------------*/
4133
e5a41ffa 4134float64 float64_rem(float64 a, float64 b, float_status *status)
158142c2 4135{
ed086f3d 4136 flag aSign, zSign;
94a49d86 4137 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
4138 uint64_t aSig, bSig;
4139 uint64_t q, alternateASig;
4140 int64_t sigMean;
158142c2 4141
ff32e16e
PM
4142 a = float64_squash_input_denormal(a, status);
4143 b = float64_squash_input_denormal(b, status);
158142c2
FB
4144 aSig = extractFloat64Frac( a );
4145 aExp = extractFloat64Exp( a );
4146 aSign = extractFloat64Sign( a );
4147 bSig = extractFloat64Frac( b );
4148 bExp = extractFloat64Exp( b );
158142c2
FB
4149 if ( aExp == 0x7FF ) {
4150 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 4151 return propagateFloat64NaN(a, b, status);
158142c2 4152 }
ff32e16e 4153 float_raise(float_flag_invalid, status);
158142c2
FB
4154 return float64_default_nan;
4155 }
4156 if ( bExp == 0x7FF ) {
ff32e16e
PM
4157 if (bSig) {
4158 return propagateFloat64NaN(a, b, status);
4159 }
158142c2
FB
4160 return a;
4161 }
4162 if ( bExp == 0 ) {
4163 if ( bSig == 0 ) {
ff32e16e 4164 float_raise(float_flag_invalid, status);
158142c2
FB
4165 return float64_default_nan;
4166 }
4167 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4168 }
4169 if ( aExp == 0 ) {
4170 if ( aSig == 0 ) return a;
4171 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4172 }
4173 expDiff = aExp - bExp;
4174 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4175 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4176 if ( expDiff < 0 ) {
4177 if ( expDiff < -1 ) return a;
4178 aSig >>= 1;
4179 }
4180 q = ( bSig <= aSig );
4181 if ( q ) aSig -= bSig;
4182 expDiff -= 64;
4183 while ( 0 < expDiff ) {
4184 q = estimateDiv128To64( aSig, 0, bSig );
4185 q = ( 2 < q ) ? q - 2 : 0;
4186 aSig = - ( ( bSig>>2 ) * q );
4187 expDiff -= 62;
4188 }
4189 expDiff += 64;
4190 if ( 0 < expDiff ) {
4191 q = estimateDiv128To64( aSig, 0, bSig );
4192 q = ( 2 < q ) ? q - 2 : 0;
4193 q >>= 64 - expDiff;
4194 bSig >>= 2;
4195 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4196 }
4197 else {
4198 aSig >>= 2;
4199 bSig >>= 2;
4200 }
4201 do {
4202 alternateASig = aSig;
4203 ++q;
4204 aSig -= bSig;
bb98fe42 4205 } while ( 0 <= (int64_t) aSig );
158142c2
FB
4206 sigMean = aSig + alternateASig;
4207 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4208 aSig = alternateASig;
4209 }
bb98fe42 4210 zSign = ( (int64_t) aSig < 0 );
158142c2 4211 if ( zSign ) aSig = - aSig;
ff32e16e 4212 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
4213
4214}
4215
369be8f6
PM
4216/*----------------------------------------------------------------------------
4217| Returns the result of multiplying the double-precision floating-point values
4218| `a' and `b' then adding 'c', with no intermediate rounding step after the
4219| multiplication. The operation is performed according to the IEC/IEEE
4220| Standard for Binary Floating-Point Arithmetic 754-2008.
4221| The flags argument allows the caller to select negation of the
4222| addend, the intermediate product, or the final result. (The difference
4223| between this and having the caller do a separate negation is that negating
4224| externally will flip the sign bit on NaNs.)
4225*----------------------------------------------------------------------------*/
4226
e5a41ffa
PM
4227float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4228 float_status *status)
369be8f6
PM
4229{
4230 flag aSign, bSign, cSign, zSign;
94a49d86 4231 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
4232 uint64_t aSig, bSig, cSig;
4233 flag pInf, pZero, pSign;
4234 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4235 int shiftcount;
4236 flag signflip, infzero;
4237
ff32e16e
PM
4238 a = float64_squash_input_denormal(a, status);
4239 b = float64_squash_input_denormal(b, status);
4240 c = float64_squash_input_denormal(c, status);
369be8f6
PM
4241 aSig = extractFloat64Frac(a);
4242 aExp = extractFloat64Exp(a);
4243 aSign = extractFloat64Sign(a);
4244 bSig = extractFloat64Frac(b);
4245 bExp = extractFloat64Exp(b);
4246 bSign = extractFloat64Sign(b);
4247 cSig = extractFloat64Frac(c);
4248 cExp = extractFloat64Exp(c);
4249 cSign = extractFloat64Sign(c);
4250
4251 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4252 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4253
4254 /* It is implementation-defined whether the cases of (0,inf,qnan)
4255 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4256 * they return if they do), so we have to hand this information
4257 * off to the target-specific pick-a-NaN routine.
4258 */
4259 if (((aExp == 0x7ff) && aSig) ||
4260 ((bExp == 0x7ff) && bSig) ||
4261 ((cExp == 0x7ff) && cSig)) {
ff32e16e 4262 return propagateFloat64MulAddNaN(a, b, c, infzero, status);
369be8f6
PM
4263 }
4264
4265 if (infzero) {
ff32e16e 4266 float_raise(float_flag_invalid, status);
369be8f6
PM
4267 return float64_default_nan;
4268 }
4269
4270 if (flags & float_muladd_negate_c) {
4271 cSign ^= 1;
4272 }
4273
4274 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4275
4276 /* Work out the sign and type of the product */
4277 pSign = aSign ^ bSign;
4278 if (flags & float_muladd_negate_product) {
4279 pSign ^= 1;
4280 }
4281 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4282 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4283
4284 if (cExp == 0x7ff) {
4285 if (pInf && (pSign ^ cSign)) {
4286 /* addition of opposite-signed infinities => InvalidOperation */
ff32e16e 4287 float_raise(float_flag_invalid, status);
369be8f6
PM
4288 return float64_default_nan;
4289 }
4290 /* Otherwise generate an infinity of the same sign */
4291 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4292 }
4293
4294 if (pInf) {
4295 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4296 }
4297
4298 if (pZero) {
4299 if (cExp == 0) {
4300 if (cSig == 0) {
4301 /* Adding two exact zeroes */
4302 if (pSign == cSign) {
4303 zSign = pSign;
a2f2d288 4304 } else if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
4305 zSign = 1;
4306 } else {
4307 zSign = 0;
4308 }
4309 return packFloat64(zSign ^ signflip, 0, 0);
4310 }
4311 /* Exact zero plus a denorm */
a2f2d288 4312 if (status->flush_to_zero) {
ff32e16e 4313 float_raise(float_flag_output_denormal, status);
369be8f6
PM
4314 return packFloat64(cSign ^ signflip, 0, 0);
4315 }
4316 }
4317 /* Zero plus something non-zero : just return the something */
67d43538
PM
4318 if (flags & float_muladd_halve_result) {
4319 if (cExp == 0) {
4320 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4321 }
4322 /* Subtract one to halve, and one again because roundAndPackFloat64
4323 * wants one less than the true exponent.
4324 */
4325 cExp -= 2;
4326 cSig = (cSig | 0x0010000000000000ULL) << 10;
ff32e16e 4327 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
67d43538 4328 }
a6e7c184 4329 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
4330 }
4331
4332 if (aExp == 0) {
4333 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4334 }
4335 if (bExp == 0) {
4336 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4337 }
4338
4339 /* Calculate the actual result a * b + c */
4340
4341 /* Multiply first; this is easy. */
4342 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4343 * because we want the true exponent, not the "one-less-than"
4344 * flavour that roundAndPackFloat64() takes.
4345 */
4346 pExp = aExp + bExp - 0x3fe;
4347 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4348 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4349 mul64To128(aSig, bSig, &pSig0, &pSig1);
4350 if ((int64_t)(pSig0 << 1) >= 0) {
4351 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4352 pExp--;
4353 }
4354
4355 zSign = pSign ^ signflip;
4356
4357 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4358 * bit in position 126.
4359 */
4360 if (cExp == 0) {
4361 if (!cSig) {
4362 /* Throw out the special case of c being an exact zero now */
4363 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
67d43538
PM
4364 if (flags & float_muladd_halve_result) {
4365 pExp--;
4366 }
369be8f6 4367 return roundAndPackFloat64(zSign, pExp - 1,
ff32e16e 4368 pSig1, status);
369be8f6
PM
4369 }
4370 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4371 }
4372
4373 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4374 * significand of the addend, with the explicit bit in position 126.
4375 */
4376 cSig0 = cSig << (126 - 64 - 52);
4377 cSig1 = 0;
4378 cSig0 |= LIT64(0x4000000000000000);
4379 expDiff = pExp - cExp;
4380
4381 if (pSign == cSign) {
4382 /* Addition */
4383 if (expDiff > 0) {
4384 /* scale c to match p */
4385 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4386 zExp = pExp;
4387 } else if (expDiff < 0) {
4388 /* scale p to match c */
4389 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4390 zExp = cExp;
4391 } else {
4392 /* no scaling needed */
4393 zExp = cExp;
4394 }
4395 /* Add significands and make sure explicit bit ends up in posn 126 */
4396 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4397 if ((int64_t)zSig0 < 0) {
4398 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4399 } else {
4400 zExp--;
4401 }
4402 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
67d43538
PM
4403 if (flags & float_muladd_halve_result) {
4404 zExp--;
4405 }
ff32e16e 4406 return roundAndPackFloat64(zSign, zExp, zSig1, status);
369be8f6
PM
4407 } else {
4408 /* Subtraction */
4409 if (expDiff > 0) {
4410 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4411 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4412 zExp = pExp;
4413 } else if (expDiff < 0) {
4414 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4415 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4416 zExp = cExp;
4417 zSign ^= 1;
4418 } else {
4419 zExp = pExp;
4420 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4421 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4422 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4423 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4424 zSign ^= 1;
4425 } else {
4426 /* Exact zero */
4427 zSign = signflip;
a2f2d288 4428 if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
4429 zSign ^= 1;
4430 }
4431 return packFloat64(zSign, 0, 0);
4432 }
4433 }
4434 --zExp;
4435 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4436 * starting with the significand in a pair of uint64_t.
4437 */
4438 if (zSig0) {
4439 shiftcount = countLeadingZeros64(zSig0) - 1;
4440 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4441 if (zSig1) {
4442 zSig0 |= 1;
4443 }
4444 zExp -= shiftcount;
4445 } else {
e3d142d0
PM
4446 shiftcount = countLeadingZeros64(zSig1);
4447 if (shiftcount == 0) {
4448 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4449 zExp -= 63;
4450 } else {
4451 shiftcount--;
4452 zSig0 = zSig1 << shiftcount;
4453 zExp -= (shiftcount + 64);
4454 }
369be8f6 4455 }
67d43538
PM
4456 if (flags & float_muladd_halve_result) {
4457 zExp--;
4458 }
ff32e16e 4459 return roundAndPackFloat64(zSign, zExp, zSig0, status);
369be8f6
PM
4460 }
4461}
4462
158142c2
FB
4463/*----------------------------------------------------------------------------
4464| Returns the square root of the double-precision floating-point value `a'.
4465| The operation is performed according to the IEC/IEEE Standard for Binary
4466| Floating-Point Arithmetic.
4467*----------------------------------------------------------------------------*/
4468
e5a41ffa 4469float64 float64_sqrt(float64 a, float_status *status)
158142c2
FB
4470{
4471 flag aSign;
94a49d86 4472 int_fast16_t aExp, zExp;
bb98fe42
AF
4473 uint64_t aSig, zSig, doubleZSig;
4474 uint64_t rem0, rem1, term0, term1;
ff32e16e 4475 a = float64_squash_input_denormal(a, status);
158142c2
FB
4476
4477 aSig = extractFloat64Frac( a );
4478 aExp = extractFloat64Exp( a );
4479 aSign = extractFloat64Sign( a );
4480 if ( aExp == 0x7FF ) {
ff32e16e
PM
4481 if (aSig) {
4482 return propagateFloat64NaN(a, a, status);
4483 }
158142c2 4484 if ( ! aSign ) return a;
ff32e16e 4485 float_raise(float_flag_invalid, status);
158142c2
FB
4486 return float64_default_nan;
4487 }
4488 if ( aSign ) {
4489 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 4490 float_raise(float_flag_invalid, status);
158142c2
FB
4491 return float64_default_nan;
4492 }
4493 if ( aExp == 0 ) {
f090c9d4 4494 if ( aSig == 0 ) return float64_zero;
158142c2
FB
4495 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4496 }
4497 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4498 aSig |= LIT64( 0x0010000000000000 );
4499 zSig = estimateSqrt32( aExp, aSig>>21 );
4500 aSig <<= 9 - ( aExp & 1 );
4501 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4502 if ( ( zSig & 0x1FF ) <= 5 ) {
4503 doubleZSig = zSig<<1;
4504 mul64To128( zSig, zSig, &term0, &term1 );
4505 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4506 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4507 --zSig;
4508 doubleZSig -= 2;
4509 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4510 }
4511 zSig |= ( ( rem0 | rem1 ) != 0 );
4512 }
ff32e16e 4513 return roundAndPackFloat64(0, zExp, zSig, status);
158142c2
FB
4514
4515}
4516
374dfc33
AJ
4517/*----------------------------------------------------------------------------
4518| Returns the binary log of the double-precision floating-point value `a'.
4519| The operation is performed according to the IEC/IEEE Standard for Binary
4520| Floating-Point Arithmetic.
4521*----------------------------------------------------------------------------*/
e5a41ffa 4522float64 float64_log2(float64 a, float_status *status)
374dfc33
AJ
4523{
4524 flag aSign, zSign;
94a49d86 4525 int_fast16_t aExp;
bb98fe42 4526 uint64_t aSig, aSig0, aSig1, zSig, i;
ff32e16e 4527 a = float64_squash_input_denormal(a, status);
374dfc33
AJ
4528
4529 aSig = extractFloat64Frac( a );
4530 aExp = extractFloat64Exp( a );
4531 aSign = extractFloat64Sign( a );
4532
4533 if ( aExp == 0 ) {
4534 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4535 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4536 }
4537 if ( aSign ) {
ff32e16e 4538 float_raise(float_flag_invalid, status);
374dfc33
AJ
4539 return float64_default_nan;
4540 }
4541 if ( aExp == 0x7FF ) {
ff32e16e
PM
4542 if (aSig) {
4543 return propagateFloat64NaN(a, float64_zero, status);
4544 }
374dfc33
AJ
4545 return a;
4546 }
4547
4548 aExp -= 0x3FF;
4549 aSig |= LIT64( 0x0010000000000000 );
4550 zSign = aExp < 0;
bb98fe42 4551 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4552 for (i = 1LL << 51; i > 0; i >>= 1) {
4553 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4554 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4555 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4556 aSig >>= 1;
4557 zSig |= i;
4558 }
4559 }
4560
4561 if ( zSign )
4562 zSig = -zSig;
ff32e16e 4563 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
374dfc33
AJ
4564}
4565
158142c2
FB
4566/*----------------------------------------------------------------------------
4567| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4568| corresponding value `b', and 0 otherwise. The invalid exception is raised
4569| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4570| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4571*----------------------------------------------------------------------------*/
4572
e5a41ffa 4573int float64_eq(float64 a, float64 b, float_status *status)
158142c2 4574{
bb98fe42 4575 uint64_t av, bv;
ff32e16e
PM
4576 a = float64_squash_input_denormal(a, status);
4577 b = float64_squash_input_denormal(b, status);
158142c2
FB
4578
4579 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4580 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4581 ) {
ff32e16e 4582 float_raise(float_flag_invalid, status);
158142c2
FB
4583 return 0;
4584 }
f090c9d4 4585 av = float64_val(a);
a1b91bb4 4586 bv = float64_val(b);
bb98fe42 4587 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4588
4589}
4590
4591/*----------------------------------------------------------------------------
4592| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4593| equal to the corresponding value `b', and 0 otherwise. The invalid
4594| exception is raised if either operand is a NaN. The comparison is performed
4595| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4596*----------------------------------------------------------------------------*/
4597
e5a41ffa 4598int float64_le(float64 a, float64 b, float_status *status)
158142c2
FB
4599{
4600 flag aSign, bSign;
bb98fe42 4601 uint64_t av, bv;
ff32e16e
PM
4602 a = float64_squash_input_denormal(a, status);
4603 b = float64_squash_input_denormal(b, status);
158142c2
FB
4604
4605 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4606 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4607 ) {
ff32e16e 4608 float_raise(float_flag_invalid, status);
158142c2
FB
4609 return 0;
4610 }
4611 aSign = extractFloat64Sign( a );
4612 bSign = extractFloat64Sign( b );
f090c9d4 4613 av = float64_val(a);
a1b91bb4 4614 bv = float64_val(b);
bb98fe42 4615 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4616 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4617
4618}
4619
4620/*----------------------------------------------------------------------------
4621| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4622| the corresponding value `b', and 0 otherwise. The invalid exception is
4623| raised if either operand is a NaN. The comparison is performed according
4624| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4625*----------------------------------------------------------------------------*/
4626
e5a41ffa 4627int float64_lt(float64 a, float64 b, float_status *status)
158142c2
FB
4628{
4629 flag aSign, bSign;
bb98fe42 4630 uint64_t av, bv;
158142c2 4631
ff32e16e
PM
4632 a = float64_squash_input_denormal(a, status);
4633 b = float64_squash_input_denormal(b, status);
158142c2
FB
4634 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4635 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4636 ) {
ff32e16e 4637 float_raise(float_flag_invalid, status);
158142c2
FB
4638 return 0;
4639 }
4640 aSign = extractFloat64Sign( a );
4641 bSign = extractFloat64Sign( b );
f090c9d4 4642 av = float64_val(a);
a1b91bb4 4643 bv = float64_val(b);
bb98fe42 4644 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4645 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4646
4647}
4648
67b7861d
AJ
4649/*----------------------------------------------------------------------------
4650| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4651| be compared, and 0 otherwise. The invalid exception is raised if either
4652| operand is a NaN. The comparison is performed according to the IEC/IEEE
4653| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4654*----------------------------------------------------------------------------*/
4655
e5a41ffa 4656int float64_unordered(float64 a, float64 b, float_status *status)
67b7861d 4657{
ff32e16e
PM
4658 a = float64_squash_input_denormal(a, status);
4659 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4660
4661 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4662 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4663 ) {
ff32e16e 4664 float_raise(float_flag_invalid, status);
67b7861d
AJ
4665 return 1;
4666 }
4667 return 0;
4668}
4669
158142c2
FB
4670/*----------------------------------------------------------------------------
4671| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4672| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4673| exception.The comparison is performed according to the IEC/IEEE Standard
4674| for Binary Floating-Point Arithmetic.
158142c2
FB
4675*----------------------------------------------------------------------------*/
4676
e5a41ffa 4677int float64_eq_quiet(float64 a, float64 b, float_status *status)
158142c2 4678{
bb98fe42 4679 uint64_t av, bv;
ff32e16e
PM
4680 a = float64_squash_input_denormal(a, status);
4681 b = float64_squash_input_denormal(b, status);
158142c2
FB
4682
4683 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4684 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4685 ) {
b689362d 4686 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4687 float_raise(float_flag_invalid, status);
b689362d 4688 }
158142c2
FB
4689 return 0;
4690 }
f090c9d4 4691 av = float64_val(a);
a1b91bb4 4692 bv = float64_val(b);
bb98fe42 4693 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4694
4695}
4696
4697/*----------------------------------------------------------------------------
4698| Returns 1 if the double-precision floating-point value `a' is less than or
4699| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4700| cause an exception. Otherwise, the comparison is performed according to the
4701| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4702*----------------------------------------------------------------------------*/
4703
e5a41ffa 4704int float64_le_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4705{
4706 flag aSign, bSign;
bb98fe42 4707 uint64_t av, bv;
ff32e16e
PM
4708 a = float64_squash_input_denormal(a, status);
4709 b = float64_squash_input_denormal(b, status);
158142c2
FB
4710
4711 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4712 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4713 ) {
4714 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4715 float_raise(float_flag_invalid, status);
158142c2
FB
4716 }
4717 return 0;
4718 }
4719 aSign = extractFloat64Sign( a );
4720 bSign = extractFloat64Sign( b );
f090c9d4 4721 av = float64_val(a);
a1b91bb4 4722 bv = float64_val(b);
bb98fe42 4723 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4724 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4725
4726}
4727
4728/*----------------------------------------------------------------------------
4729| Returns 1 if the double-precision floating-point value `a' is less than
4730| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4731| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4732| Standard for Binary Floating-Point Arithmetic.
4733*----------------------------------------------------------------------------*/
4734
e5a41ffa 4735int float64_lt_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4736{
4737 flag aSign, bSign;
bb98fe42 4738 uint64_t av, bv;
ff32e16e
PM
4739 a = float64_squash_input_denormal(a, status);
4740 b = float64_squash_input_denormal(b, status);
158142c2
FB
4741
4742 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4743 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4744 ) {
4745 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4746 float_raise(float_flag_invalid, status);
158142c2
FB
4747 }
4748 return 0;
4749 }
4750 aSign = extractFloat64Sign( a );
4751 bSign = extractFloat64Sign( b );
f090c9d4 4752 av = float64_val(a);
a1b91bb4 4753 bv = float64_val(b);
bb98fe42 4754 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4755 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4756
4757}
4758
67b7861d
AJ
4759/*----------------------------------------------------------------------------
4760| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4761| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4762| comparison is performed according to the IEC/IEEE Standard for Binary
4763| Floating-Point Arithmetic.
4764*----------------------------------------------------------------------------*/
4765
e5a41ffa 4766int float64_unordered_quiet(float64 a, float64 b, float_status *status)
67b7861d 4767{
ff32e16e
PM
4768 a = float64_squash_input_denormal(a, status);
4769 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4770
4771 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4772 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4773 ) {
4774 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
ff32e16e 4775 float_raise(float_flag_invalid, status);
67b7861d
AJ
4776 }
4777 return 1;
4778 }
4779 return 0;
4780}
4781
158142c2
FB
4782/*----------------------------------------------------------------------------
4783| Returns the result of converting the extended double-precision floating-
4784| point value `a' to the 32-bit two's complement integer format. The
4785| conversion is performed according to the IEC/IEEE Standard for Binary
4786| Floating-Point Arithmetic---which means in particular that the conversion
4787| is rounded according to the current rounding mode. If `a' is a NaN, the
4788| largest positive integer is returned. Otherwise, if the conversion
4789| overflows, the largest integer with the same sign as `a' is returned.
4790*----------------------------------------------------------------------------*/
4791
f4014512 4792int32_t floatx80_to_int32(floatx80 a, float_status *status)
158142c2
FB
4793{
4794 flag aSign;
f4014512 4795 int32_t aExp, shiftCount;
bb98fe42 4796 uint64_t aSig;
158142c2
FB
4797
4798 aSig = extractFloatx80Frac( a );
4799 aExp = extractFloatx80Exp( a );
4800 aSign = extractFloatx80Sign( a );
bb98fe42 4801 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4802 shiftCount = 0x4037 - aExp;
4803 if ( shiftCount <= 0 ) shiftCount = 1;
4804 shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 4805 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
4806
4807}
4808
4809/*----------------------------------------------------------------------------
4810| Returns the result of converting the extended double-precision floating-
4811| point value `a' to the 32-bit two's complement integer format. The
4812| conversion is performed according to the IEC/IEEE Standard for Binary
4813| Floating-Point Arithmetic, except that the conversion is always rounded
4814| toward zero. If `a' is a NaN, the largest positive integer is returned.
4815| Otherwise, if the conversion overflows, the largest integer with the same
4816| sign as `a' is returned.
4817*----------------------------------------------------------------------------*/
4818
f4014512 4819int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4820{
4821 flag aSign;
f4014512 4822 int32_t aExp, shiftCount;
bb98fe42 4823 uint64_t aSig, savedASig;
b3a6a2e0 4824 int32_t z;
158142c2
FB
4825
4826 aSig = extractFloatx80Frac( a );
4827 aExp = extractFloatx80Exp( a );
4828 aSign = extractFloatx80Sign( a );
4829 if ( 0x401E < aExp ) {
bb98fe42 4830 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4831 goto invalid;
4832 }
4833 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4834 if (aExp || aSig) {
4835 status->float_exception_flags |= float_flag_inexact;
4836 }
158142c2
FB
4837 return 0;
4838 }
4839 shiftCount = 0x403E - aExp;
4840 savedASig = aSig;
4841 aSig >>= shiftCount;
4842 z = aSig;
4843 if ( aSign ) z = - z;
4844 if ( ( z < 0 ) ^ aSign ) {
4845 invalid:
ff32e16e 4846 float_raise(float_flag_invalid, status);
bb98fe42 4847 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4848 }
4849 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 4850 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4851 }
4852 return z;
4853
4854}
4855
4856/*----------------------------------------------------------------------------
4857| Returns the result of converting the extended double-precision floating-
4858| point value `a' to the 64-bit two's complement integer format. The
4859| conversion is performed according to the IEC/IEEE Standard for Binary
4860| Floating-Point Arithmetic---which means in particular that the conversion
4861| is rounded according to the current rounding mode. If `a' is a NaN,
4862| the largest positive integer is returned. Otherwise, if the conversion
4863| overflows, the largest integer with the same sign as `a' is returned.
4864*----------------------------------------------------------------------------*/
4865
f42c2224 4866int64_t floatx80_to_int64(floatx80 a, float_status *status)
158142c2
FB
4867{
4868 flag aSign;
f4014512 4869 int32_t aExp, shiftCount;
bb98fe42 4870 uint64_t aSig, aSigExtra;
158142c2
FB
4871
4872 aSig = extractFloatx80Frac( a );
4873 aExp = extractFloatx80Exp( a );
4874 aSign = extractFloatx80Sign( a );
4875 shiftCount = 0x403E - aExp;
4876 if ( shiftCount <= 0 ) {
4877 if ( shiftCount ) {
ff32e16e 4878 float_raise(float_flag_invalid, status);
158142c2
FB
4879 if ( ! aSign
4880 || ( ( aExp == 0x7FFF )
4881 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4882 ) {
4883 return LIT64( 0x7FFFFFFFFFFFFFFF );
4884 }
bb98fe42 4885 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4886 }
4887 aSigExtra = 0;
4888 }
4889 else {
4890 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4891 }
ff32e16e 4892 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
4893
4894}
4895
4896/*----------------------------------------------------------------------------
4897| Returns the result of converting the extended double-precision floating-
4898| point value `a' to the 64-bit two's complement integer format. The
4899| conversion is performed according to the IEC/IEEE Standard for Binary
4900| Floating-Point Arithmetic, except that the conversion is always rounded
4901| toward zero. If `a' is a NaN, the largest positive integer is returned.
4902| Otherwise, if the conversion overflows, the largest integer with the same
4903| sign as `a' is returned.
4904*----------------------------------------------------------------------------*/
4905
f42c2224 4906int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4907{
4908 flag aSign;
f4014512 4909 int32_t aExp, shiftCount;
bb98fe42 4910 uint64_t aSig;
f42c2224 4911 int64_t z;
158142c2
FB
4912
4913 aSig = extractFloatx80Frac( a );
4914 aExp = extractFloatx80Exp( a );
4915 aSign = extractFloatx80Sign( a );
4916 shiftCount = aExp - 0x403E;
4917 if ( 0 <= shiftCount ) {
4918 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4919 if ( ( a.high != 0xC03E ) || aSig ) {
ff32e16e 4920 float_raise(float_flag_invalid, status);
158142c2
FB
4921 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4922 return LIT64( 0x7FFFFFFFFFFFFFFF );
4923 }
4924 }
bb98fe42 4925 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4926 }
4927 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4928 if (aExp | aSig) {
4929 status->float_exception_flags |= float_flag_inexact;
4930 }
158142c2
FB
4931 return 0;
4932 }
4933 z = aSig>>( - shiftCount );
bb98fe42 4934 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 4935 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4936 }
4937 if ( aSign ) z = - z;
4938 return z;
4939
4940}
4941
4942/*----------------------------------------------------------------------------
4943| Returns the result of converting the extended double-precision floating-
4944| point value `a' to the single-precision floating-point format. The
4945| conversion is performed according to the IEC/IEEE Standard for Binary
4946| Floating-Point Arithmetic.
4947*----------------------------------------------------------------------------*/
4948
e5a41ffa 4949float32 floatx80_to_float32(floatx80 a, float_status *status)
158142c2
FB
4950{
4951 flag aSign;
f4014512 4952 int32_t aExp;
bb98fe42 4953 uint64_t aSig;
158142c2
FB
4954
4955 aSig = extractFloatx80Frac( a );
4956 aExp = extractFloatx80Exp( a );
4957 aSign = extractFloatx80Sign( a );
4958 if ( aExp == 0x7FFF ) {
bb98fe42 4959 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4960 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4961 }
4962 return packFloat32( aSign, 0xFF, 0 );
4963 }
4964 shift64RightJamming( aSig, 33, &aSig );
4965 if ( aExp || aSig ) aExp -= 0x3F81;
ff32e16e 4966 return roundAndPackFloat32(aSign, aExp, aSig, status);
158142c2
FB
4967
4968}
4969
4970/*----------------------------------------------------------------------------
4971| Returns the result of converting the extended double-precision floating-
4972| point value `a' to the double-precision floating-point format. The
4973| conversion is performed according to the IEC/IEEE Standard for Binary
4974| Floating-Point Arithmetic.
4975*----------------------------------------------------------------------------*/
4976
e5a41ffa 4977float64 floatx80_to_float64(floatx80 a, float_status *status)
158142c2
FB
4978{
4979 flag aSign;
f4014512 4980 int32_t aExp;
bb98fe42 4981 uint64_t aSig, zSig;
158142c2
FB
4982
4983 aSig = extractFloatx80Frac( a );
4984 aExp = extractFloatx80Exp( a );
4985 aSign = extractFloatx80Sign( a );
4986 if ( aExp == 0x7FFF ) {
bb98fe42 4987 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4988 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4989 }
4990 return packFloat64( aSign, 0x7FF, 0 );
4991 }
4992 shift64RightJamming( aSig, 1, &zSig );
4993 if ( aExp || aSig ) aExp -= 0x3C01;
ff32e16e 4994 return roundAndPackFloat64(aSign, aExp, zSig, status);
158142c2
FB
4995
4996}
4997
158142c2
FB
4998/*----------------------------------------------------------------------------
4999| Returns the result of converting the extended double-precision floating-
5000| point value `a' to the quadruple-precision floating-point format. The
5001| conversion is performed according to the IEC/IEEE Standard for Binary
5002| Floating-Point Arithmetic.
5003*----------------------------------------------------------------------------*/
5004
e5a41ffa 5005float128 floatx80_to_float128(floatx80 a, float_status *status)
158142c2
FB
5006{
5007 flag aSign;
94a49d86 5008 int_fast16_t aExp;
bb98fe42 5009 uint64_t aSig, zSig0, zSig1;
158142c2
FB
5010
5011 aSig = extractFloatx80Frac( a );
5012 aExp = extractFloatx80Exp( a );
5013 aSign = extractFloatx80Sign( a );
bb98fe42 5014 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
ff32e16e 5015 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
158142c2
FB
5016 }
5017 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5018 return packFloat128( aSign, aExp, zSig0, zSig1 );
5019
5020}
5021
158142c2
FB
5022/*----------------------------------------------------------------------------
5023| Rounds the extended double-precision floating-point value `a' to an integer,
5024| and returns the result as an extended quadruple-precision floating-point
5025| value. The operation is performed according to the IEC/IEEE Standard for
5026| Binary Floating-Point Arithmetic.
5027*----------------------------------------------------------------------------*/
5028
e5a41ffa 5029floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
158142c2
FB
5030{
5031 flag aSign;
f4014512 5032 int32_t aExp;
bb98fe42 5033 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5034 floatx80 z;
5035
5036 aExp = extractFloatx80Exp( a );
5037 if ( 0x403E <= aExp ) {
bb98fe42 5038 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
ff32e16e 5039 return propagateFloatx80NaN(a, a, status);
158142c2
FB
5040 }
5041 return a;
5042 }
5043 if ( aExp < 0x3FFF ) {
5044 if ( ( aExp == 0 )
bb98fe42 5045 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
5046 return a;
5047 }
a2f2d288 5048 status->float_exception_flags |= float_flag_inexact;
158142c2 5049 aSign = extractFloatx80Sign( a );
a2f2d288 5050 switch (status->float_rounding_mode) {
158142c2 5051 case float_round_nearest_even:
bb98fe42 5052 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
5053 ) {
5054 return
5055 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5056 }
5057 break;
f9288a76
PM
5058 case float_round_ties_away:
5059 if (aExp == 0x3FFE) {
5060 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5061 }
5062 break;
158142c2
FB
5063 case float_round_down:
5064 return
5065 aSign ?
5066 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5067 : packFloatx80( 0, 0, 0 );
5068 case float_round_up:
5069 return
5070 aSign ? packFloatx80( 1, 0, 0 )
5071 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5072 }
5073 return packFloatx80( aSign, 0, 0 );
5074 }
5075 lastBitMask = 1;
5076 lastBitMask <<= 0x403E - aExp;
5077 roundBitsMask = lastBitMask - 1;
5078 z = a;
a2f2d288 5079 switch (status->float_rounding_mode) {
dc355b76 5080 case float_round_nearest_even:
158142c2 5081 z.low += lastBitMask>>1;
dc355b76
PM
5082 if ((z.low & roundBitsMask) == 0) {
5083 z.low &= ~lastBitMask;
5084 }
5085 break;
f9288a76
PM
5086 case float_round_ties_away:
5087 z.low += lastBitMask >> 1;
5088 break;
dc355b76
PM
5089 case float_round_to_zero:
5090 break;
5091 case float_round_up:
5092 if (!extractFloatx80Sign(z)) {
5093 z.low += roundBitsMask;
5094 }
5095 break;
5096 case float_round_down:
5097 if (extractFloatx80Sign(z)) {
158142c2
FB
5098 z.low += roundBitsMask;
5099 }
dc355b76
PM
5100 break;
5101 default:
5102 abort();
158142c2
FB
5103 }
5104 z.low &= ~ roundBitsMask;
5105 if ( z.low == 0 ) {
5106 ++z.high;
5107 z.low = LIT64( 0x8000000000000000 );
5108 }
a2f2d288
PM
5109 if (z.low != a.low) {
5110 status->float_exception_flags |= float_flag_inexact;
5111 }
158142c2
FB
5112 return z;
5113
5114}
5115
5116/*----------------------------------------------------------------------------
5117| Returns the result of adding the absolute values of the extended double-
5118| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5119| negated before being returned. `zSign' is ignored if the result is a NaN.
5120| The addition is performed according to the IEC/IEEE Standard for Binary
5121| Floating-Point Arithmetic.
5122*----------------------------------------------------------------------------*/
5123
e5a41ffa
PM
5124static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5125 float_status *status)
158142c2 5126{
f4014512 5127 int32_t aExp, bExp, zExp;
bb98fe42 5128 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 5129 int32_t expDiff;
158142c2
FB
5130
5131 aSig = extractFloatx80Frac( a );
5132 aExp = extractFloatx80Exp( a );
5133 bSig = extractFloatx80Frac( b );
5134 bExp = extractFloatx80Exp( b );
5135 expDiff = aExp - bExp;
5136 if ( 0 < expDiff ) {
5137 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5138 if ((uint64_t)(aSig << 1)) {
5139 return propagateFloatx80NaN(a, b, status);
5140 }
158142c2
FB
5141 return a;
5142 }
5143 if ( bExp == 0 ) --expDiff;
5144 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5145 zExp = aExp;
5146 }
5147 else if ( expDiff < 0 ) {
5148 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5149 if ((uint64_t)(bSig << 1)) {
5150 return propagateFloatx80NaN(a, b, status);
5151 }
158142c2
FB
5152 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5153 }
5154 if ( aExp == 0 ) ++expDiff;
5155 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5156 zExp = bExp;
5157 }
5158 else {
5159 if ( aExp == 0x7FFF ) {
bb98fe42 5160 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5161 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5162 }
5163 return a;
5164 }
5165 zSig1 = 0;
5166 zSig0 = aSig + bSig;
5167 if ( aExp == 0 ) {
5168 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5169 goto roundAndPack;
5170 }
5171 zExp = aExp;
5172 goto shiftRight1;
5173 }
5174 zSig0 = aSig + bSig;
bb98fe42 5175 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
5176 shiftRight1:
5177 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5178 zSig0 |= LIT64( 0x8000000000000000 );
5179 ++zExp;
5180 roundAndPack:
a2f2d288 5181 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5182 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5183}
5184
5185/*----------------------------------------------------------------------------
5186| Returns the result of subtracting the absolute values of the extended
5187| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5188| difference is negated before being returned. `zSign' is ignored if the
5189| result is a NaN. The subtraction is performed according to the IEC/IEEE
5190| Standard for Binary Floating-Point Arithmetic.
5191*----------------------------------------------------------------------------*/
5192
e5a41ffa
PM
5193static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5194 float_status *status)
158142c2 5195{
f4014512 5196 int32_t aExp, bExp, zExp;
bb98fe42 5197 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 5198 int32_t expDiff;
158142c2
FB
5199 floatx80 z;
5200
5201 aSig = extractFloatx80Frac( a );
5202 aExp = extractFloatx80Exp( a );
5203 bSig = extractFloatx80Frac( b );
5204 bExp = extractFloatx80Exp( b );
5205 expDiff = aExp - bExp;
5206 if ( 0 < expDiff ) goto aExpBigger;
5207 if ( expDiff < 0 ) goto bExpBigger;
5208 if ( aExp == 0x7FFF ) {
bb98fe42 5209 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5210 return propagateFloatx80NaN(a, b, status);
158142c2 5211 }
ff32e16e 5212 float_raise(float_flag_invalid, status);
158142c2
FB
5213 z.low = floatx80_default_nan_low;
5214 z.high = floatx80_default_nan_high;
5215 return z;
5216 }
5217 if ( aExp == 0 ) {
5218 aExp = 1;
5219 bExp = 1;
5220 }
5221 zSig1 = 0;
5222 if ( bSig < aSig ) goto aBigger;
5223 if ( aSig < bSig ) goto bBigger;
a2f2d288 5224 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
5225 bExpBigger:
5226 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5227 if ((uint64_t)(bSig << 1)) {
5228 return propagateFloatx80NaN(a, b, status);
5229 }
158142c2
FB
5230 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5231 }
5232 if ( aExp == 0 ) ++expDiff;
5233 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5234 bBigger:
5235 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5236 zExp = bExp;
5237 zSign ^= 1;
5238 goto normalizeRoundAndPack;
5239 aExpBigger:
5240 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5241 if ((uint64_t)(aSig << 1)) {
5242 return propagateFloatx80NaN(a, b, status);
5243 }
158142c2
FB
5244 return a;
5245 }
5246 if ( bExp == 0 ) --expDiff;
5247 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5248 aBigger:
5249 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5250 zExp = aExp;
5251 normalizeRoundAndPack:
a2f2d288 5252 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5253 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5254}
5255
5256/*----------------------------------------------------------------------------
5257| Returns the result of adding the extended double-precision floating-point
5258| values `a' and `b'. The operation is performed according to the IEC/IEEE
5259| Standard for Binary Floating-Point Arithmetic.
5260*----------------------------------------------------------------------------*/
5261
e5a41ffa 5262floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5263{
5264 flag aSign, bSign;
5265
5266 aSign = extractFloatx80Sign( a );
5267 bSign = extractFloatx80Sign( b );
5268 if ( aSign == bSign ) {
ff32e16e 5269 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5270 }
5271 else {
ff32e16e 5272 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5273 }
5274
5275}
5276
5277/*----------------------------------------------------------------------------
5278| Returns the result of subtracting the extended double-precision floating-
5279| point values `a' and `b'. The operation is performed according to the
5280| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5281*----------------------------------------------------------------------------*/
5282
e5a41ffa 5283floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5284{
5285 flag aSign, bSign;
5286
5287 aSign = extractFloatx80Sign( a );
5288 bSign = extractFloatx80Sign( b );
5289 if ( aSign == bSign ) {
ff32e16e 5290 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5291 }
5292 else {
ff32e16e 5293 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5294 }
5295
5296}
5297
5298/*----------------------------------------------------------------------------
5299| Returns the result of multiplying the extended double-precision floating-
5300| point values `a' and `b'. The operation is performed according to the
5301| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5302*----------------------------------------------------------------------------*/
5303
e5a41ffa 5304floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5305{
5306 flag aSign, bSign, zSign;
f4014512 5307 int32_t aExp, bExp, zExp;
bb98fe42 5308 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
5309 floatx80 z;
5310
5311 aSig = extractFloatx80Frac( a );
5312 aExp = extractFloatx80Exp( a );
5313 aSign = extractFloatx80Sign( a );
5314 bSig = extractFloatx80Frac( b );
5315 bExp = extractFloatx80Exp( b );
5316 bSign = extractFloatx80Sign( b );
5317 zSign = aSign ^ bSign;
5318 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5319 if ( (uint64_t) ( aSig<<1 )
5320 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5321 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5322 }
5323 if ( ( bExp | bSig ) == 0 ) goto invalid;
5324 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5325 }
5326 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5327 if ((uint64_t)(bSig << 1)) {
5328 return propagateFloatx80NaN(a, b, status);
5329 }
158142c2
FB
5330 if ( ( aExp | aSig ) == 0 ) {
5331 invalid:
ff32e16e 5332 float_raise(float_flag_invalid, status);
158142c2
FB
5333 z.low = floatx80_default_nan_low;
5334 z.high = floatx80_default_nan_high;
5335 return z;
5336 }
5337 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5338 }
5339 if ( aExp == 0 ) {
5340 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5341 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5342 }
5343 if ( bExp == 0 ) {
5344 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5345 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5346 }
5347 zExp = aExp + bExp - 0x3FFE;
5348 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 5349 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
5350 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5351 --zExp;
5352 }
a2f2d288 5353 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5354 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5355}
5356
5357/*----------------------------------------------------------------------------
5358| Returns the result of dividing the extended double-precision floating-point
5359| value `a' by the corresponding value `b'. The operation is performed
5360| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361*----------------------------------------------------------------------------*/
5362
e5a41ffa 5363floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5364{
5365 flag aSign, bSign, zSign;
f4014512 5366 int32_t aExp, bExp, zExp;
bb98fe42
AF
5367 uint64_t aSig, bSig, zSig0, zSig1;
5368 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
5369 floatx80 z;
5370
5371 aSig = extractFloatx80Frac( a );
5372 aExp = extractFloatx80Exp( a );
5373 aSign = extractFloatx80Sign( a );
5374 bSig = extractFloatx80Frac( b );
5375 bExp = extractFloatx80Exp( b );
5376 bSign = extractFloatx80Sign( b );
5377 zSign = aSign ^ bSign;
5378 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5379 if ((uint64_t)(aSig << 1)) {
5380 return propagateFloatx80NaN(a, b, status);
5381 }
158142c2 5382 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5383 if ((uint64_t)(bSig << 1)) {
5384 return propagateFloatx80NaN(a, b, status);
5385 }
158142c2
FB
5386 goto invalid;
5387 }
5388 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5389 }
5390 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5391 if ((uint64_t)(bSig << 1)) {
5392 return propagateFloatx80NaN(a, b, status);
5393 }
158142c2
FB
5394 return packFloatx80( zSign, 0, 0 );
5395 }
5396 if ( bExp == 0 ) {
5397 if ( bSig == 0 ) {
5398 if ( ( aExp | aSig ) == 0 ) {
5399 invalid:
ff32e16e 5400 float_raise(float_flag_invalid, status);
158142c2
FB
5401 z.low = floatx80_default_nan_low;
5402 z.high = floatx80_default_nan_high;
5403 return z;
5404 }
ff32e16e 5405 float_raise(float_flag_divbyzero, status);
158142c2
FB
5406 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5407 }
5408 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5409 }
5410 if ( aExp == 0 ) {
5411 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5412 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5413 }
5414 zExp = aExp - bExp + 0x3FFE;
5415 rem1 = 0;
5416 if ( bSig <= aSig ) {
5417 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5418 ++zExp;
5419 }
5420 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5421 mul64To128( bSig, zSig0, &term0, &term1 );
5422 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 5423 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5424 --zSig0;
5425 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5426 }
5427 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 5428 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
5429 mul64To128( bSig, zSig1, &term1, &term2 );
5430 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 5431 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5432 --zSig1;
5433 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5434 }
5435 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5436 }
a2f2d288 5437 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5438 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5439}
5440
5441/*----------------------------------------------------------------------------
5442| Returns the remainder of the extended double-precision floating-point value
5443| `a' with respect to the corresponding value `b'. The operation is performed
5444| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5445*----------------------------------------------------------------------------*/
5446
e5a41ffa 5447floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
158142c2 5448{
ed086f3d 5449 flag aSign, zSign;
f4014512 5450 int32_t aExp, bExp, expDiff;
bb98fe42
AF
5451 uint64_t aSig0, aSig1, bSig;
5452 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
5453 floatx80 z;
5454
5455 aSig0 = extractFloatx80Frac( a );
5456 aExp = extractFloatx80Exp( a );
5457 aSign = extractFloatx80Sign( a );
5458 bSig = extractFloatx80Frac( b );
5459 bExp = extractFloatx80Exp( b );
158142c2 5460 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5461 if ( (uint64_t) ( aSig0<<1 )
5462 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5463 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5464 }
5465 goto invalid;
5466 }
5467 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5468 if ((uint64_t)(bSig << 1)) {
5469 return propagateFloatx80NaN(a, b, status);
5470 }
158142c2
FB
5471 return a;
5472 }
5473 if ( bExp == 0 ) {
5474 if ( bSig == 0 ) {
5475 invalid:
ff32e16e 5476 float_raise(float_flag_invalid, status);
158142c2
FB
5477 z.low = floatx80_default_nan_low;
5478 z.high = floatx80_default_nan_high;
5479 return z;
5480 }
5481 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5482 }
5483 if ( aExp == 0 ) {
bb98fe42 5484 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5485 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5486 }
5487 bSig |= LIT64( 0x8000000000000000 );
5488 zSign = aSign;
5489 expDiff = aExp - bExp;
5490 aSig1 = 0;
5491 if ( expDiff < 0 ) {
5492 if ( expDiff < -1 ) return a;
5493 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5494 expDiff = 0;
5495 }
5496 q = ( bSig <= aSig0 );
5497 if ( q ) aSig0 -= bSig;
5498 expDiff -= 64;
5499 while ( 0 < expDiff ) {
5500 q = estimateDiv128To64( aSig0, aSig1, bSig );
5501 q = ( 2 < q ) ? q - 2 : 0;
5502 mul64To128( bSig, q, &term0, &term1 );
5503 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5504 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5505 expDiff -= 62;
5506 }
5507 expDiff += 64;
5508 if ( 0 < expDiff ) {
5509 q = estimateDiv128To64( aSig0, aSig1, bSig );
5510 q = ( 2 < q ) ? q - 2 : 0;
5511 q >>= 64 - expDiff;
5512 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5513 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5514 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5515 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5516 ++q;
5517 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5518 }
5519 }
5520 else {
5521 term1 = 0;
5522 term0 = bSig;
5523 }
5524 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5525 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5526 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5527 && ( q & 1 ) )
5528 ) {
5529 aSig0 = alternateASig0;
5530 aSig1 = alternateASig1;
5531 zSign = ! zSign;
5532 }
5533 return
5534 normalizeRoundAndPackFloatx80(
ff32e16e 5535 80, zSign, bExp + expDiff, aSig0, aSig1, status);
158142c2
FB
5536
5537}
5538
5539/*----------------------------------------------------------------------------
5540| Returns the square root of the extended double-precision floating-point
5541| value `a'. The operation is performed according to the IEC/IEEE Standard
5542| for Binary Floating-Point Arithmetic.
5543*----------------------------------------------------------------------------*/
5544
e5a41ffa 5545floatx80 floatx80_sqrt(floatx80 a, float_status *status)
158142c2
FB
5546{
5547 flag aSign;
f4014512 5548 int32_t aExp, zExp;
bb98fe42
AF
5549 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5550 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5551 floatx80 z;
5552
5553 aSig0 = extractFloatx80Frac( a );
5554 aExp = extractFloatx80Exp( a );
5555 aSign = extractFloatx80Sign( a );
5556 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5557 if ((uint64_t)(aSig0 << 1)) {
5558 return propagateFloatx80NaN(a, a, status);
5559 }
158142c2
FB
5560 if ( ! aSign ) return a;
5561 goto invalid;
5562 }
5563 if ( aSign ) {
5564 if ( ( aExp | aSig0 ) == 0 ) return a;
5565 invalid:
ff32e16e 5566 float_raise(float_flag_invalid, status);
158142c2
FB
5567 z.low = floatx80_default_nan_low;
5568 z.high = floatx80_default_nan_high;
5569 return z;
5570 }
5571 if ( aExp == 0 ) {
5572 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5573 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5574 }
5575 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5576 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5577 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5578 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5579 doubleZSig0 = zSig0<<1;
5580 mul64To128( zSig0, zSig0, &term0, &term1 );
5581 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5582 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5583 --zSig0;
5584 doubleZSig0 -= 2;
5585 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5586 }
5587 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5588 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5589 if ( zSig1 == 0 ) zSig1 = 1;
5590 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5591 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5592 mul64To128( zSig1, zSig1, &term2, &term3 );
5593 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5594 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5595 --zSig1;
5596 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5597 term3 |= 1;
5598 term2 |= doubleZSig0;
5599 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5600 }
5601 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5602 }
5603 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5604 zSig0 |= doubleZSig0;
a2f2d288
PM
5605 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5606 0, zExp, zSig0, zSig1, status);
158142c2
FB
5607}
5608
5609/*----------------------------------------------------------------------------
b689362d
AJ
5610| Returns 1 if the extended double-precision floating-point value `a' is equal
5611| to the corresponding value `b', and 0 otherwise. The invalid exception is
5612| raised if either operand is a NaN. Otherwise, the comparison is performed
5613| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5614*----------------------------------------------------------------------------*/
5615
e5a41ffa 5616int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5617{
5618
5619 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5620 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5621 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5622 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5623 ) {
ff32e16e 5624 float_raise(float_flag_invalid, status);
158142c2
FB
5625 return 0;
5626 }
5627 return
5628 ( a.low == b.low )
5629 && ( ( a.high == b.high )
5630 || ( ( a.low == 0 )
bb98fe42 5631 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5632 );
5633
5634}
5635
5636/*----------------------------------------------------------------------------
5637| Returns 1 if the extended double-precision floating-point value `a' is
5638| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5639| invalid exception is raised if either operand is a NaN. The comparison is
5640| performed according to the IEC/IEEE Standard for Binary Floating-Point
5641| Arithmetic.
158142c2
FB
5642*----------------------------------------------------------------------------*/
5643
e5a41ffa 5644int floatx80_le(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5645{
5646 flag aSign, bSign;
5647
5648 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5649 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5650 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5651 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5652 ) {
ff32e16e 5653 float_raise(float_flag_invalid, status);
158142c2
FB
5654 return 0;
5655 }
5656 aSign = extractFloatx80Sign( a );
5657 bSign = extractFloatx80Sign( b );
5658 if ( aSign != bSign ) {
5659 return
5660 aSign
bb98fe42 5661 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5662 == 0 );
5663 }
5664 return
5665 aSign ? le128( b.high, b.low, a.high, a.low )
5666 : le128( a.high, a.low, b.high, b.low );
5667
5668}
5669
5670/*----------------------------------------------------------------------------
5671| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5672| less than the corresponding value `b', and 0 otherwise. The invalid
5673| exception is raised if either operand is a NaN. The comparison is performed
5674| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5675*----------------------------------------------------------------------------*/
5676
e5a41ffa 5677int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5678{
5679 flag aSign, bSign;
5680
5681 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5682 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5683 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5684 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5685 ) {
ff32e16e 5686 float_raise(float_flag_invalid, status);
158142c2
FB
5687 return 0;
5688 }
5689 aSign = extractFloatx80Sign( a );
5690 bSign = extractFloatx80Sign( b );
5691 if ( aSign != bSign ) {
5692 return
5693 aSign
bb98fe42 5694 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5695 != 0 );
5696 }
5697 return
5698 aSign ? lt128( b.high, b.low, a.high, a.low )
5699 : lt128( a.high, a.low, b.high, b.low );
5700
5701}
5702
67b7861d
AJ
5703/*----------------------------------------------------------------------------
5704| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5705| cannot be compared, and 0 otherwise. The invalid exception is raised if
5706| either operand is a NaN. The comparison is performed according to the
5707| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d 5708*----------------------------------------------------------------------------*/
e5a41ffa 5709int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
67b7861d
AJ
5710{
5711 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5712 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5713 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5714 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5715 ) {
ff32e16e 5716 float_raise(float_flag_invalid, status);
67b7861d
AJ
5717 return 1;
5718 }
5719 return 0;
5720}
5721
158142c2 5722/*----------------------------------------------------------------------------
b689362d 5723| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5724| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5725| cause an exception. The comparison is performed according to the IEC/IEEE
5726| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5727*----------------------------------------------------------------------------*/
5728
e5a41ffa 5729int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5730{
5731
5732 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5733 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5734 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5735 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5736 ) {
b689362d
AJ
5737 if ( floatx80_is_signaling_nan( a )
5738 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5739 float_raise(float_flag_invalid, status);
b689362d 5740 }
158142c2
FB
5741 return 0;
5742 }
5743 return
5744 ( a.low == b.low )
5745 && ( ( a.high == b.high )
5746 || ( ( a.low == 0 )
bb98fe42 5747 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5748 );
5749
5750}
5751
5752/*----------------------------------------------------------------------------
5753| Returns 1 if the extended double-precision floating-point value `a' is less
5754| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5755| do not cause an exception. Otherwise, the comparison is performed according
5756| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5757*----------------------------------------------------------------------------*/
5758
e5a41ffa 5759int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5760{
5761 flag aSign, bSign;
5762
5763 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5764 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5765 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5766 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5767 ) {
5768 if ( floatx80_is_signaling_nan( a )
5769 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5770 float_raise(float_flag_invalid, status);
158142c2
FB
5771 }
5772 return 0;
5773 }
5774 aSign = extractFloatx80Sign( a );
5775 bSign = extractFloatx80Sign( b );
5776 if ( aSign != bSign ) {
5777 return
5778 aSign
bb98fe42 5779 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5780 == 0 );
5781 }
5782 return
5783 aSign ? le128( b.high, b.low, a.high, a.low )
5784 : le128( a.high, a.low, b.high, b.low );
5785
5786}
5787
5788/*----------------------------------------------------------------------------
5789| Returns 1 if the extended double-precision floating-point value `a' is less
5790| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5791| an exception. Otherwise, the comparison is performed according to the
5792| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5793*----------------------------------------------------------------------------*/
5794
e5a41ffa 5795int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5796{
5797 flag aSign, bSign;
5798
5799 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5800 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5801 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5802 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5803 ) {
5804 if ( floatx80_is_signaling_nan( a )
5805 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5806 float_raise(float_flag_invalid, status);
158142c2
FB
5807 }
5808 return 0;
5809 }
5810 aSign = extractFloatx80Sign( a );
5811 bSign = extractFloatx80Sign( b );
5812 if ( aSign != bSign ) {
5813 return
5814 aSign
bb98fe42 5815 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5816 != 0 );
5817 }
5818 return
5819 aSign ? lt128( b.high, b.low, a.high, a.low )
5820 : lt128( a.high, a.low, b.high, b.low );
5821
5822}
5823
67b7861d
AJ
5824/*----------------------------------------------------------------------------
5825| Returns 1 if the extended double-precision floating-point values `a' and `b'
5826| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5827| The comparison is performed according to the IEC/IEEE Standard for Binary
5828| Floating-Point Arithmetic.
5829*----------------------------------------------------------------------------*/
e5a41ffa 5830int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
67b7861d
AJ
5831{
5832 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5833 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5834 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5835 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5836 ) {
5837 if ( floatx80_is_signaling_nan( a )
5838 || floatx80_is_signaling_nan( b ) ) {
ff32e16e 5839 float_raise(float_flag_invalid, status);
67b7861d
AJ
5840 }
5841 return 1;
5842 }
5843 return 0;
5844}
5845
158142c2
FB
5846/*----------------------------------------------------------------------------
5847| Returns the result of converting the quadruple-precision floating-point
5848| value `a' to the 32-bit two's complement integer format. The conversion
5849| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5850| Arithmetic---which means in particular that the conversion is rounded
5851| according to the current rounding mode. If `a' is a NaN, the largest
5852| positive integer is returned. Otherwise, if the conversion overflows, the
5853| largest integer with the same sign as `a' is returned.
5854*----------------------------------------------------------------------------*/
5855
f4014512 5856int32_t float128_to_int32(float128 a, float_status *status)
158142c2
FB
5857{
5858 flag aSign;
f4014512 5859 int32_t aExp, shiftCount;
bb98fe42 5860 uint64_t aSig0, aSig1;
158142c2
FB
5861
5862 aSig1 = extractFloat128Frac1( a );
5863 aSig0 = extractFloat128Frac0( a );
5864 aExp = extractFloat128Exp( a );
5865 aSign = extractFloat128Sign( a );
5866 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5867 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5868 aSig0 |= ( aSig1 != 0 );
5869 shiftCount = 0x4028 - aExp;
5870 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
ff32e16e 5871 return roundAndPackInt32(aSign, aSig0, status);
158142c2
FB
5872
5873}
5874
5875/*----------------------------------------------------------------------------
5876| Returns the result of converting the quadruple-precision floating-point
5877| value `a' to the 32-bit two's complement integer format. The conversion
5878| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5879| Arithmetic, except that the conversion is always rounded toward zero. If
5880| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5881| conversion overflows, the largest integer with the same sign as `a' is
5882| returned.
5883*----------------------------------------------------------------------------*/
5884
f4014512 5885int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
158142c2
FB
5886{
5887 flag aSign;
f4014512 5888 int32_t aExp, shiftCount;
bb98fe42 5889 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5890 int32_t z;
158142c2
FB
5891
5892 aSig1 = extractFloat128Frac1( a );
5893 aSig0 = extractFloat128Frac0( a );
5894 aExp = extractFloat128Exp( a );
5895 aSign = extractFloat128Sign( a );
5896 aSig0 |= ( aSig1 != 0 );
5897 if ( 0x401E < aExp ) {
5898 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5899 goto invalid;
5900 }
5901 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
5902 if (aExp || aSig0) {
5903 status->float_exception_flags |= float_flag_inexact;
5904 }
158142c2
FB
5905 return 0;
5906 }
5907 aSig0 |= LIT64( 0x0001000000000000 );
5908 shiftCount = 0x402F - aExp;
5909 savedASig = aSig0;
5910 aSig0 >>= shiftCount;
5911 z = aSig0;
5912 if ( aSign ) z = - z;
5913 if ( ( z < 0 ) ^ aSign ) {
5914 invalid:
ff32e16e 5915 float_raise(float_flag_invalid, status);
bb98fe42 5916 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5917 }
5918 if ( ( aSig0<<shiftCount ) != savedASig ) {
a2f2d288 5919 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5920 }
5921 return z;
5922
5923}
5924
5925/*----------------------------------------------------------------------------
5926| Returns the result of converting the quadruple-precision floating-point
5927| value `a' to the 64-bit two's complement integer format. The conversion
5928| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5929| Arithmetic---which means in particular that the conversion is rounded
5930| according to the current rounding mode. If `a' is a NaN, the largest
5931| positive integer is returned. Otherwise, if the conversion overflows, the
5932| largest integer with the same sign as `a' is returned.
5933*----------------------------------------------------------------------------*/
5934
f42c2224 5935int64_t float128_to_int64(float128 a, float_status *status)
158142c2
FB
5936{
5937 flag aSign;
f4014512 5938 int32_t aExp, shiftCount;
bb98fe42 5939 uint64_t aSig0, aSig1;
158142c2
FB
5940
5941 aSig1 = extractFloat128Frac1( a );
5942 aSig0 = extractFloat128Frac0( a );
5943 aExp = extractFloat128Exp( a );
5944 aSign = extractFloat128Sign( a );
5945 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5946 shiftCount = 0x402F - aExp;
5947 if ( shiftCount <= 0 ) {
5948 if ( 0x403E < aExp ) {
ff32e16e 5949 float_raise(float_flag_invalid, status);
158142c2
FB
5950 if ( ! aSign
5951 || ( ( aExp == 0x7FFF )
5952 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5953 )
5954 ) {
5955 return LIT64( 0x7FFFFFFFFFFFFFFF );
5956 }
bb98fe42 5957 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5958 }
5959 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5960 }
5961 else {
5962 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5963 }
ff32e16e 5964 return roundAndPackInt64(aSign, aSig0, aSig1, status);
158142c2
FB
5965
5966}
5967
5968/*----------------------------------------------------------------------------
5969| Returns the result of converting the quadruple-precision floating-point
5970| value `a' to the 64-bit two's complement integer format. The conversion
5971| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5972| Arithmetic, except that the conversion is always rounded toward zero.
5973| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5974| the conversion overflows, the largest integer with the same sign as `a' is
5975| returned.
5976*----------------------------------------------------------------------------*/
5977
f42c2224 5978int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
158142c2
FB
5979{
5980 flag aSign;
f4014512 5981 int32_t aExp, shiftCount;
bb98fe42 5982 uint64_t aSig0, aSig1;
f42c2224 5983 int64_t z;
158142c2
FB
5984
5985 aSig1 = extractFloat128Frac1( a );
5986 aSig0 = extractFloat128Frac0( a );
5987 aExp = extractFloat128Exp( a );
5988 aSign = extractFloat128Sign( a );
5989 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5990 shiftCount = aExp - 0x402F;
5991 if ( 0 < shiftCount ) {
5992 if ( 0x403E <= aExp ) {
5993 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5994 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5995 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
a2f2d288
PM
5996 if (aSig1) {
5997 status->float_exception_flags |= float_flag_inexact;
5998 }
158142c2
FB
5999 }
6000 else {
ff32e16e 6001 float_raise(float_flag_invalid, status);
158142c2
FB
6002 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6003 return LIT64( 0x7FFFFFFFFFFFFFFF );
6004 }
6005 }
bb98fe42 6006 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
6007 }
6008 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 6009 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
a2f2d288 6010 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6011 }
6012 }
6013 else {
6014 if ( aExp < 0x3FFF ) {
6015 if ( aExp | aSig0 | aSig1 ) {
a2f2d288 6016 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6017 }
6018 return 0;
6019 }
6020 z = aSig0>>( - shiftCount );
6021 if ( aSig1
bb98fe42 6022 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
a2f2d288 6023 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6024 }
6025 }
6026 if ( aSign ) z = - z;
6027 return z;
6028
6029}
6030
6031/*----------------------------------------------------------------------------
6032| Returns the result of converting the quadruple-precision floating-point
6033| value `a' to the single-precision floating-point format. The conversion
6034| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6035| Arithmetic.
6036*----------------------------------------------------------------------------*/
6037
e5a41ffa 6038float32 float128_to_float32(float128 a, float_status *status)
158142c2
FB
6039{
6040 flag aSign;
f4014512 6041 int32_t aExp;
bb98fe42
AF
6042 uint64_t aSig0, aSig1;
6043 uint32_t zSig;
158142c2
FB
6044
6045 aSig1 = extractFloat128Frac1( a );
6046 aSig0 = extractFloat128Frac0( a );
6047 aExp = extractFloat128Exp( a );
6048 aSign = extractFloat128Sign( a );
6049 if ( aExp == 0x7FFF ) {
6050 if ( aSig0 | aSig1 ) {
ff32e16e 6051 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
158142c2
FB
6052 }
6053 return packFloat32( aSign, 0xFF, 0 );
6054 }
6055 aSig0 |= ( aSig1 != 0 );
6056 shift64RightJamming( aSig0, 18, &aSig0 );
6057 zSig = aSig0;
6058 if ( aExp || zSig ) {
6059 zSig |= 0x40000000;
6060 aExp -= 0x3F81;
6061 }
ff32e16e 6062 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
6063
6064}
6065
6066/*----------------------------------------------------------------------------
6067| Returns the result of converting the quadruple-precision floating-point
6068| value `a' to the double-precision floating-point format. The conversion
6069| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6070| Arithmetic.
6071*----------------------------------------------------------------------------*/
6072
e5a41ffa 6073float64 float128_to_float64(float128 a, float_status *status)
158142c2
FB
6074{
6075 flag aSign;
f4014512 6076 int32_t aExp;
bb98fe42 6077 uint64_t aSig0, aSig1;
158142c2
FB
6078
6079 aSig1 = extractFloat128Frac1( a );
6080 aSig0 = extractFloat128Frac0( a );
6081 aExp = extractFloat128Exp( a );
6082 aSign = extractFloat128Sign( a );
6083 if ( aExp == 0x7FFF ) {
6084 if ( aSig0 | aSig1 ) {
ff32e16e 6085 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
158142c2
FB
6086 }
6087 return packFloat64( aSign, 0x7FF, 0 );
6088 }
6089 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6090 aSig0 |= ( aSig1 != 0 );
6091 if ( aExp || aSig0 ) {
6092 aSig0 |= LIT64( 0x4000000000000000 );
6093 aExp -= 0x3C01;
6094 }
ff32e16e 6095 return roundAndPackFloat64(aSign, aExp, aSig0, status);
158142c2
FB
6096
6097}
6098
158142c2
FB
6099/*----------------------------------------------------------------------------
6100| Returns the result of converting the quadruple-precision floating-point
6101| value `a' to the extended double-precision floating-point format. The
6102| conversion is performed according to the IEC/IEEE Standard for Binary
6103| Floating-Point Arithmetic.
6104*----------------------------------------------------------------------------*/
6105
e5a41ffa 6106floatx80 float128_to_floatx80(float128 a, float_status *status)
158142c2
FB
6107{
6108 flag aSign;
f4014512 6109 int32_t aExp;
bb98fe42 6110 uint64_t aSig0, aSig1;
158142c2
FB
6111
6112 aSig1 = extractFloat128Frac1( a );
6113 aSig0 = extractFloat128Frac0( a );
6114 aExp = extractFloat128Exp( a );
6115 aSign = extractFloat128Sign( a );
6116 if ( aExp == 0x7FFF ) {
6117 if ( aSig0 | aSig1 ) {
ff32e16e 6118 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
158142c2
FB
6119 }
6120 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6121 }
6122 if ( aExp == 0 ) {
6123 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6124 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6125 }
6126 else {
6127 aSig0 |= LIT64( 0x0001000000000000 );
6128 }
6129 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
ff32e16e 6130 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
158142c2
FB
6131
6132}
6133
158142c2
FB
6134/*----------------------------------------------------------------------------
6135| Rounds the quadruple-precision floating-point value `a' to an integer, and
6136| returns the result as a quadruple-precision floating-point value. The
6137| operation is performed according to the IEC/IEEE Standard for Binary
6138| Floating-Point Arithmetic.
6139*----------------------------------------------------------------------------*/
6140
e5a41ffa 6141float128 float128_round_to_int(float128 a, float_status *status)
158142c2
FB
6142{
6143 flag aSign;
f4014512 6144 int32_t aExp;
bb98fe42 6145 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
6146 float128 z;
6147
6148 aExp = extractFloat128Exp( a );
6149 if ( 0x402F <= aExp ) {
6150 if ( 0x406F <= aExp ) {
6151 if ( ( aExp == 0x7FFF )
6152 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6153 ) {
ff32e16e 6154 return propagateFloat128NaN(a, a, status);
158142c2
FB
6155 }
6156 return a;
6157 }
6158 lastBitMask = 1;
6159 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6160 roundBitsMask = lastBitMask - 1;
6161 z = a;
a2f2d288 6162 switch (status->float_rounding_mode) {
dc355b76 6163 case float_round_nearest_even:
158142c2
FB
6164 if ( lastBitMask ) {
6165 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6166 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6167 }
6168 else {
bb98fe42 6169 if ( (int64_t) z.low < 0 ) {
158142c2 6170 ++z.high;
bb98fe42 6171 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
6172 }
6173 }
dc355b76 6174 break;
f9288a76
PM
6175 case float_round_ties_away:
6176 if (lastBitMask) {
6177 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6178 } else {
6179 if ((int64_t) z.low < 0) {
6180 ++z.high;
6181 }
6182 }
6183 break;
dc355b76
PM
6184 case float_round_to_zero:
6185 break;
6186 case float_round_up:
6187 if (!extractFloat128Sign(z)) {
6188 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6189 }
6190 break;
6191 case float_round_down:
6192 if (extractFloat128Sign(z)) {
6193 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 6194 }
dc355b76
PM
6195 break;
6196 default:
6197 abort();
158142c2
FB
6198 }
6199 z.low &= ~ roundBitsMask;
6200 }
6201 else {
6202 if ( aExp < 0x3FFF ) {
bb98fe42 6203 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
a2f2d288 6204 status->float_exception_flags |= float_flag_inexact;
158142c2 6205 aSign = extractFloat128Sign( a );
a2f2d288 6206 switch (status->float_rounding_mode) {
158142c2
FB
6207 case float_round_nearest_even:
6208 if ( ( aExp == 0x3FFE )
6209 && ( extractFloat128Frac0( a )
6210 | extractFloat128Frac1( a ) )
6211 ) {
6212 return packFloat128( aSign, 0x3FFF, 0, 0 );
6213 }
6214 break;
f9288a76
PM
6215 case float_round_ties_away:
6216 if (aExp == 0x3FFE) {
6217 return packFloat128(aSign, 0x3FFF, 0, 0);
6218 }
6219 break;
158142c2
FB
6220 case float_round_down:
6221 return
6222 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6223 : packFloat128( 0, 0, 0, 0 );
6224 case float_round_up:
6225 return
6226 aSign ? packFloat128( 1, 0, 0, 0 )
6227 : packFloat128( 0, 0x3FFF, 0, 0 );
6228 }
6229 return packFloat128( aSign, 0, 0, 0 );
6230 }
6231 lastBitMask = 1;
6232 lastBitMask <<= 0x402F - aExp;
6233 roundBitsMask = lastBitMask - 1;
6234 z.low = 0;
6235 z.high = a.high;
a2f2d288 6236 switch (status->float_rounding_mode) {
dc355b76 6237 case float_round_nearest_even:
158142c2
FB
6238 z.high += lastBitMask>>1;
6239 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6240 z.high &= ~ lastBitMask;
6241 }
dc355b76 6242 break;
f9288a76
PM
6243 case float_round_ties_away:
6244 z.high += lastBitMask>>1;
6245 break;
dc355b76
PM
6246 case float_round_to_zero:
6247 break;
6248 case float_round_up:
6249 if (!extractFloat128Sign(z)) {
158142c2
FB
6250 z.high |= ( a.low != 0 );
6251 z.high += roundBitsMask;
6252 }
dc355b76
PM
6253 break;
6254 case float_round_down:
6255 if (extractFloat128Sign(z)) {
6256 z.high |= (a.low != 0);
6257 z.high += roundBitsMask;
6258 }
6259 break;
6260 default:
6261 abort();
158142c2
FB
6262 }
6263 z.high &= ~ roundBitsMask;
6264 }
6265 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
a2f2d288 6266 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6267 }
6268 return z;
6269
6270}
6271
6272/*----------------------------------------------------------------------------
6273| Returns the result of adding the absolute values of the quadruple-precision
6274| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6275| before being returned. `zSign' is ignored if the result is a NaN.
6276| The addition is performed according to the IEC/IEEE Standard for Binary
6277| Floating-Point Arithmetic.
6278*----------------------------------------------------------------------------*/
6279
e5a41ffa
PM
6280static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6281 float_status *status)
158142c2 6282{
f4014512 6283 int32_t aExp, bExp, zExp;
bb98fe42 6284 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
f4014512 6285 int32_t expDiff;
158142c2
FB
6286
6287 aSig1 = extractFloat128Frac1( a );
6288 aSig0 = extractFloat128Frac0( a );
6289 aExp = extractFloat128Exp( a );
6290 bSig1 = extractFloat128Frac1( b );
6291 bSig0 = extractFloat128Frac0( b );
6292 bExp = extractFloat128Exp( b );
6293 expDiff = aExp - bExp;
6294 if ( 0 < expDiff ) {
6295 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6296 if (aSig0 | aSig1) {
6297 return propagateFloat128NaN(a, b, status);
6298 }
158142c2
FB
6299 return a;
6300 }
6301 if ( bExp == 0 ) {
6302 --expDiff;
6303 }
6304 else {
6305 bSig0 |= LIT64( 0x0001000000000000 );
6306 }
6307 shift128ExtraRightJamming(
6308 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6309 zExp = aExp;
6310 }
6311 else if ( expDiff < 0 ) {
6312 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6313 if (bSig0 | bSig1) {
6314 return propagateFloat128NaN(a, b, status);
6315 }
158142c2
FB
6316 return packFloat128( zSign, 0x7FFF, 0, 0 );
6317 }
6318 if ( aExp == 0 ) {
6319 ++expDiff;
6320 }
6321 else {
6322 aSig0 |= LIT64( 0x0001000000000000 );
6323 }
6324 shift128ExtraRightJamming(
6325 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6326 zExp = bExp;
6327 }
6328 else {
6329 if ( aExp == 0x7FFF ) {
6330 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6331 return propagateFloat128NaN(a, b, status);
158142c2
FB
6332 }
6333 return a;
6334 }
6335 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 6336 if ( aExp == 0 ) {
a2f2d288 6337 if (status->flush_to_zero) {
e6afc87f 6338 if (zSig0 | zSig1) {
ff32e16e 6339 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
6340 }
6341 return packFloat128(zSign, 0, 0, 0);
6342 }
fe76d976
PB
6343 return packFloat128( zSign, 0, zSig0, zSig1 );
6344 }
158142c2
FB
6345 zSig2 = 0;
6346 zSig0 |= LIT64( 0x0002000000000000 );
6347 zExp = aExp;
6348 goto shiftRight1;
6349 }
6350 aSig0 |= LIT64( 0x0001000000000000 );
6351 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6352 --zExp;
6353 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6354 ++zExp;
6355 shiftRight1:
6356 shift128ExtraRightJamming(
6357 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6358 roundAndPack:
ff32e16e 6359 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6360
6361}
6362
6363/*----------------------------------------------------------------------------
6364| Returns the result of subtracting the absolute values of the quadruple-
6365| precision floating-point values `a' and `b'. If `zSign' is 1, the
6366| difference is negated before being returned. `zSign' is ignored if the
6367| result is a NaN. The subtraction is performed according to the IEC/IEEE
6368| Standard for Binary Floating-Point Arithmetic.
6369*----------------------------------------------------------------------------*/
6370
e5a41ffa
PM
6371static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6372 float_status *status)
158142c2 6373{
f4014512 6374 int32_t aExp, bExp, zExp;
bb98fe42 6375 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
f4014512 6376 int32_t expDiff;
158142c2
FB
6377 float128 z;
6378
6379 aSig1 = extractFloat128Frac1( a );
6380 aSig0 = extractFloat128Frac0( a );
6381 aExp = extractFloat128Exp( a );
6382 bSig1 = extractFloat128Frac1( b );
6383 bSig0 = extractFloat128Frac0( b );
6384 bExp = extractFloat128Exp( b );
6385 expDiff = aExp - bExp;
6386 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6387 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6388 if ( 0 < expDiff ) goto aExpBigger;
6389 if ( expDiff < 0 ) goto bExpBigger;
6390 if ( aExp == 0x7FFF ) {
6391 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6392 return propagateFloat128NaN(a, b, status);
158142c2 6393 }
ff32e16e 6394 float_raise(float_flag_invalid, status);
158142c2
FB
6395 z.low = float128_default_nan_low;
6396 z.high = float128_default_nan_high;
6397 return z;
6398 }
6399 if ( aExp == 0 ) {
6400 aExp = 1;
6401 bExp = 1;
6402 }
6403 if ( bSig0 < aSig0 ) goto aBigger;
6404 if ( aSig0 < bSig0 ) goto bBigger;
6405 if ( bSig1 < aSig1 ) goto aBigger;
6406 if ( aSig1 < bSig1 ) goto bBigger;
a2f2d288
PM
6407 return packFloat128(status->float_rounding_mode == float_round_down,
6408 0, 0, 0);
158142c2
FB
6409 bExpBigger:
6410 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6411 if (bSig0 | bSig1) {
6412 return propagateFloat128NaN(a, b, status);
6413 }
158142c2
FB
6414 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6415 }
6416 if ( aExp == 0 ) {
6417 ++expDiff;
6418 }
6419 else {
6420 aSig0 |= LIT64( 0x4000000000000000 );
6421 }
6422 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6423 bSig0 |= LIT64( 0x4000000000000000 );
6424 bBigger:
6425 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6426 zExp = bExp;
6427 zSign ^= 1;
6428 goto normalizeRoundAndPack;
6429 aExpBigger:
6430 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6431 if (aSig0 | aSig1) {
6432 return propagateFloat128NaN(a, b, status);
6433 }
158142c2
FB
6434 return a;
6435 }
6436 if ( bExp == 0 ) {
6437 --expDiff;
6438 }
6439 else {
6440 bSig0 |= LIT64( 0x4000000000000000 );
6441 }
6442 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6443 aSig0 |= LIT64( 0x4000000000000000 );
6444 aBigger:
6445 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6446 zExp = aExp;
6447 normalizeRoundAndPack:
6448 --zExp;
ff32e16e
PM
6449 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6450 status);
158142c2
FB
6451
6452}
6453
6454/*----------------------------------------------------------------------------
6455| Returns the result of adding the quadruple-precision floating-point values
6456| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6457| for Binary Floating-Point Arithmetic.
6458*----------------------------------------------------------------------------*/
6459
e5a41ffa 6460float128 float128_add(float128 a, float128 b, float_status *status)
158142c2
FB
6461{
6462 flag aSign, bSign;
6463
6464 aSign = extractFloat128Sign( a );
6465 bSign = extractFloat128Sign( b );
6466 if ( aSign == bSign ) {
ff32e16e 6467 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6468 }
6469 else {
ff32e16e 6470 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6471 }
6472
6473}
6474
6475/*----------------------------------------------------------------------------
6476| Returns the result of subtracting the quadruple-precision floating-point
6477| values `a' and `b'. The operation is performed according to the IEC/IEEE
6478| Standard for Binary Floating-Point Arithmetic.
6479*----------------------------------------------------------------------------*/
6480
e5a41ffa 6481float128 float128_sub(float128 a, float128 b, float_status *status)
158142c2
FB
6482{
6483 flag aSign, bSign;
6484
6485 aSign = extractFloat128Sign( a );
6486 bSign = extractFloat128Sign( b );
6487 if ( aSign == bSign ) {
ff32e16e 6488 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6489 }
6490 else {
ff32e16e 6491 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6492 }
6493
6494}
6495
6496/*----------------------------------------------------------------------------
6497| Returns the result of multiplying the quadruple-precision floating-point
6498| values `a' and `b'. The operation is performed according to the IEC/IEEE
6499| Standard for Binary Floating-Point Arithmetic.
6500*----------------------------------------------------------------------------*/
6501
e5a41ffa 6502float128 float128_mul(float128 a, float128 b, float_status *status)
158142c2
FB
6503{
6504 flag aSign, bSign, zSign;
f4014512 6505 int32_t aExp, bExp, zExp;
bb98fe42 6506 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6507 float128 z;
6508
6509 aSig1 = extractFloat128Frac1( a );
6510 aSig0 = extractFloat128Frac0( a );
6511 aExp = extractFloat128Exp( a );
6512 aSign = extractFloat128Sign( a );
6513 bSig1 = extractFloat128Frac1( b );
6514 bSig0 = extractFloat128Frac0( b );
6515 bExp = extractFloat128Exp( b );
6516 bSign = extractFloat128Sign( b );
6517 zSign = aSign ^ bSign;
6518 if ( aExp == 0x7FFF ) {
6519 if ( ( aSig0 | aSig1 )
6520 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6521 return propagateFloat128NaN(a, b, status);
158142c2
FB
6522 }
6523 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6524 return packFloat128( zSign, 0x7FFF, 0, 0 );
6525 }
6526 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6527 if (bSig0 | bSig1) {
6528 return propagateFloat128NaN(a, b, status);
6529 }
158142c2
FB
6530 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6531 invalid:
ff32e16e 6532 float_raise(float_flag_invalid, status);
158142c2
FB
6533 z.low = float128_default_nan_low;
6534 z.high = float128_default_nan_high;
6535 return z;
6536 }
6537 return packFloat128( zSign, 0x7FFF, 0, 0 );
6538 }
6539 if ( aExp == 0 ) {
6540 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6541 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6542 }
6543 if ( bExp == 0 ) {
6544 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6545 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6546 }
6547 zExp = aExp + bExp - 0x4000;
6548 aSig0 |= LIT64( 0x0001000000000000 );
6549 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6550 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6551 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6552 zSig2 |= ( zSig3 != 0 );
6553 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6554 shift128ExtraRightJamming(
6555 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6556 ++zExp;
6557 }
ff32e16e 6558 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6559
6560}
6561
6562/*----------------------------------------------------------------------------
6563| Returns the result of dividing the quadruple-precision floating-point value
6564| `a' by the corresponding value `b'. The operation is performed according to
6565| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6566*----------------------------------------------------------------------------*/
6567
e5a41ffa 6568float128 float128_div(float128 a, float128 b, float_status *status)
158142c2
FB
6569{
6570 flag aSign, bSign, zSign;
f4014512 6571 int32_t aExp, bExp, zExp;
bb98fe42
AF
6572 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6573 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6574 float128 z;
6575
6576 aSig1 = extractFloat128Frac1( a );
6577 aSig0 = extractFloat128Frac0( a );
6578 aExp = extractFloat128Exp( a );
6579 aSign = extractFloat128Sign( a );
6580 bSig1 = extractFloat128Frac1( b );
6581 bSig0 = extractFloat128Frac0( b );
6582 bExp = extractFloat128Exp( b );
6583 bSign = extractFloat128Sign( b );
6584 zSign = aSign ^ bSign;
6585 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6586 if (aSig0 | aSig1) {
6587 return propagateFloat128NaN(a, b, status);
6588 }
158142c2 6589 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6590 if (bSig0 | bSig1) {
6591 return propagateFloat128NaN(a, b, status);
6592 }
158142c2
FB
6593 goto invalid;
6594 }
6595 return packFloat128( zSign, 0x7FFF, 0, 0 );
6596 }
6597 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6598 if (bSig0 | bSig1) {
6599 return propagateFloat128NaN(a, b, status);
6600 }
158142c2
FB
6601 return packFloat128( zSign, 0, 0, 0 );
6602 }
6603 if ( bExp == 0 ) {
6604 if ( ( bSig0 | bSig1 ) == 0 ) {
6605 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6606 invalid:
ff32e16e 6607 float_raise(float_flag_invalid, status);
158142c2
FB
6608 z.low = float128_default_nan_low;
6609 z.high = float128_default_nan_high;
6610 return z;
6611 }
ff32e16e 6612 float_raise(float_flag_divbyzero, status);
158142c2
FB
6613 return packFloat128( zSign, 0x7FFF, 0, 0 );
6614 }
6615 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6616 }
6617 if ( aExp == 0 ) {
6618 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6619 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6620 }
6621 zExp = aExp - bExp + 0x3FFD;
6622 shortShift128Left(
6623 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6624 shortShift128Left(
6625 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6626 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6627 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6628 ++zExp;
6629 }
6630 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6631 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6632 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6633 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6634 --zSig0;
6635 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6636 }
6637 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6638 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6639 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6640 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6641 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6642 --zSig1;
6643 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6644 }
6645 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6646 }
6647 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
ff32e16e 6648 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6649
6650}
6651
6652/*----------------------------------------------------------------------------
6653| Returns the remainder of the quadruple-precision floating-point value `a'
6654| with respect to the corresponding value `b'. The operation is performed
6655| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6656*----------------------------------------------------------------------------*/
6657
e5a41ffa 6658float128 float128_rem(float128 a, float128 b, float_status *status)
158142c2 6659{
ed086f3d 6660 flag aSign, zSign;
f4014512 6661 int32_t aExp, bExp, expDiff;
bb98fe42
AF
6662 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6663 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6664 int64_t sigMean0;
158142c2
FB
6665 float128 z;
6666
6667 aSig1 = extractFloat128Frac1( a );
6668 aSig0 = extractFloat128Frac0( a );
6669 aExp = extractFloat128Exp( a );
6670 aSign = extractFloat128Sign( a );
6671 bSig1 = extractFloat128Frac1( b );
6672 bSig0 = extractFloat128Frac0( b );
6673 bExp = extractFloat128Exp( b );
158142c2
FB
6674 if ( aExp == 0x7FFF ) {
6675 if ( ( aSig0 | aSig1 )
6676 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6677 return propagateFloat128NaN(a, b, status);
158142c2
FB
6678 }
6679 goto invalid;
6680 }
6681 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6682 if (bSig0 | bSig1) {
6683 return propagateFloat128NaN(a, b, status);
6684 }
158142c2
FB
6685 return a;
6686 }
6687 if ( bExp == 0 ) {
6688 if ( ( bSig0 | bSig1 ) == 0 ) {
6689 invalid:
ff32e16e 6690 float_raise(float_flag_invalid, status);
158142c2
FB
6691 z.low = float128_default_nan_low;
6692 z.high = float128_default_nan_high;
6693 return z;
6694 }
6695 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6696 }
6697 if ( aExp == 0 ) {
6698 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6699 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6700 }
6701 expDiff = aExp - bExp;
6702 if ( expDiff < -1 ) return a;
6703 shortShift128Left(
6704 aSig0 | LIT64( 0x0001000000000000 ),
6705 aSig1,
6706 15 - ( expDiff < 0 ),
6707 &aSig0,
6708 &aSig1
6709 );
6710 shortShift128Left(
6711 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6712 q = le128( bSig0, bSig1, aSig0, aSig1 );
6713 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6714 expDiff -= 64;
6715 while ( 0 < expDiff ) {
6716 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6717 q = ( 4 < q ) ? q - 4 : 0;
6718 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6719 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6720 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6721 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6722 expDiff -= 61;
6723 }
6724 if ( -64 < expDiff ) {
6725 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6726 q = ( 4 < q ) ? q - 4 : 0;
6727 q >>= - expDiff;
6728 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6729 expDiff += 52;
6730 if ( expDiff < 0 ) {
6731 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6732 }
6733 else {
6734 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6735 }
6736 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6737 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6738 }
6739 else {
6740 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6741 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6742 }
6743 do {
6744 alternateASig0 = aSig0;
6745 alternateASig1 = aSig1;
6746 ++q;
6747 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6748 } while ( 0 <= (int64_t) aSig0 );
158142c2 6749 add128(
bb98fe42 6750 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6751 if ( ( sigMean0 < 0 )
6752 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6753 aSig0 = alternateASig0;
6754 aSig1 = alternateASig1;
6755 }
bb98fe42 6756 zSign = ( (int64_t) aSig0 < 0 );
158142c2 6757 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
ff32e16e
PM
6758 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6759 status);
158142c2
FB
6760}
6761
6762/*----------------------------------------------------------------------------
6763| Returns the square root of the quadruple-precision floating-point value `a'.
6764| The operation is performed according to the IEC/IEEE Standard for Binary
6765| Floating-Point Arithmetic.
6766*----------------------------------------------------------------------------*/
6767
e5a41ffa 6768float128 float128_sqrt(float128 a, float_status *status)
158142c2
FB
6769{
6770 flag aSign;
f4014512 6771 int32_t aExp, zExp;
bb98fe42
AF
6772 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6773 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6774 float128 z;
6775
6776 aSig1 = extractFloat128Frac1( a );
6777 aSig0 = extractFloat128Frac0( a );
6778 aExp = extractFloat128Exp( a );
6779 aSign = extractFloat128Sign( a );
6780 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6781 if (aSig0 | aSig1) {
6782 return propagateFloat128NaN(a, a, status);
6783 }
158142c2
FB
6784 if ( ! aSign ) return a;
6785 goto invalid;
6786 }
6787 if ( aSign ) {
6788 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6789 invalid:
ff32e16e 6790 float_raise(float_flag_invalid, status);
158142c2
FB
6791 z.low = float128_default_nan_low;
6792 z.high = float128_default_nan_high;
6793 return z;
6794 }
6795 if ( aExp == 0 ) {
6796 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6797 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6798 }
6799 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6800 aSig0 |= LIT64( 0x0001000000000000 );
6801 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6802 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6803 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6804 doubleZSig0 = zSig0<<1;
6805 mul64To128( zSig0, zSig0, &term0, &term1 );
6806 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6807 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6808 --zSig0;
6809 doubleZSig0 -= 2;
6810 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6811 }
6812 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6813 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6814 if ( zSig1 == 0 ) zSig1 = 1;
6815 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6816 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6817 mul64To128( zSig1, zSig1, &term2, &term3 );
6818 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6819 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6820 --zSig1;
6821 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6822 term3 |= 1;
6823 term2 |= doubleZSig0;
6824 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6825 }
6826 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6827 }
6828 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
ff32e16e 6829 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6830
6831}
6832
6833/*----------------------------------------------------------------------------
6834| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6835| the corresponding value `b', and 0 otherwise. The invalid exception is
6836| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6837| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6838*----------------------------------------------------------------------------*/
6839
e5a41ffa 6840int float128_eq(float128 a, float128 b, float_status *status)
158142c2
FB
6841{
6842
6843 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6844 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6845 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6846 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6847 ) {
ff32e16e 6848 float_raise(float_flag_invalid, status);
158142c2
FB
6849 return 0;
6850 }
6851 return
6852 ( a.low == b.low )
6853 && ( ( a.high == b.high )
6854 || ( ( a.low == 0 )
bb98fe42 6855 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6856 );
6857
6858}
6859
6860/*----------------------------------------------------------------------------
6861| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6862| or equal to the corresponding value `b', and 0 otherwise. The invalid
6863| exception is raised if either operand is a NaN. The comparison is performed
6864| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6865*----------------------------------------------------------------------------*/
6866
e5a41ffa 6867int float128_le(float128 a, float128 b, float_status *status)
158142c2
FB
6868{
6869 flag aSign, bSign;
6870
6871 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6872 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6873 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6874 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6875 ) {
ff32e16e 6876 float_raise(float_flag_invalid, status);
158142c2
FB
6877 return 0;
6878 }
6879 aSign = extractFloat128Sign( a );
6880 bSign = extractFloat128Sign( b );
6881 if ( aSign != bSign ) {
6882 return
6883 aSign
bb98fe42 6884 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6885 == 0 );
6886 }
6887 return
6888 aSign ? le128( b.high, b.low, a.high, a.low )
6889 : le128( a.high, a.low, b.high, b.low );
6890
6891}
6892
6893/*----------------------------------------------------------------------------
6894| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6895| the corresponding value `b', and 0 otherwise. The invalid exception is
6896| raised if either operand is a NaN. The comparison is performed according
6897| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6898*----------------------------------------------------------------------------*/
6899
e5a41ffa 6900int float128_lt(float128 a, float128 b, float_status *status)
158142c2
FB
6901{
6902 flag aSign, bSign;
6903
6904 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6905 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6906 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6907 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6908 ) {
ff32e16e 6909 float_raise(float_flag_invalid, status);
158142c2
FB
6910 return 0;
6911 }
6912 aSign = extractFloat128Sign( a );
6913 bSign = extractFloat128Sign( b );
6914 if ( aSign != bSign ) {
6915 return
6916 aSign
bb98fe42 6917 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6918 != 0 );
6919 }
6920 return
6921 aSign ? lt128( b.high, b.low, a.high, a.low )
6922 : lt128( a.high, a.low, b.high, b.low );
6923
6924}
6925
67b7861d
AJ
6926/*----------------------------------------------------------------------------
6927| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6928| be compared, and 0 otherwise. The invalid exception is raised if either
6929| operand is a NaN. The comparison is performed according to the IEC/IEEE
6930| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6931*----------------------------------------------------------------------------*/
6932
e5a41ffa 6933int float128_unordered(float128 a, float128 b, float_status *status)
67b7861d
AJ
6934{
6935 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6936 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6937 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6938 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6939 ) {
ff32e16e 6940 float_raise(float_flag_invalid, status);
67b7861d
AJ
6941 return 1;
6942 }
6943 return 0;
6944}
6945
158142c2
FB
6946/*----------------------------------------------------------------------------
6947| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6948| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6949| exception. The comparison is performed according to the IEC/IEEE Standard
6950| for Binary Floating-Point Arithmetic.
158142c2
FB
6951*----------------------------------------------------------------------------*/
6952
e5a41ffa 6953int float128_eq_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6954{
6955
6956 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6957 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6958 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6959 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6960 ) {
b689362d
AJ
6961 if ( float128_is_signaling_nan( a )
6962 || float128_is_signaling_nan( b ) ) {
ff32e16e 6963 float_raise(float_flag_invalid, status);
b689362d 6964 }
158142c2
FB
6965 return 0;
6966 }
6967 return
6968 ( a.low == b.low )
6969 && ( ( a.high == b.high )
6970 || ( ( a.low == 0 )
bb98fe42 6971 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6972 );
6973
6974}
6975
6976/*----------------------------------------------------------------------------
6977| Returns 1 if the quadruple-precision floating-point value `a' is less than
6978| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6979| cause an exception. Otherwise, the comparison is performed according to the
6980| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6981*----------------------------------------------------------------------------*/
6982
e5a41ffa 6983int float128_le_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6984{
6985 flag aSign, bSign;
6986
6987 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6988 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6989 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6990 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6991 ) {
6992 if ( float128_is_signaling_nan( a )
6993 || float128_is_signaling_nan( b ) ) {
ff32e16e 6994 float_raise(float_flag_invalid, status);
158142c2
FB
6995 }
6996 return 0;
6997 }
6998 aSign = extractFloat128Sign( a );
6999 bSign = extractFloat128Sign( b );
7000 if ( aSign != bSign ) {
7001 return
7002 aSign
bb98fe42 7003 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7004 == 0 );
7005 }
7006 return
7007 aSign ? le128( b.high, b.low, a.high, a.low )
7008 : le128( a.high, a.low, b.high, b.low );
7009
7010}
7011
7012/*----------------------------------------------------------------------------
7013| Returns 1 if the quadruple-precision floating-point value `a' is less than
7014| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7015| exception. Otherwise, the comparison is performed according to the IEC/IEEE
7016| Standard for Binary Floating-Point Arithmetic.
7017*----------------------------------------------------------------------------*/
7018
e5a41ffa 7019int float128_lt_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
7020{
7021 flag aSign, bSign;
7022
7023 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7024 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7025 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7026 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7027 ) {
7028 if ( float128_is_signaling_nan( a )
7029 || float128_is_signaling_nan( b ) ) {
ff32e16e 7030 float_raise(float_flag_invalid, status);
158142c2
FB
7031 }
7032 return 0;
7033 }
7034 aSign = extractFloat128Sign( a );
7035 bSign = extractFloat128Sign( b );
7036 if ( aSign != bSign ) {
7037 return
7038 aSign
bb98fe42 7039 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7040 != 0 );
7041 }
7042 return
7043 aSign ? lt128( b.high, b.low, a.high, a.low )
7044 : lt128( a.high, a.low, b.high, b.low );
7045
7046}
7047
67b7861d
AJ
7048/*----------------------------------------------------------------------------
7049| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7050| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7051| comparison is performed according to the IEC/IEEE Standard for Binary
7052| Floating-Point Arithmetic.
7053*----------------------------------------------------------------------------*/
7054
e5a41ffa 7055int float128_unordered_quiet(float128 a, float128 b, float_status *status)
67b7861d
AJ
7056{
7057 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7058 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7059 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7060 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7061 ) {
7062 if ( float128_is_signaling_nan( a )
7063 || float128_is_signaling_nan( b ) ) {
ff32e16e 7064 float_raise(float_flag_invalid, status);
67b7861d
AJ
7065 }
7066 return 1;
7067 }
7068 return 0;
7069}
7070
1d6bda35 7071/* misc functions */
e5a41ffa 7072float32 uint32_to_float32(uint32_t a, float_status *status)
1d6bda35 7073{
ff32e16e 7074 return int64_to_float32(a, status);
1d6bda35
FB
7075}
7076
e5a41ffa 7077float64 uint32_to_float64(uint32_t a, float_status *status)
1d6bda35 7078{
ff32e16e 7079 return int64_to_float64(a, status);
1d6bda35
FB
7080}
7081
3a87d009 7082uint32_t float32_to_uint32(float32 a, float_status *status)
1d6bda35
FB
7083{
7084 int64_t v;
3a87d009 7085 uint32_t res;
34e1c27b 7086 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7087
ff32e16e 7088 v = float32_to_int64(a, status);
1d6bda35
FB
7089 if (v < 0) {
7090 res = 0;
1d6bda35
FB
7091 } else if (v > 0xffffffff) {
7092 res = 0xffffffff;
1d6bda35 7093 } else {
34e1c27b 7094 return v;
1d6bda35 7095 }
34e1c27b 7096 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7097 float_raise(float_flag_invalid, status);
1d6bda35
FB
7098 return res;
7099}
7100
3a87d009 7101uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
1d6bda35
FB
7102{
7103 int64_t v;
3a87d009 7104 uint32_t res;
34e1c27b 7105 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7106
ff32e16e 7107 v = float32_to_int64_round_to_zero(a, status);
1d6bda35
FB
7108 if (v < 0) {
7109 res = 0;
1d6bda35
FB
7110 } else if (v > 0xffffffff) {
7111 res = 0xffffffff;
1d6bda35 7112 } else {
34e1c27b 7113 return v;
1d6bda35 7114 }
34e1c27b 7115 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7116 float_raise(float_flag_invalid, status);
1d6bda35
FB
7117 return res;
7118}
7119
0bb721d7 7120int16_t float32_to_int16(float32 a, float_status *status)
f581bf54
WN
7121{
7122 int32_t v;
0bb721d7 7123 int16_t res;
f581bf54
WN
7124 int old_exc_flags = get_float_exception_flags(status);
7125
ff32e16e 7126 v = float32_to_int32(a, status);
f581bf54
WN
7127 if (v < -0x8000) {
7128 res = -0x8000;
7129 } else if (v > 0x7fff) {
7130 res = 0x7fff;
7131 } else {
7132 return v;
7133 }
7134
7135 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7136 float_raise(float_flag_invalid, status);
f581bf54
WN
7137 return res;
7138}
7139
0bb721d7 7140uint16_t float32_to_uint16(float32 a, float_status *status)
f581bf54
WN
7141{
7142 int32_t v;
0bb721d7 7143 uint16_t res;
f581bf54
WN
7144 int old_exc_flags = get_float_exception_flags(status);
7145
ff32e16e 7146 v = float32_to_int32(a, status);
f581bf54
WN
7147 if (v < 0) {
7148 res = 0;
7149 } else if (v > 0xffff) {
7150 res = 0xffff;
7151 } else {
7152 return v;
7153 }
7154
7155 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7156 float_raise(float_flag_invalid, status);
f581bf54
WN
7157 return res;
7158}
7159
0bb721d7 7160uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
7161{
7162 int64_t v;
0bb721d7 7163 uint16_t res;
34e1c27b 7164 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7165
ff32e16e 7166 v = float32_to_int64_round_to_zero(a, status);
cbcef455
PM
7167 if (v < 0) {
7168 res = 0;
cbcef455
PM
7169 } else if (v > 0xffff) {
7170 res = 0xffff;
cbcef455 7171 } else {
34e1c27b 7172 return v;
cbcef455 7173 }
34e1c27b 7174 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7175 float_raise(float_flag_invalid, status);
cbcef455
PM
7176 return res;
7177}
7178
3a87d009 7179uint32_t float64_to_uint32(float64 a, float_status *status)
1d6bda35 7180{
5e7f654f 7181 uint64_t v;
3a87d009 7182 uint32_t res;
5e7f654f 7183 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7184
ff32e16e 7185 v = float64_to_uint64(a, status);
5e7f654f 7186 if (v > 0xffffffff) {
1d6bda35 7187 res = 0xffffffff;
1d6bda35 7188 } else {
5e7f654f 7189 return v;
1d6bda35 7190 }
5e7f654f 7191 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7192 float_raise(float_flag_invalid, status);
1d6bda35
FB
7193 return res;
7194}
7195
3a87d009 7196uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
1d6bda35 7197{
fd728f2f 7198 uint64_t v;
3a87d009 7199 uint32_t res;
fd728f2f 7200 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7201
ff32e16e 7202 v = float64_to_uint64_round_to_zero(a, status);
fd728f2f 7203 if (v > 0xffffffff) {
1d6bda35 7204 res = 0xffffffff;
1d6bda35 7205 } else {
fd728f2f 7206 return v;
1d6bda35 7207 }
fd728f2f 7208 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7209 float_raise(float_flag_invalid, status);
1d6bda35
FB
7210 return res;
7211}
7212
0bb721d7 7213int16_t float64_to_int16(float64 a, float_status *status)
f581bf54
WN
7214{
7215 int64_t v;
0bb721d7 7216 int16_t res;
f581bf54
WN
7217 int old_exc_flags = get_float_exception_flags(status);
7218
ff32e16e 7219 v = float64_to_int32(a, status);
f581bf54
WN
7220 if (v < -0x8000) {
7221 res = -0x8000;
7222 } else if (v > 0x7fff) {
7223 res = 0x7fff;
7224 } else {
7225 return v;
7226 }
7227
7228 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7229 float_raise(float_flag_invalid, status);
f581bf54
WN
7230 return res;
7231}
7232
0bb721d7 7233uint16_t float64_to_uint16(float64 a, float_status *status)
f581bf54
WN
7234{
7235 int64_t v;
0bb721d7 7236 uint16_t res;
f581bf54
WN
7237 int old_exc_flags = get_float_exception_flags(status);
7238
ff32e16e 7239 v = float64_to_int32(a, status);
f581bf54
WN
7240 if (v < 0) {
7241 res = 0;
7242 } else if (v > 0xffff) {
7243 res = 0xffff;
7244 } else {
7245 return v;
7246 }
7247
7248 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7249 float_raise(float_flag_invalid, status);
f581bf54
WN
7250 return res;
7251}
7252
0bb721d7 7253uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
7254{
7255 int64_t v;
0bb721d7 7256 uint16_t res;
34e1c27b 7257 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7258
ff32e16e 7259 v = float64_to_int64_round_to_zero(a, status);
cbcef455
PM
7260 if (v < 0) {
7261 res = 0;
cbcef455
PM
7262 } else if (v > 0xffff) {
7263 res = 0xffff;
cbcef455 7264 } else {
34e1c27b 7265 return v;
cbcef455 7266 }
34e1c27b 7267 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7268 float_raise(float_flag_invalid, status);
cbcef455
PM
7269 return res;
7270}
7271
fb3ea83a
TM
7272/*----------------------------------------------------------------------------
7273| Returns the result of converting the double-precision floating-point value
7274| `a' to the 64-bit unsigned integer format. The conversion is
7275| performed according to the IEC/IEEE Standard for Binary Floating-Point
7276| Arithmetic---which means in particular that the conversion is rounded
7277| according to the current rounding mode. If `a' is a NaN, the largest
7278| positive integer is returned. If the conversion overflows, the
7279| largest unsigned integer is returned. If 'a' is negative, the value is
7280| rounded and zero is returned; negative values that do not round to zero
7281| will raise the inexact exception.
7282*----------------------------------------------------------------------------*/
75d62a58 7283
e5a41ffa 7284uint64_t float64_to_uint64(float64 a, float_status *status)
fb3ea83a
TM
7285{
7286 flag aSign;
7287 int_fast16_t aExp, shiftCount;
7288 uint64_t aSig, aSigExtra;
ff32e16e 7289 a = float64_squash_input_denormal(a, status);
75d62a58 7290
fb3ea83a
TM
7291 aSig = extractFloat64Frac(a);
7292 aExp = extractFloat64Exp(a);
7293 aSign = extractFloat64Sign(a);
7294 if (aSign && (aExp > 1022)) {
ff32e16e 7295 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7296 if (float64_is_any_nan(a)) {
7297 return LIT64(0xFFFFFFFFFFFFFFFF);
7298 } else {
7299 return 0;
7300 }
7301 }
7302 if (aExp) {
7303 aSig |= LIT64(0x0010000000000000);
7304 }
7305 shiftCount = 0x433 - aExp;
7306 if (shiftCount <= 0) {
7307 if (0x43E < aExp) {
ff32e16e 7308 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7309 return LIT64(0xFFFFFFFFFFFFFFFF);
7310 }
7311 aSigExtra = 0;
7312 aSig <<= -shiftCount;
7313 } else {
7314 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7315 }
ff32e16e 7316 return roundAndPackUint64(aSign, aSig, aSigExtra, status);
75d62a58
JM
7317}
7318
e5a41ffa 7319uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
75d62a58 7320{
a2f2d288 7321 signed char current_rounding_mode = status->float_rounding_mode;
ff32e16e
PM
7322 set_float_rounding_mode(float_round_to_zero, status);
7323 int64_t v = float64_to_uint64(a, status);
7324 set_float_rounding_mode(current_rounding_mode, status);
0a87a310 7325 return v;
75d62a58
JM
7326}
7327
1d6bda35 7328#define COMPARE(s, nan_exp) \
e5a41ffa
PM
7329static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7330 int is_quiet, float_status *status) \
1d6bda35
FB
7331{ \
7332 flag aSign, bSign; \
bb98fe42 7333 uint ## s ## _t av, bv; \
ff32e16e
PM
7334 a = float ## s ## _squash_input_denormal(a, status); \
7335 b = float ## s ## _squash_input_denormal(b, status); \
1d6bda35
FB
7336 \
7337 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7338 extractFloat ## s ## Frac( a ) ) || \
7339 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7340 extractFloat ## s ## Frac( b ) )) { \
7341 if (!is_quiet || \
7342 float ## s ## _is_signaling_nan( a ) || \
7343 float ## s ## _is_signaling_nan( b ) ) { \
ff32e16e 7344 float_raise(float_flag_invalid, status); \
1d6bda35
FB
7345 } \
7346 return float_relation_unordered; \
7347 } \
7348 aSign = extractFloat ## s ## Sign( a ); \
7349 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 7350 av = float ## s ## _val(a); \
cd8a2533 7351 bv = float ## s ## _val(b); \
1d6bda35 7352 if ( aSign != bSign ) { \
bb98fe42 7353 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
7354 /* zero case */ \
7355 return float_relation_equal; \
7356 } else { \
7357 return 1 - (2 * aSign); \
7358 } \
7359 } else { \
f090c9d4 7360 if (av == bv) { \
1d6bda35
FB
7361 return float_relation_equal; \
7362 } else { \
f090c9d4 7363 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
7364 } \
7365 } \
7366} \
7367 \
e5a41ffa 7368int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
1d6bda35 7369{ \
ff32e16e 7370 return float ## s ## _compare_internal(a, b, 0, status); \
1d6bda35
FB
7371} \
7372 \
e5a41ffa
PM
7373int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7374 float_status *status) \
1d6bda35 7375{ \
ff32e16e 7376 return float ## s ## _compare_internal(a, b, 1, status); \
1d6bda35
FB
7377}
7378
7379COMPARE(32, 0xff)
7380COMPARE(64, 0x7ff)
9ee6e8bb 7381
e5a41ffa
PM
7382static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7383 int is_quiet, float_status *status)
f6714d36
AJ
7384{
7385 flag aSign, bSign;
7386
7387 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7388 ( extractFloatx80Frac( a )<<1 ) ) ||
7389 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7390 ( extractFloatx80Frac( b )<<1 ) )) {
7391 if (!is_quiet ||
7392 floatx80_is_signaling_nan( a ) ||
7393 floatx80_is_signaling_nan( b ) ) {
ff32e16e 7394 float_raise(float_flag_invalid, status);
f6714d36
AJ
7395 }
7396 return float_relation_unordered;
7397 }
7398 aSign = extractFloatx80Sign( a );
7399 bSign = extractFloatx80Sign( b );
7400 if ( aSign != bSign ) {
7401
7402 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7403 ( ( a.low | b.low ) == 0 ) ) {
7404 /* zero case */
7405 return float_relation_equal;
7406 } else {
7407 return 1 - (2 * aSign);
7408 }
7409 } else {
7410 if (a.low == b.low && a.high == b.high) {
7411 return float_relation_equal;
7412 } else {
7413 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7414 }
7415 }
7416}
7417
e5a41ffa 7418int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
f6714d36 7419{
ff32e16e 7420 return floatx80_compare_internal(a, b, 0, status);
f6714d36
AJ
7421}
7422
e5a41ffa 7423int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
f6714d36 7424{
ff32e16e 7425 return floatx80_compare_internal(a, b, 1, status);
f6714d36
AJ
7426}
7427
e5a41ffa
PM
7428static inline int float128_compare_internal(float128 a, float128 b,
7429 int is_quiet, float_status *status)
1f587329
BS
7430{
7431 flag aSign, bSign;
7432
7433 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7434 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7435 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7436 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7437 if (!is_quiet ||
7438 float128_is_signaling_nan( a ) ||
7439 float128_is_signaling_nan( b ) ) {
ff32e16e 7440 float_raise(float_flag_invalid, status);
1f587329
BS
7441 }
7442 return float_relation_unordered;
7443 }
7444 aSign = extractFloat128Sign( a );
7445 bSign = extractFloat128Sign( b );
7446 if ( aSign != bSign ) {
7447 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7448 /* zero case */
7449 return float_relation_equal;
7450 } else {
7451 return 1 - (2 * aSign);
7452 }
7453 } else {
7454 if (a.low == b.low && a.high == b.high) {
7455 return float_relation_equal;
7456 } else {
7457 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7458 }
7459 }
7460}
7461
e5a41ffa 7462int float128_compare(float128 a, float128 b, float_status *status)
1f587329 7463{
ff32e16e 7464 return float128_compare_internal(a, b, 0, status);
1f587329
BS
7465}
7466
e5a41ffa 7467int float128_compare_quiet(float128 a, float128 b, float_status *status)
1f587329 7468{
ff32e16e 7469 return float128_compare_internal(a, b, 1, status);
1f587329
BS
7470}
7471
274f1b04
PM
7472/* min() and max() functions. These can't be implemented as
7473 * 'compare and pick one input' because that would mishandle
7474 * NaNs and +0 vs -0.
e17ab310
WN
7475 *
7476 * minnum() and maxnum() functions. These are similar to the min()
7477 * and max() functions but if one of the arguments is a QNaN and
7478 * the other is numerical then the numerical argument is returned.
7479 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7480 * and maxNum() operations. min() and max() are the typical min/max
7481 * semantics provided by many CPUs which predate that specification.
2d31e060
LA
7482 *
7483 * minnummag() and maxnummag() functions correspond to minNumMag()
7484 * and maxNumMag() from IEEE 754-2008.
274f1b04 7485 */
e70614ea 7486#define MINMAX(s) \
a49db98d 7487static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
2d31e060 7488 int ismin, int isieee, \
e5a41ffa
PM
7489 int ismag, \
7490 float_status *status) \
274f1b04
PM
7491{ \
7492 flag aSign, bSign; \
2d31e060 7493 uint ## s ## _t av, bv, aav, abv; \
ff32e16e
PM
7494 a = float ## s ## _squash_input_denormal(a, status); \
7495 b = float ## s ## _squash_input_denormal(b, status); \
274f1b04
PM
7496 if (float ## s ## _is_any_nan(a) || \
7497 float ## s ## _is_any_nan(b)) { \
e17ab310
WN
7498 if (isieee) { \
7499 if (float ## s ## _is_quiet_nan(a) && \
7500 !float ## s ##_is_any_nan(b)) { \
7501 return b; \
7502 } else if (float ## s ## _is_quiet_nan(b) && \
7503 !float ## s ## _is_any_nan(a)) { \
7504 return a; \
7505 } \
7506 } \
ff32e16e 7507 return propagateFloat ## s ## NaN(a, b, status); \
274f1b04
PM
7508 } \
7509 aSign = extractFloat ## s ## Sign(a); \
7510 bSign = extractFloat ## s ## Sign(b); \
7511 av = float ## s ## _val(a); \
7512 bv = float ## s ## _val(b); \
2d31e060
LA
7513 if (ismag) { \
7514 aav = float ## s ## _abs(av); \
7515 abv = float ## s ## _abs(bv); \
7516 if (aav != abv) { \
7517 if (ismin) { \
7518 return (aav < abv) ? a : b; \
7519 } else { \
7520 return (aav < abv) ? b : a; \
7521 } \
7522 } \
7523 } \
274f1b04
PM
7524 if (aSign != bSign) { \
7525 if (ismin) { \
7526 return aSign ? a : b; \
7527 } else { \
7528 return aSign ? b : a; \
7529 } \
7530 } else { \
7531 if (ismin) { \
7532 return (aSign ^ (av < bv)) ? a : b; \
7533 } else { \
7534 return (aSign ^ (av < bv)) ? b : a; \
7535 } \
7536 } \
7537} \
7538 \
e5a41ffa
PM
7539float ## s float ## s ## _min(float ## s a, float ## s b, \
7540 float_status *status) \
274f1b04 7541{ \
ff32e16e 7542 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
274f1b04
PM
7543} \
7544 \
e5a41ffa
PM
7545float ## s float ## s ## _max(float ## s a, float ## s b, \
7546 float_status *status) \
274f1b04 7547{ \
ff32e16e 7548 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
e17ab310
WN
7549} \
7550 \
e5a41ffa
PM
7551float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7552 float_status *status) \
e17ab310 7553{ \
ff32e16e 7554 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
e17ab310
WN
7555} \
7556 \
e5a41ffa
PM
7557float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7558 float_status *status) \
e17ab310 7559{ \
ff32e16e 7560 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
2d31e060
LA
7561} \
7562 \
e5a41ffa
PM
7563float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7564 float_status *status) \
2d31e060 7565{ \
ff32e16e 7566 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
2d31e060
LA
7567} \
7568 \
e5a41ffa
PM
7569float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7570 float_status *status) \
2d31e060 7571{ \
ff32e16e 7572 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
274f1b04
PM
7573}
7574
e70614ea
WN
7575MINMAX(32)
7576MINMAX(64)
274f1b04
PM
7577
7578
9ee6e8bb 7579/* Multiply A by 2 raised to the power N. */
e5a41ffa 7580float32 float32_scalbn(float32 a, int n, float_status *status)
9ee6e8bb
PB
7581{
7582 flag aSign;
326b9e98 7583 int16_t aExp;
bb98fe42 7584 uint32_t aSig;
9ee6e8bb 7585
ff32e16e 7586 a = float32_squash_input_denormal(a, status);
9ee6e8bb
PB
7587 aSig = extractFloat32Frac( a );
7588 aExp = extractFloat32Exp( a );
7589 aSign = extractFloat32Sign( a );
7590
7591 if ( aExp == 0xFF ) {
326b9e98 7592 if ( aSig ) {
ff32e16e 7593 return propagateFloat32NaN(a, a, status);
326b9e98 7594 }
9ee6e8bb
PB
7595 return a;
7596 }
3c85c37f 7597 if (aExp != 0) {
69397542 7598 aSig |= 0x00800000;
3c85c37f 7599 } else if (aSig == 0) {
69397542 7600 return a;
3c85c37f
PM
7601 } else {
7602 aExp++;
7603 }
69397542 7604
326b9e98
AJ
7605 if (n > 0x200) {
7606 n = 0x200;
7607 } else if (n < -0x200) {
7608 n = -0x200;
7609 }
7610
69397542
PB
7611 aExp += n - 1;
7612 aSig <<= 7;
ff32e16e 7613 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
9ee6e8bb
PB
7614}
7615
e5a41ffa 7616float64 float64_scalbn(float64 a, int n, float_status *status)
9ee6e8bb
PB
7617{
7618 flag aSign;
326b9e98 7619 int16_t aExp;
bb98fe42 7620 uint64_t aSig;
9ee6e8bb 7621
ff32e16e 7622 a = float64_squash_input_denormal(a, status);
9ee6e8bb
PB
7623 aSig = extractFloat64Frac( a );
7624 aExp = extractFloat64Exp( a );
7625 aSign = extractFloat64Sign( a );
7626
7627 if ( aExp == 0x7FF ) {
326b9e98 7628 if ( aSig ) {
ff32e16e 7629 return propagateFloat64NaN(a, a, status);
326b9e98 7630 }
9ee6e8bb
PB
7631 return a;
7632 }
3c85c37f 7633 if (aExp != 0) {
69397542 7634 aSig |= LIT64( 0x0010000000000000 );
3c85c37f 7635 } else if (aSig == 0) {
69397542 7636 return a;
3c85c37f
PM
7637 } else {
7638 aExp++;
7639 }
69397542 7640
326b9e98
AJ
7641 if (n > 0x1000) {
7642 n = 0x1000;
7643 } else if (n < -0x1000) {
7644 n = -0x1000;
7645 }
7646
69397542
PB
7647 aExp += n - 1;
7648 aSig <<= 10;
ff32e16e 7649 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
9ee6e8bb
PB
7650}
7651
e5a41ffa 7652floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
9ee6e8bb
PB
7653{
7654 flag aSign;
326b9e98 7655 int32_t aExp;
bb98fe42 7656 uint64_t aSig;
9ee6e8bb
PB
7657
7658 aSig = extractFloatx80Frac( a );
7659 aExp = extractFloatx80Exp( a );
7660 aSign = extractFloatx80Sign( a );
7661
326b9e98
AJ
7662 if ( aExp == 0x7FFF ) {
7663 if ( aSig<<1 ) {
ff32e16e 7664 return propagateFloatx80NaN(a, a, status);
326b9e98 7665 }
9ee6e8bb
PB
7666 return a;
7667 }
326b9e98 7668
3c85c37f
PM
7669 if (aExp == 0) {
7670 if (aSig == 0) {
7671 return a;
7672 }
7673 aExp++;
7674 }
69397542 7675
326b9e98
AJ
7676 if (n > 0x10000) {
7677 n = 0x10000;
7678 } else if (n < -0x10000) {
7679 n = -0x10000;
7680 }
7681
9ee6e8bb 7682 aExp += n;
a2f2d288
PM
7683 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7684 aSign, aExp, aSig, 0, status);
9ee6e8bb 7685}
9ee6e8bb 7686
e5a41ffa 7687float128 float128_scalbn(float128 a, int n, float_status *status)
9ee6e8bb
PB
7688{
7689 flag aSign;
326b9e98 7690 int32_t aExp;
bb98fe42 7691 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7692
7693 aSig1 = extractFloat128Frac1( a );
7694 aSig0 = extractFloat128Frac0( a );
7695 aExp = extractFloat128Exp( a );
7696 aSign = extractFloat128Sign( a );
7697 if ( aExp == 0x7FFF ) {
326b9e98 7698 if ( aSig0 | aSig1 ) {
ff32e16e 7699 return propagateFloat128NaN(a, a, status);
326b9e98 7700 }
9ee6e8bb
PB
7701 return a;
7702 }
3c85c37f 7703 if (aExp != 0) {
69397542 7704 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7705 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7706 return a;
3c85c37f
PM
7707 } else {
7708 aExp++;
7709 }
69397542 7710
326b9e98
AJ
7711 if (n > 0x10000) {
7712 n = 0x10000;
7713 } else if (n < -0x10000) {
7714 n = -0x10000;
7715 }
7716
69397542
PB
7717 aExp += n - 1;
7718 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
ff32e16e 7719 , status);
9ee6e8bb
PB
7720
7721}