]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
hmp: Add nbd_server_remove to mirror QMP command
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
16017c48
PM
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
8d725fac 16 */
158142c2 17
a7d1ac78
PM
18/*
19===============================================================================
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
158142c2
FB
22
23Written by John R. Hauser. This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704. Funding was partially provided by the
26National Science Foundation under grant MIP-9311980. The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek. More information
a7d1ac78 30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
158142c2
FB
31arithmetic/SoftFloat.html'.
32
a7d1ac78
PM
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
158142c2
FB
38
39Derivative works are acceptable, even for commercial purposes, so long as
a7d1ac78
PM
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
158142c2 43
a7d1ac78
PM
44===============================================================================
45*/
158142c2 46
16017c48
PM
47/* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78/* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
2ac8bd03
PM
82/* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
d38ea87a 85#include "qemu/osdep.h"
2ac8bd03 86
6b4c305c 87#include "fpu/softfloat.h"
158142c2 88
dc355b76 89/* We only need stdlib for abort() */
dc355b76 90
158142c2
FB
91/*----------------------------------------------------------------------------
92| Primitive arithmetic functions, including multi-word arithmetic, and
93| division and square root approximations. (Can be specialized to target if
94| desired.)
95*----------------------------------------------------------------------------*/
96#include "softfloat-macros.h"
97
98/*----------------------------------------------------------------------------
99| Functions and definitions to determine: (1) whether tininess for underflow
100| is detected before or after rounding by default, (2) what (if anything)
101| happens when exceptions are raised, (3) how signaling NaNs are distinguished
102| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103| are propagated from function inputs to output. These details are target-
104| specific.
105*----------------------------------------------------------------------------*/
106#include "softfloat-specialize.h"
107
bb4d4bb3
PM
108/*----------------------------------------------------------------------------
109| Returns the fraction bits of the half-precision floating-point value `a'.
110*----------------------------------------------------------------------------*/
111
a49db98d 112static inline uint32_t extractFloat16Frac(float16 a)
bb4d4bb3
PM
113{
114 return float16_val(a) & 0x3ff;
115}
116
117/*----------------------------------------------------------------------------
118| Returns the exponent bits of the half-precision floating-point value `a'.
119*----------------------------------------------------------------------------*/
120
0c48262d 121static inline int extractFloat16Exp(float16 a)
bb4d4bb3
PM
122{
123 return (float16_val(a) >> 10) & 0x1f;
124}
125
126/*----------------------------------------------------------------------------
127| Returns the sign bit of the half-precision floating-point value `a'.
128*----------------------------------------------------------------------------*/
129
a49db98d 130static inline flag extractFloat16Sign(float16 a)
bb4d4bb3
PM
131{
132 return float16_val(a)>>15;
133}
134
158142c2
FB
135/*----------------------------------------------------------------------------
136| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137| and 7, and returns the properly rounded 32-bit integer corresponding to the
138| input. If `zSign' is 1, the input is negated before being converted to an
139| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
140| is simply rounded to an integer, with the inexact exception raised if the
141| input cannot be represented exactly as an integer. However, if the fixed-
142| point input is too large, the invalid exception is raised and the largest
143| positive or negative integer is returned.
144*----------------------------------------------------------------------------*/
145
f4014512 146static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
158142c2 147{
8f506c70 148 int8_t roundingMode;
158142c2 149 flag roundNearestEven;
8f506c70 150 int8_t roundIncrement, roundBits;
760e1416 151 int32_t z;
158142c2 152
a2f2d288 153 roundingMode = status->float_rounding_mode;
158142c2 154 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
155 switch (roundingMode) {
156 case float_round_nearest_even:
f9288a76 157 case float_round_ties_away:
dc355b76
PM
158 roundIncrement = 0x40;
159 break;
160 case float_round_to_zero:
161 roundIncrement = 0;
162 break;
163 case float_round_up:
164 roundIncrement = zSign ? 0 : 0x7f;
165 break;
166 case float_round_down:
167 roundIncrement = zSign ? 0x7f : 0;
168 break;
169 default:
170 abort();
158142c2
FB
171 }
172 roundBits = absZ & 0x7F;
173 absZ = ( absZ + roundIncrement )>>7;
174 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
175 z = absZ;
176 if ( zSign ) z = - z;
177 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
ff32e16e 178 float_raise(float_flag_invalid, status);
bb98fe42 179 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2 180 }
a2f2d288
PM
181 if (roundBits) {
182 status->float_exception_flags |= float_flag_inexact;
183 }
158142c2
FB
184 return z;
185
186}
187
188/*----------------------------------------------------------------------------
189| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190| `absZ1', with binary point between bits 63 and 64 (between the input words),
191| and returns the properly rounded 64-bit integer corresponding to the input.
192| If `zSign' is 1, the input is negated before being converted to an integer.
193| Ordinarily, the fixed-point input is simply rounded to an integer, with
194| the inexact exception raised if the input cannot be represented exactly as
195| an integer. However, if the fixed-point input is too large, the invalid
196| exception is raised and the largest positive or negative integer is
197| returned.
198*----------------------------------------------------------------------------*/
199
f42c2224 200static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
e5a41ffa 201 float_status *status)
158142c2 202{
8f506c70 203 int8_t roundingMode;
158142c2 204 flag roundNearestEven, increment;
760e1416 205 int64_t z;
158142c2 206
a2f2d288 207 roundingMode = status->float_rounding_mode;
158142c2 208 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
209 switch (roundingMode) {
210 case float_round_nearest_even:
f9288a76 211 case float_round_ties_away:
dc355b76
PM
212 increment = ((int64_t) absZ1 < 0);
213 break;
214 case float_round_to_zero:
215 increment = 0;
216 break;
217 case float_round_up:
218 increment = !zSign && absZ1;
219 break;
220 case float_round_down:
221 increment = zSign && absZ1;
222 break;
223 default:
224 abort();
158142c2
FB
225 }
226 if ( increment ) {
227 ++absZ0;
228 if ( absZ0 == 0 ) goto overflow;
bb98fe42 229 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
230 }
231 z = absZ0;
232 if ( zSign ) z = - z;
233 if ( z && ( ( z < 0 ) ^ zSign ) ) {
234 overflow:
ff32e16e 235 float_raise(float_flag_invalid, status);
158142c2 236 return
bb98fe42 237 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
238 : LIT64( 0x7FFFFFFFFFFFFFFF );
239 }
a2f2d288
PM
240 if (absZ1) {
241 status->float_exception_flags |= float_flag_inexact;
242 }
158142c2
FB
243 return z;
244
245}
246
fb3ea83a
TM
247/*----------------------------------------------------------------------------
248| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249| `absZ1', with binary point between bits 63 and 64 (between the input words),
250| and returns the properly rounded 64-bit unsigned integer corresponding to the
251| input. Ordinarily, the fixed-point input is simply rounded to an integer,
252| with the inexact exception raised if the input cannot be represented exactly
253| as an integer. However, if the fixed-point input is too large, the invalid
254| exception is raised and the largest unsigned integer is returned.
255*----------------------------------------------------------------------------*/
256
f42c2224 257static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
e5a41ffa 258 uint64_t absZ1, float_status *status)
fb3ea83a 259{
8f506c70 260 int8_t roundingMode;
fb3ea83a
TM
261 flag roundNearestEven, increment;
262
a2f2d288 263 roundingMode = status->float_rounding_mode;
fb3ea83a 264 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
265 switch (roundingMode) {
266 case float_round_nearest_even:
f9288a76 267 case float_round_ties_away:
dc355b76
PM
268 increment = ((int64_t)absZ1 < 0);
269 break;
270 case float_round_to_zero:
271 increment = 0;
272 break;
273 case float_round_up:
274 increment = !zSign && absZ1;
275 break;
276 case float_round_down:
277 increment = zSign && absZ1;
278 break;
279 default:
280 abort();
fb3ea83a
TM
281 }
282 if (increment) {
283 ++absZ0;
284 if (absZ0 == 0) {
ff32e16e 285 float_raise(float_flag_invalid, status);
fb3ea83a
TM
286 return LIT64(0xFFFFFFFFFFFFFFFF);
287 }
288 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
289 }
290
291 if (zSign && absZ0) {
ff32e16e 292 float_raise(float_flag_invalid, status);
fb3ea83a
TM
293 return 0;
294 }
295
296 if (absZ1) {
a2f2d288 297 status->float_exception_flags |= float_flag_inexact;
fb3ea83a
TM
298 }
299 return absZ0;
300}
301
158142c2
FB
302/*----------------------------------------------------------------------------
303| Returns the fraction bits of the single-precision floating-point value `a'.
304*----------------------------------------------------------------------------*/
305
a49db98d 306static inline uint32_t extractFloat32Frac( float32 a )
158142c2
FB
307{
308
f090c9d4 309 return float32_val(a) & 0x007FFFFF;
158142c2
FB
310
311}
312
313/*----------------------------------------------------------------------------
314| Returns the exponent bits of the single-precision floating-point value `a'.
315*----------------------------------------------------------------------------*/
316
0c48262d 317static inline int extractFloat32Exp(float32 a)
158142c2
FB
318{
319
f090c9d4 320 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
321
322}
323
324/*----------------------------------------------------------------------------
325| Returns the sign bit of the single-precision floating-point value `a'.
326*----------------------------------------------------------------------------*/
327
a49db98d 328static inline flag extractFloat32Sign( float32 a )
158142c2
FB
329{
330
f090c9d4 331 return float32_val(a)>>31;
158142c2
FB
332
333}
334
37d18660
PM
335/*----------------------------------------------------------------------------
336| If `a' is denormal and we are in flush-to-zero mode then set the
337| input-denormal exception and return zero. Otherwise just return the value.
338*----------------------------------------------------------------------------*/
e5a41ffa 339float32 float32_squash_input_denormal(float32 a, float_status *status)
37d18660 340{
a2f2d288 341 if (status->flush_inputs_to_zero) {
37d18660 342 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
ff32e16e 343 float_raise(float_flag_input_denormal, status);
37d18660
PM
344 return make_float32(float32_val(a) & 0x80000000);
345 }
346 }
347 return a;
348}
349
158142c2
FB
350/*----------------------------------------------------------------------------
351| Normalizes the subnormal single-precision floating-point value represented
352| by the denormalized significand `aSig'. The normalized exponent and
353| significand are stored at the locations pointed to by `zExpPtr' and
354| `zSigPtr', respectively.
355*----------------------------------------------------------------------------*/
356
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that moves the leading 1 of aSig up to bit 23. */
    const int8_t lshift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    /* Each position shifted lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - lshift;
}
367
368/*----------------------------------------------------------------------------
369| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370| single-precision floating-point value, returning the result. After being
371| shifted into the proper positions, the three fields are simply added
372| together to form the result. This means that any integer portion of `zSig'
373| will be added into the exponent. Since a properly normalized significand
374| will have an integer portion equal to 1, the `zExp' input should be 1 less
375| than the desired result exponent whenever `zSig' is a complete, normalized
376| significand.
377*----------------------------------------------------------------------------*/
378
0c48262d 379static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
158142c2
FB
380{
381
f090c9d4 382 return make_float32(
bb98fe42 383 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
384
385}
386
387/*----------------------------------------------------------------------------
388| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389| and significand `zSig', and returns the proper single-precision floating-
390| point value corresponding to the abstract input. Ordinarily, the abstract
391| value is simply rounded and packed into the single-precision format, with
392| the inexact exception raised if the abstract input cannot be represented
393| exactly. However, if the abstract value is too large, the overflow and
394| inexact exceptions are raised and an infinity or maximal finite value is
395| returned. If the abstract value is too small, the input value is rounded to
396| a subnormal number, and the underflow and inexact exceptions are raised if
397| the abstract input cannot be represented exactly as a subnormal single-
398| precision floating-point number.
399| The input significand `zSig' has its binary point between bits 30
400| and 29, which is 7 bits to the left of the usual location. This shifted
401| significand must be normalized or smaller. If `zSig' is not normalized,
402| `zExp' must be 0; in that case, the result returned is a subnormal number,
403| and it must not require rounding. In the usual case that `zSig' is
404| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405| The handling of underflow and overflow follows the IEC/IEEE Standard for
406| Binary Floating-Point Arithmetic.
407*----------------------------------------------------------------------------*/
408
0c48262d 409static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 410 float_status *status)
158142c2 411{
8f506c70 412 int8_t roundingMode;
158142c2 413 flag roundNearestEven;
8f506c70 414 int8_t roundIncrement, roundBits;
158142c2
FB
415 flag isTiny;
416
a2f2d288 417 roundingMode = status->float_rounding_mode;
158142c2 418 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
419 switch (roundingMode) {
420 case float_round_nearest_even:
f9288a76 421 case float_round_ties_away:
dc355b76
PM
422 roundIncrement = 0x40;
423 break;
424 case float_round_to_zero:
425 roundIncrement = 0;
426 break;
427 case float_round_up:
428 roundIncrement = zSign ? 0 : 0x7f;
429 break;
430 case float_round_down:
431 roundIncrement = zSign ? 0x7f : 0;
432 break;
433 default:
434 abort();
435 break;
158142c2
FB
436 }
437 roundBits = zSig & 0x7F;
bb98fe42 438 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
439 if ( ( 0xFD < zExp )
440 || ( ( zExp == 0xFD )
bb98fe42 441 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 442 ) {
ff32e16e 443 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 444 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
445 }
446 if ( zExp < 0 ) {
a2f2d288 447 if (status->flush_to_zero) {
ff32e16e 448 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
449 return packFloat32(zSign, 0, 0);
450 }
158142c2 451 isTiny =
a2f2d288
PM
452 (status->float_detect_tininess
453 == float_tininess_before_rounding)
158142c2
FB
454 || ( zExp < -1 )
455 || ( zSig + roundIncrement < 0x80000000 );
456 shift32RightJamming( zSig, - zExp, &zSig );
457 zExp = 0;
458 roundBits = zSig & 0x7F;
ff32e16e
PM
459 if (isTiny && roundBits) {
460 float_raise(float_flag_underflow, status);
461 }
158142c2
FB
462 }
463 }
a2f2d288
PM
464 if (roundBits) {
465 status->float_exception_flags |= float_flag_inexact;
466 }
158142c2
FB
467 zSig = ( zSig + roundIncrement )>>7;
468 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
469 if ( zSig == 0 ) zExp = 0;
470 return packFloat32( zSign, zExp, zSig );
471
472}
473
474/*----------------------------------------------------------------------------
475| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476| and significand `zSig', and returns the proper single-precision floating-
477| point value corresponding to the abstract input. This routine is just like
478| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480| floating-point exponent.
481*----------------------------------------------------------------------------*/
482
483static float32
0c48262d 484 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 485 float_status *status)
158142c2 486{
8f506c70 487 int8_t shiftCount;
158142c2
FB
488
489 shiftCount = countLeadingZeros32( zSig ) - 1;
ff32e16e
PM
490 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
491 status);
158142c2
FB
492
493}
494
495/*----------------------------------------------------------------------------
496| Returns the fraction bits of the double-precision floating-point value `a'.
497*----------------------------------------------------------------------------*/
498
a49db98d 499static inline uint64_t extractFloat64Frac( float64 a )
158142c2
FB
500{
501
f090c9d4 502 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
503
504}
505
506/*----------------------------------------------------------------------------
507| Returns the exponent bits of the double-precision floating-point value `a'.
508*----------------------------------------------------------------------------*/
509
0c48262d 510static inline int extractFloat64Exp(float64 a)
158142c2
FB
511{
512
f090c9d4 513 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
514
515}
516
517/*----------------------------------------------------------------------------
518| Returns the sign bit of the double-precision floating-point value `a'.
519*----------------------------------------------------------------------------*/
520
a49db98d 521static inline flag extractFloat64Sign( float64 a )
158142c2
FB
522{
523
f090c9d4 524 return float64_val(a)>>63;
158142c2
FB
525
526}
527
37d18660
PM
528/*----------------------------------------------------------------------------
529| If `a' is denormal and we are in flush-to-zero mode then set the
530| input-denormal exception and return zero. Otherwise just return the value.
531*----------------------------------------------------------------------------*/
e5a41ffa 532float64 float64_squash_input_denormal(float64 a, float_status *status)
37d18660 533{
a2f2d288 534 if (status->flush_inputs_to_zero) {
37d18660 535 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
ff32e16e 536 float_raise(float_flag_input_denormal, status);
37d18660
PM
537 return make_float64(float64_val(a) & (1ULL << 63));
538 }
539 }
540 return a;
541}
542
158142c2
FB
543/*----------------------------------------------------------------------------
544| Normalizes the subnormal double-precision floating-point value represented
545| by the denormalized significand `aSig'. The normalized exponent and
546| significand are stored at the locations pointed to by `zExpPtr' and
547| `zSigPtr', respectively.
548*----------------------------------------------------------------------------*/
549
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Left shift that moves the leading 1 of aSig up to bit 52. */
    const int8_t lshift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    /* Each position shifted lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - lshift;
}
560
561/*----------------------------------------------------------------------------
562| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563| double-precision floating-point value, returning the result. After being
564| shifted into the proper positions, the three fields are simply added
565| together to form the result. This means that any integer portion of `zSig'
566| will be added into the exponent. Since a properly normalized significand
567| will have an integer portion equal to 1, the `zExp' input should be 1 less
568| than the desired result exponent whenever `zSig' is a complete, normalized
569| significand.
570*----------------------------------------------------------------------------*/
571
0c48262d 572static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
158142c2
FB
573{
574
f090c9d4 575 return make_float64(
bb98fe42 576 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
577
578}
579
580/*----------------------------------------------------------------------------
581| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582| and significand `zSig', and returns the proper double-precision floating-
583| point value corresponding to the abstract input. Ordinarily, the abstract
584| value is simply rounded and packed into the double-precision format, with
585| the inexact exception raised if the abstract input cannot be represented
586| exactly. However, if the abstract value is too large, the overflow and
587| inexact exceptions are raised and an infinity or maximal finite value is
a7d1ac78
PM
588| returned. If the abstract value is too small, the input value is rounded to
589| a subnormal number, and the underflow and inexact exceptions are raised if
590| the abstract input cannot be represented exactly as a subnormal double-
158142c2
FB
591| precision floating-point number.
592| The input significand `zSig' has its binary point between bits 62
593| and 61, which is 10 bits to the left of the usual location. This shifted
594| significand must be normalized or smaller. If `zSig' is not normalized,
595| `zExp' must be 0; in that case, the result returned is a subnormal number,
596| and it must not require rounding. In the usual case that `zSig' is
597| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598| The handling of underflow and overflow follows the IEC/IEEE Standard for
599| Binary Floating-Point Arithmetic.
600*----------------------------------------------------------------------------*/
601
0c48262d 602static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 603 float_status *status)
158142c2 604{
8f506c70 605 int8_t roundingMode;
158142c2 606 flag roundNearestEven;
0c48262d 607 int roundIncrement, roundBits;
158142c2
FB
608 flag isTiny;
609
a2f2d288 610 roundingMode = status->float_rounding_mode;
158142c2 611 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
612 switch (roundingMode) {
613 case float_round_nearest_even:
f9288a76 614 case float_round_ties_away:
dc355b76
PM
615 roundIncrement = 0x200;
616 break;
617 case float_round_to_zero:
618 roundIncrement = 0;
619 break;
620 case float_round_up:
621 roundIncrement = zSign ? 0 : 0x3ff;
622 break;
623 case float_round_down:
624 roundIncrement = zSign ? 0x3ff : 0;
625 break;
9ee6f678
BR
626 case float_round_to_odd:
627 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
628 break;
dc355b76
PM
629 default:
630 abort();
158142c2
FB
631 }
632 roundBits = zSig & 0x3FF;
bb98fe42 633 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
634 if ( ( 0x7FD < zExp )
635 || ( ( zExp == 0x7FD )
bb98fe42 636 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 637 ) {
9ee6f678
BR
638 bool overflow_to_inf = roundingMode != float_round_to_odd &&
639 roundIncrement != 0;
ff32e16e 640 float_raise(float_flag_overflow | float_flag_inexact, status);
9ee6f678 641 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
158142c2
FB
642 }
643 if ( zExp < 0 ) {
a2f2d288 644 if (status->flush_to_zero) {
ff32e16e 645 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
646 return packFloat64(zSign, 0, 0);
647 }
158142c2 648 isTiny =
a2f2d288
PM
649 (status->float_detect_tininess
650 == float_tininess_before_rounding)
158142c2
FB
651 || ( zExp < -1 )
652 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
653 shift64RightJamming( zSig, - zExp, &zSig );
654 zExp = 0;
655 roundBits = zSig & 0x3FF;
ff32e16e
PM
656 if (isTiny && roundBits) {
657 float_raise(float_flag_underflow, status);
658 }
9ee6f678
BR
659 if (roundingMode == float_round_to_odd) {
660 /*
661 * For round-to-odd case, the roundIncrement depends on
662 * zSig which just changed.
663 */
664 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
665 }
158142c2
FB
666 }
667 }
a2f2d288
PM
668 if (roundBits) {
669 status->float_exception_flags |= float_flag_inexact;
670 }
158142c2
FB
671 zSig = ( zSig + roundIncrement )>>10;
672 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
673 if ( zSig == 0 ) zExp = 0;
674 return packFloat64( zSign, zExp, zSig );
675
676}
677
678/*----------------------------------------------------------------------------
679| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
680| and significand `zSig', and returns the proper double-precision floating-
681| point value corresponding to the abstract input. This routine is just like
682| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
683| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
684| floating-point exponent.
685*----------------------------------------------------------------------------*/
686
687static float64
0c48262d 688 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 689 float_status *status)
158142c2 690{
8f506c70 691 int8_t shiftCount;
158142c2
FB
692
693 shiftCount = countLeadingZeros64( zSig ) - 1;
ff32e16e
PM
694 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
695 status);
158142c2
FB
696
697}
698
158142c2
FB
699/*----------------------------------------------------------------------------
700| Returns the fraction bits of the extended double-precision floating-point
701| value `a'.
702*----------------------------------------------------------------------------*/
703
a49db98d 704static inline uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
705{
706
707 return a.low;
708
709}
710
711/*----------------------------------------------------------------------------
712| Returns the exponent bits of the extended double-precision floating-point
713| value `a'.
714*----------------------------------------------------------------------------*/
715
f4014512 716static inline int32_t extractFloatx80Exp( floatx80 a )
158142c2
FB
717{
718
719 return a.high & 0x7FFF;
720
721}
722
723/*----------------------------------------------------------------------------
724| Returns the sign bit of the extended double-precision floating-point value
725| `a'.
726*----------------------------------------------------------------------------*/
727
a49db98d 728static inline flag extractFloatx80Sign( floatx80 a )
158142c2
FB
729{
730
731 return a.high>>15;
732
733}
734
735/*----------------------------------------------------------------------------
736| Normalizes the subnormal extended double-precision floating-point value
737| represented by the denormalized significand `aSig'. The normalized exponent
738| and significand are stored at the locations pointed to by `zExpPtr' and
739| `zSigPtr', respectively.
740*----------------------------------------------------------------------------*/
741
static void
 normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr)
{
    /* Left shift that moves the leading 1 of aSig up to bit 63. */
    const int8_t lshift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << lshift;
    /* Each position shifted lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - lshift;
}
752
753/*----------------------------------------------------------------------------
754| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
755| extended double-precision floating-point value, returning the result.
756*----------------------------------------------------------------------------*/
757
f4014512 758static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
158142c2
FB
759{
760 floatx80 z;
761
762 z.low = zSig;
bb98fe42 763 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
764 return z;
765
766}
767
768/*----------------------------------------------------------------------------
769| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
770| and extended significand formed by the concatenation of `zSig0' and `zSig1',
771| and returns the proper extended double-precision floating-point value
772| corresponding to the abstract input. Ordinarily, the abstract value is
773| rounded and packed into the extended double-precision format, with the
774| inexact exception raised if the abstract input cannot be represented
775| exactly. However, if the abstract value is too large, the overflow and
776| inexact exceptions are raised and an infinity or maximal finite value is
777| returned. If the abstract value is too small, the input value is rounded to
778| a subnormal number, and the underflow and inexact exceptions are raised if
779| the abstract input cannot be represented exactly as a subnormal extended
780| double-precision floating-point number.
781| If `roundingPrecision' is 32 or 64, the result is rounded to the same
782| number of bits as single or double precision, respectively. Otherwise, the
783| result is rounded to the full precision of the extended double-precision
784| format.
785| The input significand must be normalized or smaller. If the input
786| significand is not normalized, `zExp' must be 0; in that case, the result
787| returned is a subnormal number, and it must not require rounding. The
788| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
789| Floating-Point Arithmetic.
790*----------------------------------------------------------------------------*/
791
8f506c70 792static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
f4014512 793 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
e5a41ffa 794 float_status *status)
158142c2 795{
8f506c70 796 int8_t roundingMode;
158142c2 797 flag roundNearestEven, increment, isTiny;
f42c2224 798 int64_t roundIncrement, roundMask, roundBits;
158142c2 799
a2f2d288 800 roundingMode = status->float_rounding_mode;
158142c2
FB
801 roundNearestEven = ( roundingMode == float_round_nearest_even );
802 if ( roundingPrecision == 80 ) goto precision80;
803 if ( roundingPrecision == 64 ) {
804 roundIncrement = LIT64( 0x0000000000000400 );
805 roundMask = LIT64( 0x00000000000007FF );
806 }
807 else if ( roundingPrecision == 32 ) {
808 roundIncrement = LIT64( 0x0000008000000000 );
809 roundMask = LIT64( 0x000000FFFFFFFFFF );
810 }
811 else {
812 goto precision80;
813 }
814 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
815 switch (roundingMode) {
816 case float_round_nearest_even:
f9288a76 817 case float_round_ties_away:
dc355b76
PM
818 break;
819 case float_round_to_zero:
820 roundIncrement = 0;
821 break;
822 case float_round_up:
823 roundIncrement = zSign ? 0 : roundMask;
824 break;
825 case float_round_down:
826 roundIncrement = zSign ? roundMask : 0;
827 break;
828 default:
829 abort();
158142c2
FB
830 }
831 roundBits = zSig0 & roundMask;
bb98fe42 832 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
833 if ( ( 0x7FFE < zExp )
834 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
835 ) {
836 goto overflow;
837 }
838 if ( zExp <= 0 ) {
a2f2d288 839 if (status->flush_to_zero) {
ff32e16e 840 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
841 return packFloatx80(zSign, 0, 0);
842 }
158142c2 843 isTiny =
a2f2d288
PM
844 (status->float_detect_tininess
845 == float_tininess_before_rounding)
158142c2
FB
846 || ( zExp < 0 )
847 || ( zSig0 <= zSig0 + roundIncrement );
848 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
849 zExp = 0;
850 roundBits = zSig0 & roundMask;
ff32e16e
PM
851 if (isTiny && roundBits) {
852 float_raise(float_flag_underflow, status);
853 }
a2f2d288
PM
854 if (roundBits) {
855 status->float_exception_flags |= float_flag_inexact;
856 }
158142c2 857 zSig0 += roundIncrement;
bb98fe42 858 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
859 roundIncrement = roundMask + 1;
860 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
861 roundMask |= roundIncrement;
862 }
863 zSig0 &= ~ roundMask;
864 return packFloatx80( zSign, zExp, zSig0 );
865 }
866 }
a2f2d288
PM
867 if (roundBits) {
868 status->float_exception_flags |= float_flag_inexact;
869 }
158142c2
FB
870 zSig0 += roundIncrement;
871 if ( zSig0 < roundIncrement ) {
872 ++zExp;
873 zSig0 = LIT64( 0x8000000000000000 );
874 }
875 roundIncrement = roundMask + 1;
876 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
877 roundMask |= roundIncrement;
878 }
879 zSig0 &= ~ roundMask;
880 if ( zSig0 == 0 ) zExp = 0;
881 return packFloatx80( zSign, zExp, zSig0 );
882 precision80:
dc355b76
PM
883 switch (roundingMode) {
884 case float_round_nearest_even:
f9288a76 885 case float_round_ties_away:
dc355b76
PM
886 increment = ((int64_t)zSig1 < 0);
887 break;
888 case float_round_to_zero:
889 increment = 0;
890 break;
891 case float_round_up:
892 increment = !zSign && zSig1;
893 break;
894 case float_round_down:
895 increment = zSign && zSig1;
896 break;
897 default:
898 abort();
158142c2 899 }
bb98fe42 900 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
901 if ( ( 0x7FFE < zExp )
902 || ( ( zExp == 0x7FFE )
903 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
904 && increment
905 )
906 ) {
907 roundMask = 0;
908 overflow:
ff32e16e 909 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
910 if ( ( roundingMode == float_round_to_zero )
911 || ( zSign && ( roundingMode == float_round_up ) )
912 || ( ! zSign && ( roundingMode == float_round_down ) )
913 ) {
914 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
915 }
916 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
917 }
918 if ( zExp <= 0 ) {
919 isTiny =
a2f2d288
PM
920 (status->float_detect_tininess
921 == float_tininess_before_rounding)
158142c2
FB
922 || ( zExp < 0 )
923 || ! increment
924 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
925 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
926 zExp = 0;
ff32e16e
PM
927 if (isTiny && zSig1) {
928 float_raise(float_flag_underflow, status);
929 }
a2f2d288
PM
930 if (zSig1) {
931 status->float_exception_flags |= float_flag_inexact;
932 }
dc355b76
PM
933 switch (roundingMode) {
934 case float_round_nearest_even:
f9288a76 935 case float_round_ties_away:
dc355b76
PM
936 increment = ((int64_t)zSig1 < 0);
937 break;
938 case float_round_to_zero:
939 increment = 0;
940 break;
941 case float_round_up:
942 increment = !zSign && zSig1;
943 break;
944 case float_round_down:
945 increment = zSign && zSig1;
946 break;
947 default:
948 abort();
158142c2
FB
949 }
950 if ( increment ) {
951 ++zSig0;
952 zSig0 &=
bb98fe42
AF
953 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
954 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
955 }
956 return packFloatx80( zSign, zExp, zSig0 );
957 }
958 }
a2f2d288
PM
959 if (zSig1) {
960 status->float_exception_flags |= float_flag_inexact;
961 }
158142c2
FB
962 if ( increment ) {
963 ++zSig0;
964 if ( zSig0 == 0 ) {
965 ++zExp;
966 zSig0 = LIT64( 0x8000000000000000 );
967 }
968 else {
bb98fe42 969 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
970 }
971 }
972 else {
973 if ( zSig0 == 0 ) zExp = 0;
974 }
975 return packFloatx80( zSign, zExp, zSig0 );
976
977}
978
979/*----------------------------------------------------------------------------
980| Takes an abstract floating-point value having sign `zSign', exponent
981| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
982| and returns the proper extended double-precision floating-point value
983| corresponding to the abstract input. This routine is just like
984| `roundAndPackFloatx80' except that the input significand does not have to be
985| normalized.
986*----------------------------------------------------------------------------*/
987
8f506c70 988static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
f4014512 989 flag zSign, int32_t zExp,
e5a41ffa
PM
990 uint64_t zSig0, uint64_t zSig1,
991 float_status *status)
158142c2 992{
8f506c70 993 int8_t shiftCount;
158142c2
FB
994
995 if ( zSig0 == 0 ) {
996 zSig0 = zSig1;
997 zSig1 = 0;
998 zExp -= 64;
999 }
1000 shiftCount = countLeadingZeros64( zSig0 );
1001 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1002 zExp -= shiftCount;
ff32e16e
PM
1003 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
1004 zSig0, zSig1, status);
158142c2
FB
1005
1006}
1007
158142c2
FB
1008/*----------------------------------------------------------------------------
1009| Returns the least-significant 64 fraction bits of the quadruple-precision
1010| floating-point value `a'.
1011*----------------------------------------------------------------------------*/
1012
a49db98d 1013static inline uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
1014{
1015
1016 return a.low;
1017
1018}
1019
1020/*----------------------------------------------------------------------------
1021| Returns the most-significant 48 fraction bits of the quadruple-precision
1022| floating-point value `a'.
1023*----------------------------------------------------------------------------*/
1024
a49db98d 1025static inline uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
1026{
1027
1028 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1029
1030}
1031
1032/*----------------------------------------------------------------------------
1033| Returns the exponent bits of the quadruple-precision floating-point value
1034| `a'.
1035*----------------------------------------------------------------------------*/
1036
f4014512 1037static inline int32_t extractFloat128Exp( float128 a )
158142c2
FB
1038{
1039
1040 return ( a.high>>48 ) & 0x7FFF;
1041
1042}
1043
1044/*----------------------------------------------------------------------------
1045| Returns the sign bit of the quadruple-precision floating-point value `a'.
1046*----------------------------------------------------------------------------*/
1047
a49db98d 1048static inline flag extractFloat128Sign( float128 a )
158142c2
FB
1049{
1050
1051 return a.high>>63;
1052
1053}
1054
/*----------------------------------------------------------------------------
| Normalizes a subnormal quadruple-precision value whose denormalized
| significand is the concatenation of `aSig0' and `aSig1'.  The normalized
| exponent is written to `*zExpPtr'; the upper 49 bits of the normalized
| significand go to `*zSig0Ptr' and the lower 64 bits to `*zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr, uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading bit is in the upper word: a plain 128-bit left shift. */
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* Upper word is empty: work from the lower word, which is implicitly
     * moved up by 64 bits (hence the extra -63 in the exponent). */
    shift = countLeadingZeros64(aSig1) - 15;
    if (shift < 0) {
        /* Leading bit ends up in the upper word; the rest stays below. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    } else {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    }
    *zExpPtr = -shift - 63;
}
1095
1096/*----------------------------------------------------------------------------
1097| Packs the sign `zSign', the exponent `zExp', and the significand formed
1098| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1099| floating-point value, returning the result. After being shifted into the
1100| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1101| added together to form the most significant 32 bits of the result. This
1102| means that any integer portion of `zSig0' will be added into the exponent.
1103| Since a properly normalized significand will have an integer portion equal
1104| to 1, the `zExp' input should be 1 less than the desired result exponent
1105| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1106| significand.
1107*----------------------------------------------------------------------------*/
1108
a49db98d 1109static inline float128
f4014512 1110 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
1111{
1112 float128 z;
1113
1114 z.low = zSig1;
bb98fe42 1115 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
1116 return z;
1117
1118}
1119
1120/*----------------------------------------------------------------------------
1121| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1122| and extended significand formed by the concatenation of `zSig0', `zSig1',
1123| and `zSig2', and returns the proper quadruple-precision floating-point value
1124| corresponding to the abstract input. Ordinarily, the abstract value is
1125| simply rounded and packed into the quadruple-precision format, with the
1126| inexact exception raised if the abstract input cannot be represented
1127| exactly. However, if the abstract value is too large, the overflow and
1128| inexact exceptions are raised and an infinity or maximal finite value is
1129| returned. If the abstract value is too small, the input value is rounded to
1130| a subnormal number, and the underflow and inexact exceptions are raised if
1131| the abstract input cannot be represented exactly as a subnormal quadruple-
1132| precision floating-point number.
1133| The input significand must be normalized or smaller. If the input
1134| significand is not normalized, `zExp' must be 0; in that case, the result
1135| returned is a subnormal number, and it must not require rounding. In the
1136| usual case that the input significand is normalized, `zExp' must be 1 less
1137| than the ``true'' floating-point exponent. The handling of underflow and
1138| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139*----------------------------------------------------------------------------*/
1140
f4014512 1141static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
1142 uint64_t zSig0, uint64_t zSig1,
1143 uint64_t zSig2, float_status *status)
158142c2 1144{
8f506c70 1145 int8_t roundingMode;
158142c2
FB
1146 flag roundNearestEven, increment, isTiny;
1147
a2f2d288 1148 roundingMode = status->float_rounding_mode;
158142c2 1149 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1150 switch (roundingMode) {
1151 case float_round_nearest_even:
f9288a76 1152 case float_round_ties_away:
dc355b76
PM
1153 increment = ((int64_t)zSig2 < 0);
1154 break;
1155 case float_round_to_zero:
1156 increment = 0;
1157 break;
1158 case float_round_up:
1159 increment = !zSign && zSig2;
1160 break;
1161 case float_round_down:
1162 increment = zSign && zSig2;
1163 break;
9ee6f678
BR
1164 case float_round_to_odd:
1165 increment = !(zSig1 & 0x1) && zSig2;
1166 break;
dc355b76
PM
1167 default:
1168 abort();
158142c2 1169 }
bb98fe42 1170 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1171 if ( ( 0x7FFD < zExp )
1172 || ( ( zExp == 0x7FFD )
1173 && eq128(
1174 LIT64( 0x0001FFFFFFFFFFFF ),
1175 LIT64( 0xFFFFFFFFFFFFFFFF ),
1176 zSig0,
1177 zSig1
1178 )
1179 && increment
1180 )
1181 ) {
ff32e16e 1182 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
1183 if ( ( roundingMode == float_round_to_zero )
1184 || ( zSign && ( roundingMode == float_round_up ) )
1185 || ( ! zSign && ( roundingMode == float_round_down ) )
9ee6f678 1186 || (roundingMode == float_round_to_odd)
158142c2
FB
1187 ) {
1188 return
1189 packFloat128(
1190 zSign,
1191 0x7FFE,
1192 LIT64( 0x0000FFFFFFFFFFFF ),
1193 LIT64( 0xFFFFFFFFFFFFFFFF )
1194 );
1195 }
1196 return packFloat128( zSign, 0x7FFF, 0, 0 );
1197 }
1198 if ( zExp < 0 ) {
a2f2d288 1199 if (status->flush_to_zero) {
ff32e16e 1200 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1201 return packFloat128(zSign, 0, 0, 0);
1202 }
158142c2 1203 isTiny =
a2f2d288
PM
1204 (status->float_detect_tininess
1205 == float_tininess_before_rounding)
158142c2
FB
1206 || ( zExp < -1 )
1207 || ! increment
1208 || lt128(
1209 zSig0,
1210 zSig1,
1211 LIT64( 0x0001FFFFFFFFFFFF ),
1212 LIT64( 0xFFFFFFFFFFFFFFFF )
1213 );
1214 shift128ExtraRightJamming(
1215 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1216 zExp = 0;
ff32e16e
PM
1217 if (isTiny && zSig2) {
1218 float_raise(float_flag_underflow, status);
1219 }
dc355b76
PM
1220 switch (roundingMode) {
1221 case float_round_nearest_even:
f9288a76 1222 case float_round_ties_away:
dc355b76
PM
1223 increment = ((int64_t)zSig2 < 0);
1224 break;
1225 case float_round_to_zero:
1226 increment = 0;
1227 break;
1228 case float_round_up:
1229 increment = !zSign && zSig2;
1230 break;
1231 case float_round_down:
1232 increment = zSign && zSig2;
1233 break;
9ee6f678
BR
1234 case float_round_to_odd:
1235 increment = !(zSig1 & 0x1) && zSig2;
1236 break;
dc355b76
PM
1237 default:
1238 abort();
158142c2
FB
1239 }
1240 }
1241 }
a2f2d288
PM
1242 if (zSig2) {
1243 status->float_exception_flags |= float_flag_inexact;
1244 }
158142c2
FB
1245 if ( increment ) {
1246 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1247 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1248 }
1249 else {
1250 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1251 }
1252 return packFloat128( zSign, zExp, zSig0, zSig1 );
1253
1254}
1255
1256/*----------------------------------------------------------------------------
1257| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1258| and significand formed by the concatenation of `zSig0' and `zSig1', and
1259| returns the proper quadruple-precision floating-point value corresponding
1260| to the abstract input. This routine is just like `roundAndPackFloat128'
1261| except that the input significand has fewer bits and does not have to be
1262| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1263| point exponent.
1264*----------------------------------------------------------------------------*/
1265
f4014512 1266static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
1267 uint64_t zSig0, uint64_t zSig1,
1268 float_status *status)
158142c2 1269{
8f506c70 1270 int8_t shiftCount;
bb98fe42 1271 uint64_t zSig2;
158142c2
FB
1272
1273 if ( zSig0 == 0 ) {
1274 zSig0 = zSig1;
1275 zSig1 = 0;
1276 zExp -= 64;
1277 }
1278 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1279 if ( 0 <= shiftCount ) {
1280 zSig2 = 0;
1281 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1282 }
1283 else {
1284 shift128ExtraRightJamming(
1285 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1286 }
1287 zExp -= shiftCount;
ff32e16e 1288 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
1289
1290}
1291
158142c2
FB
1292/*----------------------------------------------------------------------------
1293| Returns the result of converting the 32-bit two's complement integer `a'
1294| to the single-precision floating-point format. The conversion is performed
1295| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296*----------------------------------------------------------------------------*/
1297
e5a41ffa 1298float32 int32_to_float32(int32_t a, float_status *status)
158142c2
FB
1299{
1300 flag zSign;
1301
f090c9d4 1302 if ( a == 0 ) return float32_zero;
bb98fe42 1303 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2 1304 zSign = ( a < 0 );
ff32e16e 1305 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
158142c2
FB
1306}
1307
1308/*----------------------------------------------------------------------------
1309| Returns the result of converting the 32-bit two's complement integer `a'
1310| to the double-precision floating-point format. The conversion is performed
1311| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1312*----------------------------------------------------------------------------*/
1313
e5a41ffa 1314float64 int32_to_float64(int32_t a, float_status *status)
158142c2
FB
1315{
1316 flag zSign;
3a87d009 1317 uint32_t absA;
8f506c70 1318 int8_t shiftCount;
bb98fe42 1319 uint64_t zSig;
158142c2 1320
f090c9d4 1321 if ( a == 0 ) return float64_zero;
158142c2
FB
1322 zSign = ( a < 0 );
1323 absA = zSign ? - a : a;
1324 shiftCount = countLeadingZeros32( absA ) + 21;
1325 zSig = absA;
1326 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1327
1328}
1329
158142c2
FB
1330/*----------------------------------------------------------------------------
1331| Returns the result of converting the 32-bit two's complement integer `a'
1332| to the extended double-precision floating-point format. The conversion
1333| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1334| Arithmetic.
1335*----------------------------------------------------------------------------*/
1336
e5a41ffa 1337floatx80 int32_to_floatx80(int32_t a, float_status *status)
158142c2
FB
1338{
1339 flag zSign;
3a87d009 1340 uint32_t absA;
8f506c70 1341 int8_t shiftCount;
bb98fe42 1342 uint64_t zSig;
158142c2
FB
1343
1344 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1345 zSign = ( a < 0 );
1346 absA = zSign ? - a : a;
1347 shiftCount = countLeadingZeros32( absA ) + 32;
1348 zSig = absA;
1349 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1350
1351}
1352
158142c2
FB
1353/*----------------------------------------------------------------------------
1354| Returns the result of converting the 32-bit two's complement integer `a' to
1355| the quadruple-precision floating-point format. The conversion is performed
1356| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1357*----------------------------------------------------------------------------*/
1358
e5a41ffa 1359float128 int32_to_float128(int32_t a, float_status *status)
158142c2
FB
1360{
1361 flag zSign;
3a87d009 1362 uint32_t absA;
8f506c70 1363 int8_t shiftCount;
bb98fe42 1364 uint64_t zSig0;
158142c2
FB
1365
1366 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1367 zSign = ( a < 0 );
1368 absA = zSign ? - a : a;
1369 shiftCount = countLeadingZeros32( absA ) + 17;
1370 zSig0 = absA;
1371 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1372
1373}
1374
158142c2
FB
1375/*----------------------------------------------------------------------------
1376| Returns the result of converting the 64-bit two's complement integer `a'
1377| to the single-precision floating-point format. The conversion is performed
1378| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1379*----------------------------------------------------------------------------*/
1380
e5a41ffa 1381float32 int64_to_float32(int64_t a, float_status *status)
158142c2
FB
1382{
1383 flag zSign;
182f42fd 1384 uint64_t absA;
8f506c70 1385 int8_t shiftCount;
158142c2 1386
f090c9d4 1387 if ( a == 0 ) return float32_zero;
158142c2
FB
1388 zSign = ( a < 0 );
1389 absA = zSign ? - a : a;
1390 shiftCount = countLeadingZeros64( absA ) - 40;
1391 if ( 0 <= shiftCount ) {
1392 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1393 }
1394 else {
1395 shiftCount += 7;
1396 if ( shiftCount < 0 ) {
1397 shift64RightJamming( absA, - shiftCount, &absA );
1398 }
1399 else {
1400 absA <<= shiftCount;
1401 }
ff32e16e 1402 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
158142c2
FB
1403 }
1404
1405}
1406
1407/*----------------------------------------------------------------------------
1408| Returns the result of converting the 64-bit two's complement integer `a'
1409| to the double-precision floating-point format. The conversion is performed
1410| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1411*----------------------------------------------------------------------------*/
1412
e5a41ffa 1413float64 int64_to_float64(int64_t a, float_status *status)
158142c2
FB
1414{
1415 flag zSign;
1416
f090c9d4 1417 if ( a == 0 ) return float64_zero;
bb98fe42 1418 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1419 return packFloat64( 1, 0x43E, 0 );
1420 }
1421 zSign = ( a < 0 );
ff32e16e 1422 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
158142c2
FB
1423}
1424
158142c2
FB
1425/*----------------------------------------------------------------------------
1426| Returns the result of converting the 64-bit two's complement integer `a'
1427| to the extended double-precision floating-point format. The conversion
1428| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1429| Arithmetic.
1430*----------------------------------------------------------------------------*/
1431
e5a41ffa 1432floatx80 int64_to_floatx80(int64_t a, float_status *status)
158142c2
FB
1433{
1434 flag zSign;
182f42fd 1435 uint64_t absA;
8f506c70 1436 int8_t shiftCount;
158142c2
FB
1437
1438 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1439 zSign = ( a < 0 );
1440 absA = zSign ? - a : a;
1441 shiftCount = countLeadingZeros64( absA );
1442 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1443
1444}
1445
158142c2
FB
1446/*----------------------------------------------------------------------------
1447| Returns the result of converting the 64-bit two's complement integer `a' to
1448| the quadruple-precision floating-point format. The conversion is performed
1449| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1450*----------------------------------------------------------------------------*/
1451
e5a41ffa 1452float128 int64_to_float128(int64_t a, float_status *status)
158142c2
FB
1453{
1454 flag zSign;
182f42fd 1455 uint64_t absA;
8f506c70 1456 int8_t shiftCount;
f4014512 1457 int32_t zExp;
bb98fe42 1458 uint64_t zSig0, zSig1;
158142c2
FB
1459
1460 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1461 zSign = ( a < 0 );
1462 absA = zSign ? - a : a;
1463 shiftCount = countLeadingZeros64( absA ) + 49;
1464 zExp = 0x406E - shiftCount;
1465 if ( 64 <= shiftCount ) {
1466 zSig1 = 0;
1467 zSig0 = absA;
1468 shiftCount -= 64;
1469 }
1470 else {
1471 zSig1 = absA;
1472 zSig0 = 0;
1473 }
1474 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1475 return packFloat128( zSign, zExp, zSig0, zSig1 );
1476
1477}
1478
6bb8e0f1
PM
1479/*----------------------------------------------------------------------------
1480| Returns the result of converting the 64-bit unsigned integer `a'
1481| to the single-precision floating-point format. The conversion is performed
1482| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1483*----------------------------------------------------------------------------*/
1484
e5a41ffa 1485float32 uint64_to_float32(uint64_t a, float_status *status)
6bb8e0f1
PM
1486{
1487 int shiftcount;
1488
1489 if (a == 0) {
1490 return float32_zero;
1491 }
1492
1493 /* Determine (left) shift needed to put first set bit into bit posn 23
1494 * (since packFloat32() expects the binary point between bits 23 and 22);
1495 * this is the fast case for smallish numbers.
1496 */
1497 shiftcount = countLeadingZeros64(a) - 40;
1498 if (shiftcount >= 0) {
1499 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1500 }
1501 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1502 * expects the binary point between bits 30 and 29, hence the + 7.
1503 */
1504 shiftcount += 7;
1505 if (shiftcount < 0) {
1506 shift64RightJamming(a, -shiftcount, &a);
1507 } else {
1508 a <<= shiftcount;
1509 }
1510
ff32e16e 1511 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
6bb8e0f1
PM
1512}
1513
1514/*----------------------------------------------------------------------------
1515| Returns the result of converting the 64-bit unsigned integer `a'
1516| to the double-precision floating-point format. The conversion is performed
1517| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518*----------------------------------------------------------------------------*/
1519
e5a41ffa 1520float64 uint64_to_float64(uint64_t a, float_status *status)
6bb8e0f1
PM
1521{
1522 int exp = 0x43C;
1523 int shiftcount;
1524
1525 if (a == 0) {
1526 return float64_zero;
1527 }
1528
1529 shiftcount = countLeadingZeros64(a) - 1;
1530 if (shiftcount < 0) {
1531 shift64RightJamming(a, -shiftcount, &a);
1532 } else {
1533 a <<= shiftcount;
1534 }
ff32e16e 1535 return roundAndPackFloat64(0, exp - shiftcount, a, status);
6bb8e0f1
PM
1536}
1537
1538/*----------------------------------------------------------------------------
1539| Returns the result of converting the 64-bit unsigned integer `a'
1540| to the quadruple-precision floating-point format. The conversion is performed
1541| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1542*----------------------------------------------------------------------------*/
1543
e5a41ffa 1544float128 uint64_to_float128(uint64_t a, float_status *status)
1e397ead
RH
1545{
1546 if (a == 0) {
1547 return float128_zero;
1548 }
ff32e16e 1549 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1e397ead
RH
1550}
1551
158142c2
FB
1552/*----------------------------------------------------------------------------
1553| Returns the result of converting the single-precision floating-point value
1554| `a' to the 32-bit two's complement integer format. The conversion is
1555| performed according to the IEC/IEEE Standard for Binary Floating-Point
1556| Arithmetic---which means in particular that the conversion is rounded
1557| according to the current rounding mode. If `a' is a NaN, the largest
1558| positive integer is returned. Otherwise, if the conversion overflows, the
1559| largest integer with the same sign as `a' is returned.
1560*----------------------------------------------------------------------------*/
1561
f4014512 1562int32_t float32_to_int32(float32 a, float_status *status)
158142c2
FB
1563{
1564 flag aSign;
0c48262d 1565 int aExp;
07d792d2 1566 int shiftCount;
bb98fe42
AF
1567 uint32_t aSig;
1568 uint64_t aSig64;
158142c2 1569
ff32e16e 1570 a = float32_squash_input_denormal(a, status);
158142c2
FB
1571 aSig = extractFloat32Frac( a );
1572 aExp = extractFloat32Exp( a );
1573 aSign = extractFloat32Sign( a );
1574 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1575 if ( aExp ) aSig |= 0x00800000;
1576 shiftCount = 0xAF - aExp;
1577 aSig64 = aSig;
1578 aSig64 <<= 32;
1579 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
ff32e16e 1580 return roundAndPackInt32(aSign, aSig64, status);
158142c2
FB
1581
1582}
1583
1584/*----------------------------------------------------------------------------
1585| Returns the result of converting the single-precision floating-point value
1586| `a' to the 32-bit two's complement integer format. The conversion is
1587| performed according to the IEC/IEEE Standard for Binary Floating-Point
1588| Arithmetic, except that the conversion is always rounded toward zero.
1589| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1590| the conversion overflows, the largest integer with the same sign as `a' is
1591| returned.
1592*----------------------------------------------------------------------------*/
1593
f4014512 1594int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
158142c2
FB
1595{
1596 flag aSign;
0c48262d 1597 int aExp;
07d792d2 1598 int shiftCount;
bb98fe42 1599 uint32_t aSig;
b3a6a2e0 1600 int32_t z;
ff32e16e 1601 a = float32_squash_input_denormal(a, status);
158142c2
FB
1602
1603 aSig = extractFloat32Frac( a );
1604 aExp = extractFloat32Exp( a );
1605 aSign = extractFloat32Sign( a );
1606 shiftCount = aExp - 0x9E;
1607 if ( 0 <= shiftCount ) {
f090c9d4 1608 if ( float32_val(a) != 0xCF000000 ) {
ff32e16e 1609 float_raise(float_flag_invalid, status);
158142c2
FB
1610 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1611 }
bb98fe42 1612 return (int32_t) 0x80000000;
158142c2
FB
1613 }
1614 else if ( aExp <= 0x7E ) {
a2f2d288
PM
1615 if (aExp | aSig) {
1616 status->float_exception_flags |= float_flag_inexact;
1617 }
158142c2
FB
1618 return 0;
1619 }
1620 aSig = ( aSig | 0x00800000 )<<8;
1621 z = aSig>>( - shiftCount );
bb98fe42 1622 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
a2f2d288 1623 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
1624 }
1625 if ( aSign ) z = - z;
1626 return z;
1627
1628}
1629
cbcef455
PM
1630/*----------------------------------------------------------------------------
1631| Returns the result of converting the single-precision floating-point value
1632| `a' to the 16-bit two's complement integer format. The conversion is
1633| performed according to the IEC/IEEE Standard for Binary Floating-Point
1634| Arithmetic, except that the conversion is always rounded toward zero.
1635| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1636| the conversion overflows, the largest integer with the same sign as `a' is
1637| returned.
1638*----------------------------------------------------------------------------*/
1639
0bb721d7 1640int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
1641{
1642 flag aSign;
0c48262d 1643 int aExp;
07d792d2 1644 int shiftCount;
bb98fe42 1645 uint32_t aSig;
f4014512 1646 int32_t z;
cbcef455
PM
1647
1648 aSig = extractFloat32Frac( a );
1649 aExp = extractFloat32Exp( a );
1650 aSign = extractFloat32Sign( a );
1651 shiftCount = aExp - 0x8E;
1652 if ( 0 <= shiftCount ) {
1653 if ( float32_val(a) != 0xC7000000 ) {
ff32e16e 1654 float_raise(float_flag_invalid, status);
cbcef455
PM
1655 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1656 return 0x7FFF;
1657 }
1658 }
bb98fe42 1659 return (int32_t) 0xffff8000;
cbcef455
PM
1660 }
1661 else if ( aExp <= 0x7E ) {
1662 if ( aExp | aSig ) {
a2f2d288 1663 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
1664 }
1665 return 0;
1666 }
1667 shiftCount -= 0x10;
1668 aSig = ( aSig | 0x00800000 )<<8;
1669 z = aSig>>( - shiftCount );
bb98fe42 1670 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
a2f2d288 1671 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
1672 }
1673 if ( aSign ) {
1674 z = - z;
1675 }
1676 return z;
1677
1678}
1679
158142c2
FB
1680/*----------------------------------------------------------------------------
1681| Returns the result of converting the single-precision floating-point value
1682| `a' to the 64-bit two's complement integer format. The conversion is
1683| performed according to the IEC/IEEE Standard for Binary Floating-Point
1684| Arithmetic---which means in particular that the conversion is rounded
1685| according to the current rounding mode. If `a' is a NaN, the largest
1686| positive integer is returned. Otherwise, if the conversion overflows, the
1687| largest integer with the same sign as `a' is returned.
1688*----------------------------------------------------------------------------*/
1689
f42c2224 1690int64_t float32_to_int64(float32 a, float_status *status)
158142c2
FB
1691{
1692 flag aSign;
0c48262d 1693 int aExp;
07d792d2 1694 int shiftCount;
bb98fe42
AF
1695 uint32_t aSig;
1696 uint64_t aSig64, aSigExtra;
ff32e16e 1697 a = float32_squash_input_denormal(a, status);
158142c2
FB
1698
1699 aSig = extractFloat32Frac( a );
1700 aExp = extractFloat32Exp( a );
1701 aSign = extractFloat32Sign( a );
1702 shiftCount = 0xBE - aExp;
1703 if ( shiftCount < 0 ) {
ff32e16e 1704 float_raise(float_flag_invalid, status);
158142c2
FB
1705 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1706 return LIT64( 0x7FFFFFFFFFFFFFFF );
1707 }
bb98fe42 1708 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1709 }
1710 if ( aExp ) aSig |= 0x00800000;
1711 aSig64 = aSig;
1712 aSig64 <<= 40;
1713 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
ff32e16e 1714 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
158142c2
FB
1715
1716}
1717
2f18bbf9
TM
1718/*----------------------------------------------------------------------------
1719| Returns the result of converting the single-precision floating-point value
1720| `a' to the 64-bit unsigned integer format. The conversion is
1721| performed according to the IEC/IEEE Standard for Binary Floating-Point
1722| Arithmetic---which means in particular that the conversion is rounded
1723| according to the current rounding mode. If `a' is a NaN, the largest
1724| unsigned integer is returned. Otherwise, if the conversion overflows, the
1725| largest unsigned integer is returned. If `a' is negative, the result
1726| is rounded and zero is returned; values that do not round to zero will
1727| raise the inexact exception flag.
1728*----------------------------------------------------------------------------*/
1729
182f42fd 1730uint64_t float32_to_uint64(float32 a, float_status *status)
2f18bbf9
TM
1731{
1732 flag aSign;
0c48262d 1733 int aExp;
07d792d2 1734 int shiftCount;
2f18bbf9
TM
1735 uint32_t aSig;
1736 uint64_t aSig64, aSigExtra;
ff32e16e 1737 a = float32_squash_input_denormal(a, status);
2f18bbf9
TM
1738
1739 aSig = extractFloat32Frac(a);
1740 aExp = extractFloat32Exp(a);
1741 aSign = extractFloat32Sign(a);
1742 if ((aSign) && (aExp > 126)) {
ff32e16e 1743 float_raise(float_flag_invalid, status);
2f18bbf9
TM
1744 if (float32_is_any_nan(a)) {
1745 return LIT64(0xFFFFFFFFFFFFFFFF);
1746 } else {
1747 return 0;
1748 }
1749 }
1750 shiftCount = 0xBE - aExp;
1751 if (aExp) {
1752 aSig |= 0x00800000;
1753 }
1754 if (shiftCount < 0) {
ff32e16e 1755 float_raise(float_flag_invalid, status);
2f18bbf9
TM
1756 return LIT64(0xFFFFFFFFFFFFFFFF);
1757 }
1758
1759 aSig64 = aSig;
1760 aSig64 <<= 40;
1761 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
ff32e16e 1762 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
2f18bbf9
TM
1763}
1764
a13d4489
TM
1765/*----------------------------------------------------------------------------
1766| Returns the result of converting the single-precision floating-point value
1767| `a' to the 64-bit unsigned integer format. The conversion is
1768| performed according to the IEC/IEEE Standard for Binary Floating-Point
1769| Arithmetic, except that the conversion is always rounded toward zero. If
1770| `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1771| conversion overflows, the largest unsigned integer is returned. If
1772| `a' is negative, the result is rounded and zero is returned; values that do
1773| not round to zero will raise the inexact flag.
1774*----------------------------------------------------------------------------*/
1775
182f42fd 1776uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
a13d4489 1777{
a2f2d288 1778 signed char current_rounding_mode = status->float_rounding_mode;
ff32e16e
PM
1779 set_float_rounding_mode(float_round_to_zero, status);
1780 int64_t v = float32_to_uint64(a, status);
1781 set_float_rounding_mode(current_rounding_mode, status);
a13d4489
TM
1782 return v;
1783}
1784
158142c2
FB
1785/*----------------------------------------------------------------------------
1786| Returns the result of converting the single-precision floating-point value
1787| `a' to the 64-bit two's complement integer format. The conversion is
1788| performed according to the IEC/IEEE Standard for Binary Floating-Point
1789| Arithmetic, except that the conversion is always rounded toward zero. If
1790| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1791| conversion overflows, the largest integer with the same sign as `a' is
1792| returned.
1793*----------------------------------------------------------------------------*/
1794
f42c2224 1795int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
158142c2
FB
1796{
1797 flag aSign;
0c48262d 1798 int aExp;
07d792d2 1799 int shiftCount;
bb98fe42
AF
1800 uint32_t aSig;
1801 uint64_t aSig64;
f42c2224 1802 int64_t z;
ff32e16e 1803 a = float32_squash_input_denormal(a, status);
158142c2
FB
1804
1805 aSig = extractFloat32Frac( a );
1806 aExp = extractFloat32Exp( a );
1807 aSign = extractFloat32Sign( a );
1808 shiftCount = aExp - 0xBE;
1809 if ( 0 <= shiftCount ) {
f090c9d4 1810 if ( float32_val(a) != 0xDF000000 ) {
ff32e16e 1811 float_raise(float_flag_invalid, status);
158142c2
FB
1812 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1813 return LIT64( 0x7FFFFFFFFFFFFFFF );
1814 }
1815 }
bb98fe42 1816 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1817 }
1818 else if ( aExp <= 0x7E ) {
a2f2d288
PM
1819 if (aExp | aSig) {
1820 status->float_exception_flags |= float_flag_inexact;
1821 }
158142c2
FB
1822 return 0;
1823 }
1824 aSig64 = aSig | 0x00800000;
1825 aSig64 <<= 40;
1826 z = aSig64>>( - shiftCount );
bb98fe42 1827 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
a2f2d288 1828 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
1829 }
1830 if ( aSign ) z = - z;
1831 return z;
1832
1833}
1834
1835/*----------------------------------------------------------------------------
1836| Returns the result of converting the single-precision floating-point value
1837| `a' to the double-precision floating-point format. The conversion is
1838| performed according to the IEC/IEEE Standard for Binary Floating-Point
1839| Arithmetic.
1840*----------------------------------------------------------------------------*/
1841
e5a41ffa 1842float64 float32_to_float64(float32 a, float_status *status)
158142c2
FB
1843{
1844 flag aSign;
0c48262d 1845 int aExp;
bb98fe42 1846 uint32_t aSig;
ff32e16e 1847 a = float32_squash_input_denormal(a, status);
158142c2
FB
1848
1849 aSig = extractFloat32Frac( a );
1850 aExp = extractFloat32Exp( a );
1851 aSign = extractFloat32Sign( a );
1852 if ( aExp == 0xFF ) {
ff32e16e
PM
1853 if (aSig) {
1854 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1855 }
158142c2
FB
1856 return packFloat64( aSign, 0x7FF, 0 );
1857 }
1858 if ( aExp == 0 ) {
1859 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1860 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1861 --aExp;
1862 }
bb98fe42 1863 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1864
1865}
1866
158142c2
FB
1867/*----------------------------------------------------------------------------
1868| Returns the result of converting the single-precision floating-point value
1869| `a' to the extended double-precision floating-point format. The conversion
1870| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1871| Arithmetic.
1872*----------------------------------------------------------------------------*/
1873
e5a41ffa 1874floatx80 float32_to_floatx80(float32 a, float_status *status)
158142c2
FB
1875{
1876 flag aSign;
0c48262d 1877 int aExp;
bb98fe42 1878 uint32_t aSig;
158142c2 1879
ff32e16e 1880 a = float32_squash_input_denormal(a, status);
158142c2
FB
1881 aSig = extractFloat32Frac( a );
1882 aExp = extractFloat32Exp( a );
1883 aSign = extractFloat32Sign( a );
1884 if ( aExp == 0xFF ) {
ff32e16e
PM
1885 if (aSig) {
1886 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1887 }
158142c2
FB
1888 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1889 }
1890 if ( aExp == 0 ) {
1891 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1892 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1893 }
1894 aSig |= 0x00800000;
bb98fe42 1895 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1896
1897}
1898
158142c2
FB
1899/*----------------------------------------------------------------------------
1900| Returns the result of converting the single-precision floating-point value
1901| `a' to the quadruple-precision floating-point format. The conversion is
1902| performed according to the IEC/IEEE Standard for Binary Floating-Point
1903| Arithmetic.
1904*----------------------------------------------------------------------------*/
1905
e5a41ffa 1906float128 float32_to_float128(float32 a, float_status *status)
158142c2
FB
1907{
1908 flag aSign;
0c48262d 1909 int aExp;
bb98fe42 1910 uint32_t aSig;
158142c2 1911
ff32e16e 1912 a = float32_squash_input_denormal(a, status);
158142c2
FB
1913 aSig = extractFloat32Frac( a );
1914 aExp = extractFloat32Exp( a );
1915 aSign = extractFloat32Sign( a );
1916 if ( aExp == 0xFF ) {
ff32e16e
PM
1917 if (aSig) {
1918 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1919 }
158142c2
FB
1920 return packFloat128( aSign, 0x7FFF, 0, 0 );
1921 }
1922 if ( aExp == 0 ) {
1923 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1924 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1925 --aExp;
1926 }
bb98fe42 1927 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1928
1929}
1930
158142c2
FB
1931/*----------------------------------------------------------------------------
1932| Rounds the single-precision floating-point value `a' to an integer, and
1933| returns the result as a single-precision floating-point value. The
1934| operation is performed according to the IEC/IEEE Standard for Binary
1935| Floating-Point Arithmetic.
1936*----------------------------------------------------------------------------*/
1937
e5a41ffa 1938float32 float32_round_to_int(float32 a, float_status *status)
158142c2
FB
1939{
1940 flag aSign;
0c48262d 1941 int aExp;
bb98fe42 1942 uint32_t lastBitMask, roundBitsMask;
bb98fe42 1943 uint32_t z;
ff32e16e 1944 a = float32_squash_input_denormal(a, status);
158142c2
FB
1945
1946 aExp = extractFloat32Exp( a );
1947 if ( 0x96 <= aExp ) {
1948 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
ff32e16e 1949 return propagateFloat32NaN(a, a, status);
158142c2
FB
1950 }
1951 return a;
1952 }
1953 if ( aExp <= 0x7E ) {
bb98fe42 1954 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
a2f2d288 1955 status->float_exception_flags |= float_flag_inexact;
158142c2 1956 aSign = extractFloat32Sign( a );
a2f2d288 1957 switch (status->float_rounding_mode) {
158142c2
FB
1958 case float_round_nearest_even:
1959 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1960 return packFloat32( aSign, 0x7F, 0 );
1961 }
1962 break;
f9288a76
PM
1963 case float_round_ties_away:
1964 if (aExp == 0x7E) {
1965 return packFloat32(aSign, 0x7F, 0);
1966 }
1967 break;
158142c2 1968 case float_round_down:
f090c9d4 1969 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1970 case float_round_up:
f090c9d4 1971 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1972 }
1973 return packFloat32( aSign, 0, 0 );
1974 }
1975 lastBitMask = 1;
1976 lastBitMask <<= 0x96 - aExp;
1977 roundBitsMask = lastBitMask - 1;
f090c9d4 1978 z = float32_val(a);
a2f2d288 1979 switch (status->float_rounding_mode) {
dc355b76 1980 case float_round_nearest_even:
158142c2 1981 z += lastBitMask>>1;
dc355b76
PM
1982 if ((z & roundBitsMask) == 0) {
1983 z &= ~lastBitMask;
1984 }
1985 break;
f9288a76
PM
1986 case float_round_ties_away:
1987 z += lastBitMask >> 1;
1988 break;
dc355b76
PM
1989 case float_round_to_zero:
1990 break;
1991 case float_round_up:
1992 if (!extractFloat32Sign(make_float32(z))) {
1993 z += roundBitsMask;
1994 }
1995 break;
1996 case float_round_down:
1997 if (extractFloat32Sign(make_float32(z))) {
158142c2
FB
1998 z += roundBitsMask;
1999 }
dc355b76
PM
2000 break;
2001 default:
2002 abort();
158142c2
FB
2003 }
2004 z &= ~ roundBitsMask;
a2f2d288
PM
2005 if (z != float32_val(a)) {
2006 status->float_exception_flags |= float_flag_inexact;
2007 }
f090c9d4 2008 return make_float32(z);
158142c2
FB
2009
2010}
2011
2012/*----------------------------------------------------------------------------
2013| Returns the result of adding the absolute values of the single-precision
2014| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2015| before being returned. `zSign' is ignored if the result is a NaN.
2016| The addition is performed according to the IEC/IEEE Standard for Binary
2017| Floating-Point Arithmetic.
2018*----------------------------------------------------------------------------*/
2019
e5a41ffa
PM
2020static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2021 float_status *status)
158142c2 2022{
0c48262d 2023 int aExp, bExp, zExp;
bb98fe42 2024 uint32_t aSig, bSig, zSig;
0c48262d 2025 int expDiff;
158142c2
FB
2026
2027 aSig = extractFloat32Frac( a );
2028 aExp = extractFloat32Exp( a );
2029 bSig = extractFloat32Frac( b );
2030 bExp = extractFloat32Exp( b );
2031 expDiff = aExp - bExp;
2032 aSig <<= 6;
2033 bSig <<= 6;
2034 if ( 0 < expDiff ) {
2035 if ( aExp == 0xFF ) {
ff32e16e
PM
2036 if (aSig) {
2037 return propagateFloat32NaN(a, b, status);
2038 }
158142c2
FB
2039 return a;
2040 }
2041 if ( bExp == 0 ) {
2042 --expDiff;
2043 }
2044 else {
2045 bSig |= 0x20000000;
2046 }
2047 shift32RightJamming( bSig, expDiff, &bSig );
2048 zExp = aExp;
2049 }
2050 else if ( expDiff < 0 ) {
2051 if ( bExp == 0xFF ) {
ff32e16e
PM
2052 if (bSig) {
2053 return propagateFloat32NaN(a, b, status);
2054 }
158142c2
FB
2055 return packFloat32( zSign, 0xFF, 0 );
2056 }
2057 if ( aExp == 0 ) {
2058 ++expDiff;
2059 }
2060 else {
2061 aSig |= 0x20000000;
2062 }
2063 shift32RightJamming( aSig, - expDiff, &aSig );
2064 zExp = bExp;
2065 }
2066 else {
2067 if ( aExp == 0xFF ) {
ff32e16e
PM
2068 if (aSig | bSig) {
2069 return propagateFloat32NaN(a, b, status);
2070 }
158142c2
FB
2071 return a;
2072 }
fe76d976 2073 if ( aExp == 0 ) {
a2f2d288 2074 if (status->flush_to_zero) {
e6afc87f 2075 if (aSig | bSig) {
ff32e16e 2076 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2077 }
2078 return packFloat32(zSign, 0, 0);
2079 }
fe76d976
PB
2080 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2081 }
158142c2
FB
2082 zSig = 0x40000000 + aSig + bSig;
2083 zExp = aExp;
2084 goto roundAndPack;
2085 }
2086 aSig |= 0x20000000;
2087 zSig = ( aSig + bSig )<<1;
2088 --zExp;
bb98fe42 2089 if ( (int32_t) zSig < 0 ) {
158142c2
FB
2090 zSig = aSig + bSig;
2091 ++zExp;
2092 }
2093 roundAndPack:
ff32e16e 2094 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2095
2096}
2097
2098/*----------------------------------------------------------------------------
2099| Returns the result of subtracting the absolute values of the single-
2100| precision floating-point values `a' and `b'. If `zSign' is 1, the
2101| difference is negated before being returned. `zSign' is ignored if the
2102| result is a NaN. The subtraction is performed according to the IEC/IEEE
2103| Standard for Binary Floating-Point Arithmetic.
2104*----------------------------------------------------------------------------*/
2105
e5a41ffa
PM
2106static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2107 float_status *status)
158142c2 2108{
/*
 * Overview: both fractions are shifted left by 7 bits, the operand with the
 * smaller exponent is shifted right to line up with the other, the smaller
 * significand is subtracted from the larger, and the difference is handed
 * to normalizeRoundAndPackFloat32().  The labels below fall through into
 * one another on purpose (bExpBigger -> bBigger, aExpBigger -> aBigger ->
 * normalizeRoundAndPack).
 */
0c48262d 2109 int aExp, bExp, zExp;
bb98fe42 2110 uint32_t aSig, bSig, zSig;
0c48262d 2111 int expDiff;
158142c2
FB
2112
2113 aSig = extractFloat32Frac( a );
2114 aExp = extractFloat32Exp( a );
2115 bSig = extractFloat32Frac( b );
2116 bExp = extractFloat32Exp( b );
2117 expDiff = aExp - bExp;
2118 aSig <<= 7;
2119 bSig <<= 7;
2120 if ( 0 < expDiff ) goto aExpBigger;
2121 if ( expDiff < 0 ) goto bExpBigger;
/* From here on the two exponents are equal. */
2122 if ( aExp == 0xFF ) {
/* Either operand is a NaN: propagate it.  Otherwise both fractions are
 * zero with an all-ones exponent; invalid is raised and the default NaN
 * is returned. */
ff32e16e
PM
2123 if (aSig | bSig) {
2124 return propagateFloat32NaN(a, b, status);
2125 }
2126 float_raise(float_flag_invalid, status);
af39bc8c 2127 return float32_default_nan(status);
158142c2
FB
2128 }
/* Both exponent fields are zero: continue with exponent 1 for both. */
2129 if ( aExp == 0 ) {
2130 aExp = 1;
2131 bExp = 1;
2132 }
2133 if ( bSig < aSig ) goto aBigger;
2134 if ( aSig < bSig ) goto bBigger;
/* Equal significands: the result is a zero whose sign bit is set only when
 * the rounding mode is float_round_down. */
a2f2d288 2135 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
2136 bExpBigger:
2137 if ( bExp == 0xFF ) {
/* b is a NaN (propagate) or an infinity (result is an infinity with the
 * sign opposite to zSign). */
ff32e16e
PM
2138 if (bSig) {
2139 return propagateFloat32NaN(a, b, status);
2140 }
158142c2
FB
2141 return packFloat32( zSign ^ 1, 0xFF, 0 );
2142 }
/* A zero exponent field on a carries no implicit bit; the alignment shift
 * is reduced by one instead. */
2143 if ( aExp == 0 ) {
2144 ++expDiff;
2145 }
2146 else {
2147 aSig |= 0x40000000;
2148 }
2149 shift32RightJamming( aSig, - expDiff, &aSig );
2150 bSig |= 0x40000000;
2151 bBigger:
/* b's significand is the larger one: subtract a from it and flip the
 * result sign. */
2152 zSig = bSig - aSig;
2153 zExp = bExp;
2154 zSign ^= 1;
2155 goto normalizeRoundAndPack;
2156 aExpBigger:
2157 if ( aExp == 0xFF ) {
/* a is a NaN (propagate) or an infinity (returned unchanged). */
ff32e16e
PM
2158 if (aSig) {
2159 return propagateFloat32NaN(a, b, status);
2160 }
158142c2
FB
2161 return a;
2162 }
/* Mirror image of the bExpBigger alignment, with the roles of a and b
 * swapped. */
2163 if ( bExp == 0 ) {
2164 --expDiff;
2165 }
2166 else {
2167 bSig |= 0x40000000;
2168 }
2169 shift32RightJamming( bSig, expDiff, &bSig );
2170 aSig |= 0x40000000;
2171 aBigger:
2172 zSig = aSig - bSig;
2173 zExp = aExp;
2174 normalizeRoundAndPack:
/* The exponent is decremented once before the normalizing pack routine
 * is called. */
2175 --zExp;
ff32e16e 2176 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2177
2178}
2179
2180/*----------------------------------------------------------------------------
2181| Returns the result of adding the single-precision floating-point values `a'
2182| and `b'. The operation is performed according to the IEC/IEEE Standard for
2183| Binary Floating-Point Arithmetic.
2184*----------------------------------------------------------------------------*/
2185
e5a41ffa 2186float32 float32_add(float32 a, float32 b, float_status *status)
158142c2
FB
2187{
2188 flag aSign, bSign;
ff32e16e
PM
2189 a = float32_squash_input_denormal(a, status);
2190 b = float32_squash_input_denormal(b, status);
158142c2
FB
2191
2192 aSign = extractFloat32Sign( a );
2193 bSign = extractFloat32Sign( b );
2194 if ( aSign == bSign ) {
ff32e16e 2195 return addFloat32Sigs(a, b, aSign, status);
158142c2
FB
2196 }
2197 else {
ff32e16e 2198 return subFloat32Sigs(a, b, aSign, status);
158142c2
FB
2199 }
2200
2201}
2202
2203/*----------------------------------------------------------------------------
2204| Returns the result of subtracting the single-precision floating-point values
2205| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2206| for Binary Floating-Point Arithmetic.
2207*----------------------------------------------------------------------------*/
2208
e5a41ffa 2209float32 float32_sub(float32 a, float32 b, float_status *status)
158142c2
FB
2210{
2211 flag aSign, bSign;
ff32e16e
PM
2212 a = float32_squash_input_denormal(a, status);
2213 b = float32_squash_input_denormal(b, status);
158142c2
FB
2214
2215 aSign = extractFloat32Sign( a );
2216 bSign = extractFloat32Sign( b );
2217 if ( aSign == bSign ) {
ff32e16e 2218 return subFloat32Sigs(a, b, aSign, status);
158142c2
FB
2219 }
2220 else {
ff32e16e 2221 return addFloat32Sigs(a, b, aSign, status);
158142c2
FB
2222 }
2223
2224}
2225
2226/*----------------------------------------------------------------------------
2227| Returns the result of multiplying the single-precision floating-point values
2228| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2229| for Binary Floating-Point Arithmetic.
2230*----------------------------------------------------------------------------*/
2231
e5a41ffa 2232float32 float32_mul(float32 a, float32 b, float_status *status)
158142c2
FB
2233{
2234 flag aSign, bSign, zSign;
0c48262d 2235 int aExp, bExp, zExp;
bb98fe42
AF
2236 uint32_t aSig, bSig;
2237 uint64_t zSig64;
2238 uint32_t zSig;
158142c2 2239
ff32e16e
PM
2240 a = float32_squash_input_denormal(a, status);
2241 b = float32_squash_input_denormal(b, status);
37d18660 2242
158142c2
FB
2243 aSig = extractFloat32Frac( a );
2244 aExp = extractFloat32Exp( a );
2245 aSign = extractFloat32Sign( a );
2246 bSig = extractFloat32Frac( b );
2247 bExp = extractFloat32Exp( b );
2248 bSign = extractFloat32Sign( b );
2249 zSign = aSign ^ bSign;
2250 if ( aExp == 0xFF ) {
2251 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 2252 return propagateFloat32NaN(a, b, status);
158142c2
FB
2253 }
2254 if ( ( bExp | bSig ) == 0 ) {
ff32e16e 2255 float_raise(float_flag_invalid, status);
af39bc8c 2256 return float32_default_nan(status);
158142c2
FB
2257 }
2258 return packFloat32( zSign, 0xFF, 0 );
2259 }
2260 if ( bExp == 0xFF ) {
ff32e16e
PM
2261 if (bSig) {
2262 return propagateFloat32NaN(a, b, status);
2263 }
158142c2 2264 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 2265 float_raise(float_flag_invalid, status);
af39bc8c 2266 return float32_default_nan(status);
158142c2
FB
2267 }
2268 return packFloat32( zSign, 0xFF, 0 );
2269 }
2270 if ( aExp == 0 ) {
2271 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2272 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2273 }
2274 if ( bExp == 0 ) {
2275 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2276 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2277 }
2278 zExp = aExp + bExp - 0x7F;
2279 aSig = ( aSig | 0x00800000 )<<7;
2280 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 2281 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 2282 zSig = zSig64;
bb98fe42 2283 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
2284 zSig <<= 1;
2285 --zExp;
2286 }
ff32e16e 2287 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2288
2289}
2290
2291/*----------------------------------------------------------------------------
2292| Returns the result of dividing the single-precision floating-point value `a'
2293| by the corresponding value `b'. The operation is performed according to the
2294| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2295*----------------------------------------------------------------------------*/
2296
e5a41ffa 2297float32 float32_div(float32 a, float32 b, float_status *status)
158142c2
FB
2298{
2299 flag aSign, bSign, zSign;
0c48262d 2300 int aExp, bExp, zExp;
bb98fe42 2301 uint32_t aSig, bSig, zSig;
ff32e16e
PM
2302 a = float32_squash_input_denormal(a, status);
2303 b = float32_squash_input_denormal(b, status);
158142c2
FB
2304
2305 aSig = extractFloat32Frac( a );
2306 aExp = extractFloat32Exp( a );
2307 aSign = extractFloat32Sign( a );
2308 bSig = extractFloat32Frac( b );
2309 bExp = extractFloat32Exp( b );
2310 bSign = extractFloat32Sign( b );
2311 zSign = aSign ^ bSign;
2312 if ( aExp == 0xFF ) {
ff32e16e
PM
2313 if (aSig) {
2314 return propagateFloat32NaN(a, b, status);
2315 }
158142c2 2316 if ( bExp == 0xFF ) {
ff32e16e
PM
2317 if (bSig) {
2318 return propagateFloat32NaN(a, b, status);
2319 }
2320 float_raise(float_flag_invalid, status);
af39bc8c 2321 return float32_default_nan(status);
158142c2
FB
2322 }
2323 return packFloat32( zSign, 0xFF, 0 );
2324 }
2325 if ( bExp == 0xFF ) {
ff32e16e
PM
2326 if (bSig) {
2327 return propagateFloat32NaN(a, b, status);
2328 }
158142c2
FB
2329 return packFloat32( zSign, 0, 0 );
2330 }
2331 if ( bExp == 0 ) {
2332 if ( bSig == 0 ) {
2333 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 2334 float_raise(float_flag_invalid, status);
af39bc8c 2335 return float32_default_nan(status);
158142c2 2336 }
ff32e16e 2337 float_raise(float_flag_divbyzero, status);
158142c2
FB
2338 return packFloat32( zSign, 0xFF, 0 );
2339 }
2340 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2341 }
2342 if ( aExp == 0 ) {
2343 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2344 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2345 }
2346 zExp = aExp - bExp + 0x7D;
2347 aSig = ( aSig | 0x00800000 )<<7;
2348 bSig = ( bSig | 0x00800000 )<<8;
2349 if ( bSig <= ( aSig + aSig ) ) {
2350 aSig >>= 1;
2351 ++zExp;
2352 }
bb98fe42 2353 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2354 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2355 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2 2356 }
ff32e16e 2357 return roundAndPackFloat32(zSign, zExp, zSig, status);
158142c2
FB
2358
2359}
2360
2361/*----------------------------------------------------------------------------
2362| Returns the remainder of the single-precision floating-point value `a'
2363| with respect to the corresponding value `b'. The operation is performed
2364| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2365*----------------------------------------------------------------------------*/
2366
e5a41ffa 2367float32 float32_rem(float32 a, float32 b, float_status *status)
158142c2 2368{
/*
 * Overview: special operands are dealt with first; then a partial
 * remainder of the significands is computed (two different paths depending
 * on the exponent difference), and a final subtract loop produces two
 * candidate remainders from which one is picked.
 */
ed086f3d 2369 flag aSign, zSign;
0c48262d 2370 int aExp, bExp, expDiff;
bb98fe42
AF
2371 uint32_t aSig, bSig;
2372 uint32_t q;
2373 uint64_t aSig64, bSig64, q64;
2374 uint32_t alternateASig;
2375 int32_t sigMean;
ff32e16e
PM
2376 a = float32_squash_input_denormal(a, status);
2377 b = float32_squash_input_denormal(b, status);
158142c2
FB
2378
2379 aSig = extractFloat32Frac( a );
2380 aExp = extractFloat32Exp( a );
2381 aSign = extractFloat32Sign( a );
2382 bSig = extractFloat32Frac( b );
2383 bExp = extractFloat32Exp( b );
158142c2
FB
/* a is a NaN or an infinity: propagate a NaN operand, otherwise raise
 * invalid and return the default NaN. */
2384 if ( aExp == 0xFF ) {
2385 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 2386 return propagateFloat32NaN(a, b, status);
158142c2 2387 }
ff32e16e 2388 float_raise(float_flag_invalid, status);
af39bc8c 2389 return float32_default_nan(status);
158142c2
FB
2390 }
/* b is a NaN (propagate) or an infinity (a is returned unchanged). */
2391 if ( bExp == 0xFF ) {
ff32e16e
PM
2392 if (bSig) {
2393 return propagateFloat32NaN(a, b, status);
2394 }
158142c2
FB
2395 return a;
2396 }
/* b is zero: invalid, default NaN.  A subnormal b is normalized. */
2397 if ( bExp == 0 ) {
2398 if ( bSig == 0 ) {
ff32e16e 2399 float_raise(float_flag_invalid, status);
af39bc8c 2400 return float32_default_nan(status);
158142c2
FB
2401 }
2402 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2403 }
/* a is zero: returned unchanged.  A subnormal a is normalized. */
2404 if ( aExp == 0 ) {
2405 if ( aSig == 0 ) return a;
2406 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2407 }
2408 expDiff = aExp - bExp;
2409 aSig |= 0x00800000;
2410 bSig |= 0x00800000;
/* Exponent difference below 32: the quotient is obtained with a single
 * 64-by-32-bit division. */
2411 if ( expDiff < 32 ) {
2412 aSig <<= 8;
2413 bSig <<= 8;
2414 if ( expDiff < 0 ) {
/* a's exponent is at least two below b's, so a is returned as is. */
2415 if ( expDiff < -1 ) return a;
2416 aSig >>= 1;
2417 }
2418 q = ( bSig <= aSig );
2419 if ( q ) aSig -= bSig;
2420 if ( 0 < expDiff ) {
bb98fe42 2421 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2422 q >>= 32 - expDiff;
2423 bSig >>= 2;
2424 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2425 }
2426 else {
2427 aSig >>= 2;
2428 bSig >>= 2;
2429 }
2430 }
/* Exponent difference of 32 or more: the remainder is reduced in steps
 * using estimateDiv128To64(); each estimate is lowered by 2 (clamped at 0)
 * before use. */
2431 else {
2432 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2433 aSig64 = ( (uint64_t) aSig )<<40;
2434 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2435 expDiff -= 64;
2436 while ( 0 < expDiff ) {
2437 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2438 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2439 aSig64 = - ( ( bSig * q64 )<<38 );
2440 expDiff -= 62;
2441 }
2442 expDiff += 64;
2443 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2444 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2445 q = q64>>( 64 - expDiff );
2446 bSig <<= 6;
2447 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2448 }
/* Subtract bSig until aSig, read as a signed value, becomes negative.
 * alternateASig keeps the last value from before that final subtraction
 * and q counts the subtractions. */
2449 do {
2450 alternateASig = aSig;
2451 ++q;
2452 aSig -= bSig;
bb98fe42 2453 } while ( 0 <= (int32_t) aSig );
158142c2
FB
/* Choose between the two candidates by the sign of their sum; when the
 * sum is zero, the previous candidate is taken if q is odd. */
2454 sigMean = aSig + alternateASig;
2455 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2456 aSig = alternateASig;
2457 }
/* A negative remainder is negated and its sign folded into the result
 * sign; the result is packed with b's exponent. */
bb98fe42 2458 zSign = ( (int32_t) aSig < 0 );
158142c2 2459 if ( zSign ) aSig = - aSig;
ff32e16e 2460 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
2461}
2462
369be8f6
PM
2463/*----------------------------------------------------------------------------
2464| Returns the result of multiplying the single-precision floating-point values
2465| `a' and `b' then adding 'c', with no intermediate rounding step after the
2466| multiplication. The operation is performed according to the IEC/IEEE
2467| Standard for Binary Floating-Point Arithmetic 754-2008.
2468| The flags argument allows the caller to select negation of the
2469| addend, the intermediate product, or the final result. (The difference
2470| between this and having the caller do a separate negation is that negating
2471| externally will flip the sign bit on NaNs.)
2472*----------------------------------------------------------------------------*/
2473
e5a41ffa
PM
2474float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2475 float_status *status)
369be8f6
PM
2476{
2477 flag aSign, bSign, cSign, zSign;
0c48262d 2478 int aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2479 uint32_t aSig, bSig, cSig;
2480 flag pInf, pZero, pSign;
2481 uint64_t pSig64, cSig64, zSig64;
2482 uint32_t pSig;
2483 int shiftcount;
2484 flag signflip, infzero;
2485
ff32e16e
PM
2486 a = float32_squash_input_denormal(a, status);
2487 b = float32_squash_input_denormal(b, status);
2488 c = float32_squash_input_denormal(c, status);
369be8f6
PM
2489 aSig = extractFloat32Frac(a);
2490 aExp = extractFloat32Exp(a);
2491 aSign = extractFloat32Sign(a);
2492 bSig = extractFloat32Frac(b);
2493 bExp = extractFloat32Exp(b);
2494 bSign = extractFloat32Sign(b);
2495 cSig = extractFloat32Frac(c);
2496 cExp = extractFloat32Exp(c);
2497 cSign = extractFloat32Sign(c);
2498
2499 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2500 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2501
2502 /* It is implementation-defined whether the cases of (0,inf,qnan)
2503 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2504 * they return if they do), so we have to hand this information
2505 * off to the target-specific pick-a-NaN routine.
2506 */
2507 if (((aExp == 0xff) && aSig) ||
2508 ((bExp == 0xff) && bSig) ||
2509 ((cExp == 0xff) && cSig)) {
ff32e16e 2510 return propagateFloat32MulAddNaN(a, b, c, infzero, status);
369be8f6
PM
2511 }
2512
2513 if (infzero) {
ff32e16e 2514 float_raise(float_flag_invalid, status);
af39bc8c 2515 return float32_default_nan(status);
369be8f6
PM
2516 }
2517
2518 if (flags & float_muladd_negate_c) {
2519 cSign ^= 1;
2520 }
2521
2522 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2523
2524 /* Work out the sign and type of the product */
2525 pSign = aSign ^ bSign;
2526 if (flags & float_muladd_negate_product) {
2527 pSign ^= 1;
2528 }
2529 pInf = (aExp == 0xff) || (bExp == 0xff);
2530 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2531
2532 if (cExp == 0xff) {
2533 if (pInf && (pSign ^ cSign)) {
2534 /* addition of opposite-signed infinities => InvalidOperation */
ff32e16e 2535 float_raise(float_flag_invalid, status);
af39bc8c 2536 return float32_default_nan(status);
369be8f6
PM
2537 }
2538 /* Otherwise generate an infinity of the same sign */
2539 return packFloat32(cSign ^ signflip, 0xff, 0);
2540 }
2541
2542 if (pInf) {
2543 return packFloat32(pSign ^ signflip, 0xff, 0);
2544 }
2545
2546 if (pZero) {
2547 if (cExp == 0) {
2548 if (cSig == 0) {
2549 /* Adding two exact zeroes */
2550 if (pSign == cSign) {
2551 zSign = pSign;
a2f2d288 2552 } else if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
2553 zSign = 1;
2554 } else {
2555 zSign = 0;
2556 }
2557 return packFloat32(zSign ^ signflip, 0, 0);
2558 }
2559 /* Exact zero plus a denorm */
a2f2d288 2560 if (status->flush_to_zero) {
ff32e16e 2561 float_raise(float_flag_output_denormal, status);
369be8f6
PM
2562 return packFloat32(cSign ^ signflip, 0, 0);
2563 }
2564 }
2565 /* Zero plus something non-zero : just return the something */
67d43538
PM
2566 if (flags & float_muladd_halve_result) {
2567 if (cExp == 0) {
2568 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2569 }
2570 /* Subtract one to halve, and one again because roundAndPackFloat32
2571 * wants one less than the true exponent.
2572 */
2573 cExp -= 2;
2574 cSig = (cSig | 0x00800000) << 7;
ff32e16e 2575 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
67d43538 2576 }
a6e7c184 2577 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2578 }
2579
2580 if (aExp == 0) {
2581 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2582 }
2583 if (bExp == 0) {
2584 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2585 }
2586
2587 /* Calculate the actual result a * b + c */
2588
2589 /* Multiply first; this is easy. */
2590 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2591 * because we want the true exponent, not the "one-less-than"
2592 * flavour that roundAndPackFloat32() takes.
2593 */
2594 pExp = aExp + bExp - 0x7e;
2595 aSig = (aSig | 0x00800000) << 7;
2596 bSig = (bSig | 0x00800000) << 8;
2597 pSig64 = (uint64_t)aSig * bSig;
2598 if ((int64_t)(pSig64 << 1) >= 0) {
2599 pSig64 <<= 1;
2600 pExp--;
2601 }
2602
2603 zSign = pSign ^ signflip;
2604
2605 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2606 * position 62.
2607 */
2608 if (cExp == 0) {
2609 if (!cSig) {
2610 /* Throw out the special case of c being an exact zero now */
2611 shift64RightJamming(pSig64, 32, &pSig64);
2612 pSig = pSig64;
67d43538
PM
2613 if (flags & float_muladd_halve_result) {
2614 pExp--;
2615 }
369be8f6 2616 return roundAndPackFloat32(zSign, pExp - 1,
ff32e16e 2617 pSig, status);
369be8f6
PM
2618 }
2619 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2620 }
2621
2622 cSig64 = (uint64_t)cSig << (62 - 23);
2623 cSig64 |= LIT64(0x4000000000000000);
2624 expDiff = pExp - cExp;
2625
2626 if (pSign == cSign) {
2627 /* Addition */
2628 if (expDiff > 0) {
2629 /* scale c to match p */
2630 shift64RightJamming(cSig64, expDiff, &cSig64);
2631 zExp = pExp;
2632 } else if (expDiff < 0) {
2633 /* scale p to match c */
2634 shift64RightJamming(pSig64, -expDiff, &pSig64);
2635 zExp = cExp;
2636 } else {
2637 /* no scaling needed */
2638 zExp = cExp;
2639 }
2640 /* Add significands and make sure explicit bit ends up in posn 62 */
2641 zSig64 = pSig64 + cSig64;
2642 if ((int64_t)zSig64 < 0) {
2643 shift64RightJamming(zSig64, 1, &zSig64);
2644 } else {
2645 zExp--;
2646 }
2647 } else {
2648 /* Subtraction */
2649 if (expDiff > 0) {
2650 shift64RightJamming(cSig64, expDiff, &cSig64);
2651 zSig64 = pSig64 - cSig64;
2652 zExp = pExp;
2653 } else if (expDiff < 0) {
2654 shift64RightJamming(pSig64, -expDiff, &pSig64);
2655 zSig64 = cSig64 - pSig64;
2656 zExp = cExp;
2657 zSign ^= 1;
2658 } else {
2659 zExp = pExp;
2660 if (cSig64 < pSig64) {
2661 zSig64 = pSig64 - cSig64;
2662 } else if (pSig64 < cSig64) {
2663 zSig64 = cSig64 - pSig64;
2664 zSign ^= 1;
2665 } else {
2666 /* Exact zero */
2667 zSign = signflip;
a2f2d288 2668 if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
2669 zSign ^= 1;
2670 }
2671 return packFloat32(zSign, 0, 0);
2672 }
2673 }
2674 --zExp;
2675 /* Normalize to put the explicit bit back into bit 62. */
2676 shiftcount = countLeadingZeros64(zSig64) - 1;
2677 zSig64 <<= shiftcount;
2678 zExp -= shiftcount;
2679 }
67d43538
PM
2680 if (flags & float_muladd_halve_result) {
2681 zExp--;
2682 }
2683
369be8f6 2684 shift64RightJamming(zSig64, 32, &zSig64);
ff32e16e 2685 return roundAndPackFloat32(zSign, zExp, zSig64, status);
369be8f6
PM
2686}
2687
2688
158142c2
FB
2689/*----------------------------------------------------------------------------
2690| Returns the square root of the single-precision floating-point value `a'.
2691| The operation is performed according to the IEC/IEEE Standard for Binary
2692| Floating-Point Arithmetic.
2693*----------------------------------------------------------------------------*/
2694
e5a41ffa 2695float32 float32_sqrt(float32 a, float_status *status)
158142c2
FB
2696{
2697 flag aSign;
0c48262d 2698 int aExp, zExp;
bb98fe42
AF
2699 uint32_t aSig, zSig;
2700 uint64_t rem, term;
ff32e16e 2701 a = float32_squash_input_denormal(a, status);
158142c2
FB
2702
2703 aSig = extractFloat32Frac( a );
2704 aExp = extractFloat32Exp( a );
2705 aSign = extractFloat32Sign( a );
2706 if ( aExp == 0xFF ) {
ff32e16e
PM
2707 if (aSig) {
2708 return propagateFloat32NaN(a, float32_zero, status);
2709 }
158142c2 2710 if ( ! aSign ) return a;
ff32e16e 2711 float_raise(float_flag_invalid, status);
af39bc8c 2712 return float32_default_nan(status);
158142c2
FB
2713 }
2714 if ( aSign ) {
2715 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 2716 float_raise(float_flag_invalid, status);
af39bc8c 2717 return float32_default_nan(status);
158142c2
FB
2718 }
2719 if ( aExp == 0 ) {
f090c9d4 2720 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2721 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2722 }
2723 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2724 aSig = ( aSig | 0x00800000 )<<8;
2725 zSig = estimateSqrt32( aExp, aSig ) + 2;
2726 if ( ( zSig & 0x7F ) <= 5 ) {
2727 if ( zSig < 2 ) {
2728 zSig = 0x7FFFFFFF;
2729 goto roundAndPack;
2730 }
2731 aSig >>= aExp & 1;
bb98fe42
AF
2732 term = ( (uint64_t) zSig ) * zSig;
2733 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2734 while ( (int64_t) rem < 0 ) {
158142c2 2735 --zSig;
bb98fe42 2736 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2737 }
2738 zSig |= ( rem != 0 );
2739 }
2740 shift32RightJamming( zSig, 1, &zSig );
2741 roundAndPack:
ff32e16e 2742 return roundAndPackFloat32(0, zExp, zSig, status);
158142c2
FB
2743
2744}
2745
8229c991
AJ
2746/*----------------------------------------------------------------------------
2747| Returns the binary exponential of the single-precision floating-point value
2748| `a'. The operation is performed according to the IEC/IEEE Standard for
2749| Binary Floating-Point Arithmetic.
2750|
2751| Uses the following identities:
2752|
2753| 1. -------------------------------------------------------------------------
2754| x x*ln(2)
2755| 2 = e
2756|
2757| 2. -------------------------------------------------------------------------
2758| 2 3 4 5 n
2759| x x x x x x x
2760| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2761| 1! 2! 3! 4! 5! n!
2762*----------------------------------------------------------------------------*/
2763
2764static const float64 float32_exp2_coefficients[15] =
2765{
d5138cf4
PM
2766 const_float64( 0x3ff0000000000000ll ), /* 1 */
2767 const_float64( 0x3fe0000000000000ll ), /* 2 */
2768 const_float64( 0x3fc5555555555555ll ), /* 3 */
2769 const_float64( 0x3fa5555555555555ll ), /* 4 */
2770 const_float64( 0x3f81111111111111ll ), /* 5 */
2771 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2772 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2773 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2774 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2775 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2776 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2777 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2778 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2779 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2780 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2781};
2782
e5a41ffa 2783float32 float32_exp2(float32 a, float_status *status)
8229c991
AJ
2784{
2785 flag aSign;
0c48262d 2786 int aExp;
bb98fe42 2787 uint32_t aSig;
8229c991
AJ
2788 float64 r, x, xn;
2789 int i;
ff32e16e 2790 a = float32_squash_input_denormal(a, status);
8229c991
AJ
2791
2792 aSig = extractFloat32Frac( a );
2793 aExp = extractFloat32Exp( a );
2794 aSign = extractFloat32Sign( a );
2795
2796 if ( aExp == 0xFF) {
ff32e16e
PM
2797 if (aSig) {
2798 return propagateFloat32NaN(a, float32_zero, status);
2799 }
8229c991
AJ
2800 return (aSign) ? float32_zero : a;
2801 }
2802 if (aExp == 0) {
2803 if (aSig == 0) return float32_one;
2804 }
2805
ff32e16e 2806 float_raise(float_flag_inexact, status);
8229c991
AJ
2807
2808 /* ******************************* */
2809 /* using float64 for approximation */
2810 /* ******************************* */
ff32e16e
PM
2811 x = float32_to_float64(a, status);
2812 x = float64_mul(x, float64_ln2, status);
8229c991
AJ
2813
2814 xn = x;
2815 r = float64_one;
2816 for (i = 0 ; i < 15 ; i++) {
2817 float64 f;
2818
ff32e16e
PM
2819 f = float64_mul(xn, float32_exp2_coefficients[i], status);
2820 r = float64_add(r, f, status);
8229c991 2821
ff32e16e 2822 xn = float64_mul(xn, x, status);
8229c991
AJ
2823 }
2824
2825 return float64_to_float32(r, status);
2826}
2827
374dfc33
AJ
2828/*----------------------------------------------------------------------------
2829| Returns the binary log of the single-precision floating-point value `a'.
2830| The operation is performed according to the IEC/IEEE Standard for Binary
2831| Floating-Point Arithmetic.
2832*----------------------------------------------------------------------------*/
e5a41ffa 2833float32 float32_log2(float32 a, float_status *status)
374dfc33
AJ
2834{
2835 flag aSign, zSign;
0c48262d 2836 int aExp;
bb98fe42 2837 uint32_t aSig, zSig, i;
374dfc33 2838
ff32e16e 2839 a = float32_squash_input_denormal(a, status);
374dfc33
AJ
2840 aSig = extractFloat32Frac( a );
2841 aExp = extractFloat32Exp( a );
2842 aSign = extractFloat32Sign( a );
2843
2844 if ( aExp == 0 ) {
2845 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2846 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2847 }
2848 if ( aSign ) {
ff32e16e 2849 float_raise(float_flag_invalid, status);
af39bc8c 2850 return float32_default_nan(status);
374dfc33
AJ
2851 }
2852 if ( aExp == 0xFF ) {
ff32e16e
PM
2853 if (aSig) {
2854 return propagateFloat32NaN(a, float32_zero, status);
2855 }
374dfc33
AJ
2856 return a;
2857 }
2858
2859 aExp -= 0x7F;
2860 aSig |= 0x00800000;
2861 zSign = aExp < 0;
2862 zSig = aExp << 23;
2863
2864 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2865 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2866 if ( aSig & 0x01000000 ) {
2867 aSig >>= 1;
2868 zSig |= i;
2869 }
2870 }
2871
2872 if ( zSign )
2873 zSig = -zSig;
2874
ff32e16e 2875 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
374dfc33
AJ
2876}
2877
158142c2
FB
2878/*----------------------------------------------------------------------------
2879| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2880| the corresponding value `b', and 0 otherwise. The invalid exception is
2881| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2882| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883*----------------------------------------------------------------------------*/
2884
e5a41ffa 2885int float32_eq(float32 a, float32 b, float_status *status)
158142c2 2886{
b689362d 2887 uint32_t av, bv;
ff32e16e
PM
2888 a = float32_squash_input_denormal(a, status);
2889 b = float32_squash_input_denormal(b, status);
158142c2
FB
2890
2891 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2892 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2893 ) {
ff32e16e 2894 float_raise(float_flag_invalid, status);
158142c2
FB
2895 return 0;
2896 }
b689362d
AJ
2897 av = float32_val(a);
2898 bv = float32_val(b);
2899 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2900}
2901
2902/*----------------------------------------------------------------------------
2903| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2904| or equal to the corresponding value `b', and 0 otherwise. The invalid
2905| exception is raised if either operand is a NaN. The comparison is performed
2906| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2907*----------------------------------------------------------------------------*/
2908
e5a41ffa 2909int float32_le(float32 a, float32 b, float_status *status)
158142c2
FB
2910{
2911 flag aSign, bSign;
bb98fe42 2912 uint32_t av, bv;
ff32e16e
PM
2913 a = float32_squash_input_denormal(a, status);
2914 b = float32_squash_input_denormal(b, status);
158142c2
FB
2915
2916 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2917 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2918 ) {
ff32e16e 2919 float_raise(float_flag_invalid, status);
158142c2
FB
2920 return 0;
2921 }
2922 aSign = extractFloat32Sign( a );
2923 bSign = extractFloat32Sign( b );
f090c9d4
PB
2924 av = float32_val(a);
2925 bv = float32_val(b);
bb98fe42 2926 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2927 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2928
2929}
2930
2931/*----------------------------------------------------------------------------
2932| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2933| the corresponding value `b', and 0 otherwise. The invalid exception is
2934| raised if either operand is a NaN. The comparison is performed according
2935| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2936*----------------------------------------------------------------------------*/
2937
e5a41ffa 2938int float32_lt(float32 a, float32 b, float_status *status)
158142c2
FB
2939{
2940 flag aSign, bSign;
bb98fe42 2941 uint32_t av, bv;
ff32e16e
PM
2942 a = float32_squash_input_denormal(a, status);
2943 b = float32_squash_input_denormal(b, status);
158142c2
FB
2944
2945 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2946 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2947 ) {
ff32e16e 2948 float_raise(float_flag_invalid, status);
158142c2
FB
2949 return 0;
2950 }
2951 aSign = extractFloat32Sign( a );
2952 bSign = extractFloat32Sign( b );
f090c9d4
PB
2953 av = float32_val(a);
2954 bv = float32_val(b);
bb98fe42 2955 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2956 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2957
2958}
2959
67b7861d
AJ
2960/*----------------------------------------------------------------------------
2961| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2962| be compared, and 0 otherwise. The invalid exception is raised if either
2963| operand is a NaN. The comparison is performed according to the IEC/IEEE
2964| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2965*----------------------------------------------------------------------------*/
2966
e5a41ffa 2967int float32_unordered(float32 a, float32 b, float_status *status)
67b7861d 2968{
ff32e16e
PM
2969 a = float32_squash_input_denormal(a, status);
2970 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
2971
2972 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2973 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2974 ) {
ff32e16e 2975 float_raise(float_flag_invalid, status);
67b7861d
AJ
2976 return 1;
2977 }
2978 return 0;
2979}
b689362d 2980
158142c2
FB
2981/*----------------------------------------------------------------------------
2982| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2983| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2984| exception. The comparison is performed according to the IEC/IEEE Standard
2985| for Binary Floating-Point Arithmetic.
158142c2
FB
2986*----------------------------------------------------------------------------*/
2987
e5a41ffa 2988int float32_eq_quiet(float32 a, float32 b, float_status *status)
158142c2 2989{
ff32e16e
PM
2990 a = float32_squash_input_denormal(a, status);
2991 b = float32_squash_input_denormal(b, status);
158142c2
FB
2992
2993 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2994 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2995 ) {
af39bc8c
AM
2996 if (float32_is_signaling_nan(a, status)
2997 || float32_is_signaling_nan(b, status)) {
ff32e16e 2998 float_raise(float_flag_invalid, status);
b689362d 2999 }
158142c2
FB
3000 return 0;
3001 }
b689362d
AJ
3002 return ( float32_val(a) == float32_val(b) ) ||
3003 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
3004}
3005
3006/*----------------------------------------------------------------------------
3007| Returns 1 if the single-precision floating-point value `a' is less than or
3008| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3009| cause an exception. Otherwise, the comparison is performed according to the
3010| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011*----------------------------------------------------------------------------*/
3012
e5a41ffa 3013int float32_le_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
3014{
3015 flag aSign, bSign;
bb98fe42 3016 uint32_t av, bv;
ff32e16e
PM
3017 a = float32_squash_input_denormal(a, status);
3018 b = float32_squash_input_denormal(b, status);
158142c2
FB
3019
3020 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3021 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3022 ) {
af39bc8c
AM
3023 if (float32_is_signaling_nan(a, status)
3024 || float32_is_signaling_nan(b, status)) {
ff32e16e 3025 float_raise(float_flag_invalid, status);
158142c2
FB
3026 }
3027 return 0;
3028 }
3029 aSign = extractFloat32Sign( a );
3030 bSign = extractFloat32Sign( b );
f090c9d4
PB
3031 av = float32_val(a);
3032 bv = float32_val(b);
bb98fe42 3033 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3034 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3035
3036}
3037
3038/*----------------------------------------------------------------------------
3039| Returns 1 if the single-precision floating-point value `a' is less than
3040| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3041| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3042| Standard for Binary Floating-Point Arithmetic.
3043*----------------------------------------------------------------------------*/
3044
e5a41ffa 3045int float32_lt_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
3046{
3047 flag aSign, bSign;
bb98fe42 3048 uint32_t av, bv;
ff32e16e
PM
3049 a = float32_squash_input_denormal(a, status);
3050 b = float32_squash_input_denormal(b, status);
158142c2
FB
3051
3052 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3053 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3054 ) {
af39bc8c
AM
3055 if (float32_is_signaling_nan(a, status)
3056 || float32_is_signaling_nan(b, status)) {
ff32e16e 3057 float_raise(float_flag_invalid, status);
158142c2
FB
3058 }
3059 return 0;
3060 }
3061 aSign = extractFloat32Sign( a );
3062 bSign = extractFloat32Sign( b );
f090c9d4
PB
3063 av = float32_val(a);
3064 bv = float32_val(b);
bb98fe42 3065 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3066 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3067
3068}
3069
67b7861d
AJ
3070/*----------------------------------------------------------------------------
3071| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3072| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3073| comparison is performed according to the IEC/IEEE Standard for Binary
3074| Floating-Point Arithmetic.
3075*----------------------------------------------------------------------------*/
3076
e5a41ffa 3077int float32_unordered_quiet(float32 a, float32 b, float_status *status)
67b7861d 3078{
ff32e16e
PM
3079 a = float32_squash_input_denormal(a, status);
3080 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
3081
3082 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3083 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3084 ) {
af39bc8c
AM
3085 if (float32_is_signaling_nan(a, status)
3086 || float32_is_signaling_nan(b, status)) {
ff32e16e 3087 float_raise(float_flag_invalid, status);
67b7861d
AJ
3088 }
3089 return 1;
3090 }
3091 return 0;
3092}
3093
158142c2
FB
3094/*----------------------------------------------------------------------------
3095| Returns the result of converting the double-precision floating-point value
3096| `a' to the 32-bit two's complement integer format. The conversion is
3097| performed according to the IEC/IEEE Standard for Binary Floating-Point
3098| Arithmetic---which means in particular that the conversion is rounded
3099| according to the current rounding mode. If `a' is a NaN, the largest
3100| positive integer is returned. Otherwise, if the conversion overflows, the
3101| largest integer with the same sign as `a' is returned.
3102*----------------------------------------------------------------------------*/
3103
f4014512 3104int32_t float64_to_int32(float64 a, float_status *status)
158142c2
FB
3105{
3106 flag aSign;
0c48262d 3107 int aExp;
07d792d2 3108 int shiftCount;
bb98fe42 3109 uint64_t aSig;
ff32e16e 3110 a = float64_squash_input_denormal(a, status);
158142c2
FB
3111
3112 aSig = extractFloat64Frac( a );
3113 aExp = extractFloat64Exp( a );
3114 aSign = extractFloat64Sign( a );
3115 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3116 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3117 shiftCount = 0x42C - aExp;
3118 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 3119 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
3120
3121}
3122
3123/*----------------------------------------------------------------------------
3124| Returns the result of converting the double-precision floating-point value
3125| `a' to the 32-bit two's complement integer format. The conversion is
3126| performed according to the IEC/IEEE Standard for Binary Floating-Point
3127| Arithmetic, except that the conversion is always rounded toward zero.
3128| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3129| the conversion overflows, the largest integer with the same sign as `a' is
3130| returned.
3131*----------------------------------------------------------------------------*/
3132
f4014512 3133int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
158142c2
FB
3134{
3135 flag aSign;
0c48262d 3136 int aExp;
07d792d2 3137 int shiftCount;
bb98fe42 3138 uint64_t aSig, savedASig;
b3a6a2e0 3139 int32_t z;
ff32e16e 3140 a = float64_squash_input_denormal(a, status);
158142c2
FB
3141
3142 aSig = extractFloat64Frac( a );
3143 aExp = extractFloat64Exp( a );
3144 aSign = extractFloat64Sign( a );
3145 if ( 0x41E < aExp ) {
3146 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3147 goto invalid;
3148 }
3149 else if ( aExp < 0x3FF ) {
a2f2d288
PM
3150 if (aExp || aSig) {
3151 status->float_exception_flags |= float_flag_inexact;
3152 }
158142c2
FB
3153 return 0;
3154 }
3155 aSig |= LIT64( 0x0010000000000000 );
3156 shiftCount = 0x433 - aExp;
3157 savedASig = aSig;
3158 aSig >>= shiftCount;
3159 z = aSig;
3160 if ( aSign ) z = - z;
3161 if ( ( z < 0 ) ^ aSign ) {
3162 invalid:
ff32e16e 3163 float_raise(float_flag_invalid, status);
bb98fe42 3164 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
3165 }
3166 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 3167 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
3168 }
3169 return z;
3170
3171}
3172
cbcef455
PM
3173/*----------------------------------------------------------------------------
3174| Returns the result of converting the double-precision floating-point value
3175| `a' to the 16-bit two's complement integer format. The conversion is
3176| performed according to the IEC/IEEE Standard for Binary Floating-Point
3177| Arithmetic, except that the conversion is always rounded toward zero.
3178| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3179| the conversion overflows, the largest integer with the same sign as `a' is
3180| returned.
3181*----------------------------------------------------------------------------*/
3182
0bb721d7 3183int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
3184{
3185 flag aSign;
0c48262d 3186 int aExp;
07d792d2 3187 int shiftCount;
bb98fe42 3188 uint64_t aSig, savedASig;
f4014512 3189 int32_t z;
cbcef455
PM
3190
3191 aSig = extractFloat64Frac( a );
3192 aExp = extractFloat64Exp( a );
3193 aSign = extractFloat64Sign( a );
3194 if ( 0x40E < aExp ) {
3195 if ( ( aExp == 0x7FF ) && aSig ) {
3196 aSign = 0;
3197 }
3198 goto invalid;
3199 }
3200 else if ( aExp < 0x3FF ) {
3201 if ( aExp || aSig ) {
a2f2d288 3202 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
3203 }
3204 return 0;
3205 }
3206 aSig |= LIT64( 0x0010000000000000 );
3207 shiftCount = 0x433 - aExp;
3208 savedASig = aSig;
3209 aSig >>= shiftCount;
3210 z = aSig;
3211 if ( aSign ) {
3212 z = - z;
3213 }
3214 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3215 invalid:
ff32e16e 3216 float_raise(float_flag_invalid, status);
bb98fe42 3217 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
3218 }
3219 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 3220 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
3221 }
3222 return z;
3223}
3224
158142c2
FB
3225/*----------------------------------------------------------------------------
3226| Returns the result of converting the double-precision floating-point value
3227| `a' to the 64-bit two's complement integer format. The conversion is
3228| performed according to the IEC/IEEE Standard for Binary Floating-Point
3229| Arithmetic---which means in particular that the conversion is rounded
3230| according to the current rounding mode. If `a' is a NaN, the largest
3231| positive integer is returned. Otherwise, if the conversion overflows, the
3232| largest integer with the same sign as `a' is returned.
3233*----------------------------------------------------------------------------*/
3234
f42c2224 3235int64_t float64_to_int64(float64 a, float_status *status)
158142c2
FB
3236{
3237 flag aSign;
0c48262d 3238 int aExp;
07d792d2 3239 int shiftCount;
bb98fe42 3240 uint64_t aSig, aSigExtra;
ff32e16e 3241 a = float64_squash_input_denormal(a, status);
158142c2
FB
3242
3243 aSig = extractFloat64Frac( a );
3244 aExp = extractFloat64Exp( a );
3245 aSign = extractFloat64Sign( a );
3246 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3247 shiftCount = 0x433 - aExp;
3248 if ( shiftCount <= 0 ) {
3249 if ( 0x43E < aExp ) {
ff32e16e 3250 float_raise(float_flag_invalid, status);
158142c2
FB
3251 if ( ! aSign
3252 || ( ( aExp == 0x7FF )
3253 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3254 ) {
3255 return LIT64( 0x7FFFFFFFFFFFFFFF );
3256 }
bb98fe42 3257 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3258 }
3259 aSigExtra = 0;
3260 aSig <<= - shiftCount;
3261 }
3262 else {
3263 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3264 }
ff32e16e 3265 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
3266
3267}
3268
3269/*----------------------------------------------------------------------------
3270| Returns the result of converting the double-precision floating-point value
3271| `a' to the 64-bit two's complement integer format. The conversion is
3272| performed according to the IEC/IEEE Standard for Binary Floating-Point
3273| Arithmetic, except that the conversion is always rounded toward zero.
3274| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3275| the conversion overflows, the largest integer with the same sign as `a' is
3276| returned.
3277*----------------------------------------------------------------------------*/
3278
f42c2224 3279int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
158142c2
FB
3280{
3281 flag aSign;
0c48262d 3282 int aExp;
07d792d2 3283 int shiftCount;
bb98fe42 3284 uint64_t aSig;
f42c2224 3285 int64_t z;
ff32e16e 3286 a = float64_squash_input_denormal(a, status);
158142c2
FB
3287
3288 aSig = extractFloat64Frac( a );
3289 aExp = extractFloat64Exp( a );
3290 aSign = extractFloat64Sign( a );
3291 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3292 shiftCount = aExp - 0x433;
3293 if ( 0 <= shiftCount ) {
3294 if ( 0x43E <= aExp ) {
f090c9d4 3295 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
ff32e16e 3296 float_raise(float_flag_invalid, status);
158142c2
FB
3297 if ( ! aSign
3298 || ( ( aExp == 0x7FF )
3299 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3300 ) {
3301 return LIT64( 0x7FFFFFFFFFFFFFFF );
3302 }
3303 }
bb98fe42 3304 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3305 }
3306 z = aSig<<shiftCount;
3307 }
3308 else {
3309 if ( aExp < 0x3FE ) {
a2f2d288
PM
3310 if (aExp | aSig) {
3311 status->float_exception_flags |= float_flag_inexact;
3312 }
158142c2
FB
3313 return 0;
3314 }
3315 z = aSig>>( - shiftCount );
bb98fe42 3316 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 3317 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
3318 }
3319 }
3320 if ( aSign ) z = - z;
3321 return z;
3322
3323}
3324
3325/*----------------------------------------------------------------------------
3326| Returns the result of converting the double-precision floating-point value
3327| `a' to the single-precision floating-point format. The conversion is
3328| performed according to the IEC/IEEE Standard for Binary Floating-Point
3329| Arithmetic.
3330*----------------------------------------------------------------------------*/
3331
e5a41ffa 3332float32 float64_to_float32(float64 a, float_status *status)
158142c2
FB
3333{
3334 flag aSign;
0c48262d 3335 int aExp;
bb98fe42
AF
3336 uint64_t aSig;
3337 uint32_t zSig;
ff32e16e 3338 a = float64_squash_input_denormal(a, status);
158142c2
FB
3339
3340 aSig = extractFloat64Frac( a );
3341 aExp = extractFloat64Exp( a );
3342 aSign = extractFloat64Sign( a );
3343 if ( aExp == 0x7FF ) {
ff32e16e
PM
3344 if (aSig) {
3345 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3346 }
158142c2
FB
3347 return packFloat32( aSign, 0xFF, 0 );
3348 }
3349 shift64RightJamming( aSig, 22, &aSig );
3350 zSig = aSig;
3351 if ( aExp || zSig ) {
3352 zSig |= 0x40000000;
3353 aExp -= 0x381;
3354 }
ff32e16e 3355 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
3356
3357}
3358
60011498
PB
3359
3360/*----------------------------------------------------------------------------
3361| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3362| half-precision floating-point value, returning the result. After being
3363| shifted into the proper positions, the three fields are simply added
3364| together to form the result. This means that any integer portion of `zSig'
3365| will be added into the exponent. Since a properly normalized significand
3366| will have an integer portion equal to 1, the `zExp' input should be 1 less
3367| than the desired result exponent whenever `zSig' is a complete, normalized
3368| significand.
3369*----------------------------------------------------------------------------*/
0c48262d 3370static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
60011498 3371{
bb4d4bb3 3372 return make_float16(
bb98fe42 3373 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3374}
3375
c4a1c5e7
PM
3376/*----------------------------------------------------------------------------
3377| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3378| and significand `zSig', and returns the proper half-precision floating-
3379| point value corresponding to the abstract input. Ordinarily, the abstract
3380| value is simply rounded and packed into the half-precision format, with
3381| the inexact exception raised if the abstract input cannot be represented
3382| exactly. However, if the abstract value is too large, the overflow and
3383| inexact exceptions are raised and an infinity or maximal finite value is
3384| returned. If the abstract value is too small, the input value is rounded to
3385| a subnormal number, and the underflow and inexact exceptions are raised if
3386| the abstract input cannot be represented exactly as a subnormal half-
3387| precision floating-point number.
3388| The `ieee' flag indicates whether to use IEEE standard half precision, or
3389| ARM-style "alternative representation", which omits the NaN and Inf
3390| encodings in order to raise the maximum representable exponent by one.
3391| The input significand `zSig' has its binary point between bits 22
3392| and 23, which is 13 bits to the left of the usual location. This shifted
3393| significand must be normalized or smaller. If `zSig' is not normalized,
3394| `zExp' must be 0; in that case, the result returned is a subnormal number,
3395| and it must not require rounding. In the usual case that `zSig' is
3396| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3397| Note the slightly odd position of the binary point in zSig compared with the
3398| other roundAndPackFloat functions. This should probably be fixed if we
3399| need to implement more float16 routines than just conversion.
3400| The handling of underflow and overflow follows the IEC/IEEE Standard for
3401| Binary Floating-Point Arithmetic.
3402*----------------------------------------------------------------------------*/
3403
0c48262d 3404static float16 roundAndPackFloat16(flag zSign, int zExp,
e5a41ffa
PM
3405 uint32_t zSig, flag ieee,
3406 float_status *status)
c4a1c5e7
PM
3407{
3408 int maxexp = ieee ? 29 : 30;
3409 uint32_t mask;
3410 uint32_t increment;
c4a1c5e7
PM
3411 bool rounding_bumps_exp;
3412 bool is_tiny = false;
3413
3414 /* Calculate the mask of bits of the mantissa which are not
3415 * representable in half-precision and will be lost.
3416 */
3417 if (zExp < 1) {
3418 /* Will be denormal in halfprec */
3419 mask = 0x00ffffff;
3420 if (zExp >= -11) {
3421 mask >>= 11 + zExp;
3422 }
3423 } else {
3424 /* Normal number in halfprec */
3425 mask = 0x00001fff;
3426 }
3427
a2f2d288 3428 switch (status->float_rounding_mode) {
c4a1c5e7
PM
3429 case float_round_nearest_even:
3430 increment = (mask + 1) >> 1;
3431 if ((zSig & mask) == increment) {
3432 increment = zSig & (increment << 1);
3433 }
3434 break;
f9288a76
PM
3435 case float_round_ties_away:
3436 increment = (mask + 1) >> 1;
3437 break;
c4a1c5e7
PM
3438 case float_round_up:
3439 increment = zSign ? 0 : mask;
3440 break;
3441 case float_round_down:
3442 increment = zSign ? mask : 0;
3443 break;
3444 default: /* round_to_zero */
3445 increment = 0;
3446 break;
3447 }
3448
3449 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3450
3451 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3452 if (ieee) {
ff32e16e 3453 float_raise(float_flag_overflow | float_flag_inexact, status);
c4a1c5e7
PM
3454 return packFloat16(zSign, 0x1f, 0);
3455 } else {
ff32e16e 3456 float_raise(float_flag_invalid, status);
c4a1c5e7
PM
3457 return packFloat16(zSign, 0x1f, 0x3ff);
3458 }
3459 }
3460
3461 if (zExp < 0) {
3462 /* Note that flush-to-zero does not affect half-precision results */
3463 is_tiny =
a2f2d288 3464 (status->float_detect_tininess == float_tininess_before_rounding)
c4a1c5e7
PM
3465 || (zExp < -1)
3466 || (!rounding_bumps_exp);
3467 }
3468 if (zSig & mask) {
ff32e16e 3469 float_raise(float_flag_inexact, status);
c4a1c5e7 3470 if (is_tiny) {
ff32e16e 3471 float_raise(float_flag_underflow, status);
c4a1c5e7
PM
3472 }
3473 }
3474
3475 zSig += increment;
3476 if (rounding_bumps_exp) {
3477 zSig >>= 1;
3478 zExp++;
3479 }
3480
3481 if (zExp < -10) {
3482 return packFloat16(zSign, 0, 0);
3483 }
3484 if (zExp < 0) {
3485 zSig >>= -zExp;
3486 zExp = 0;
3487 }
3488 return packFloat16(zSign, zExp, zSig >> 13);
3489}
3490
/*
 * Normalizes the half-precision subnormal significand `aSig'.  The
 * significand is shifted left until its leading 1 sits at bit 10 and is
 * stored through `zSigPtr'; the matching exponent (1 minus the shift
 * distance) is stored through `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t lshift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3498
60011498
PB
3499/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3500 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3 3501
e5a41ffa 3502float32 float16_to_float32(float16 a, flag ieee, float_status *status)
60011498
PB
3503{
3504 flag aSign;
0c48262d 3505 int aExp;
bb98fe42 3506 uint32_t aSig;
60011498 3507
bb4d4bb3
PM
3508 aSign = extractFloat16Sign(a);
3509 aExp = extractFloat16Exp(a);
3510 aSig = extractFloat16Frac(a);
60011498
PB
3511
3512 if (aExp == 0x1f && ieee) {
3513 if (aSig) {
ff32e16e 3514 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
60011498 3515 }
4be8eeac 3516 return packFloat32(aSign, 0xff, 0);
60011498
PB
3517 }
3518 if (aExp == 0) {
60011498
PB
3519 if (aSig == 0) {
3520 return packFloat32(aSign, 0, 0);
3521 }
3522
c4a1c5e7
PM
3523 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3524 aExp--;
60011498
PB
3525 }
3526 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3527}
3528
e5a41ffa 3529float16 float32_to_float16(float32 a, flag ieee, float_status *status)
60011498
PB
3530{
3531 flag aSign;
0c48262d 3532 int aExp;
bb98fe42 3533 uint32_t aSig;
38970efa 3534
ff32e16e 3535 a = float32_squash_input_denormal(a, status);
60011498
PB
3536
3537 aSig = extractFloat32Frac( a );
3538 aExp = extractFloat32Exp( a );
3539 aSign = extractFloat32Sign( a );
3540 if ( aExp == 0xFF ) {
3541 if (aSig) {
600e30d2 3542 /* Input is a NaN */
600e30d2 3543 if (!ieee) {
ff32e16e 3544 float_raise(float_flag_invalid, status);
600e30d2
PM
3545 return packFloat16(aSign, 0, 0);
3546 }
38970efa 3547 return commonNaNToFloat16(
ff32e16e 3548 float32ToCommonNaN(a, status), status);
60011498 3549 }
600e30d2
PM
3550 /* Infinity */
3551 if (!ieee) {
ff32e16e 3552 float_raise(float_flag_invalid, status);
600e30d2
PM
3553 return packFloat16(aSign, 0x1f, 0x3ff);
3554 }
3555 return packFloat16(aSign, 0x1f, 0);
60011498 3556 }
600e30d2 3557 if (aExp == 0 && aSig == 0) {
60011498
PB
3558 return packFloat16(aSign, 0, 0);
3559 }
38970efa
PM
3560 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3561 * even if the input is denormal; however this is harmless because
3562 * the largest possible single-precision denormal is still smaller
3563 * than the smallest representable half-precision denormal, and so we
3564 * will end up ignoring aSig and returning via the "always return zero"
3565 * codepath.
3566 */
60011498 3567 aSig |= 0x00800000;
c4a1c5e7 3568 aExp -= 0x71;
60011498 3569
ff32e16e 3570 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
60011498
PB
3571}
3572
e5a41ffa 3573float64 float16_to_float64(float16 a, flag ieee, float_status *status)
14c9a07e
PM
3574{
3575 flag aSign;
0c48262d 3576 int aExp;
14c9a07e
PM
3577 uint32_t aSig;
3578
3579 aSign = extractFloat16Sign(a);
3580 aExp = extractFloat16Exp(a);
3581 aSig = extractFloat16Frac(a);
3582
3583 if (aExp == 0x1f && ieee) {
3584 if (aSig) {
3585 return commonNaNToFloat64(
ff32e16e 3586 float16ToCommonNaN(a, status), status);
14c9a07e
PM
3587 }
3588 return packFloat64(aSign, 0x7ff, 0);
3589 }
3590 if (aExp == 0) {
3591 if (aSig == 0) {
3592 return packFloat64(aSign, 0, 0);
3593 }
3594
3595 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3596 aExp--;
3597 }
3598 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3599}
3600
e5a41ffa 3601float16 float64_to_float16(float64 a, flag ieee, float_status *status)
14c9a07e
PM
3602{
3603 flag aSign;
0c48262d 3604 int aExp;
14c9a07e
PM
3605 uint64_t aSig;
3606 uint32_t zSig;
3607
ff32e16e 3608 a = float64_squash_input_denormal(a, status);
14c9a07e
PM
3609
3610 aSig = extractFloat64Frac(a);
3611 aExp = extractFloat64Exp(a);
3612 aSign = extractFloat64Sign(a);
3613 if (aExp == 0x7FF) {
3614 if (aSig) {
3615 /* Input is a NaN */
3616 if (!ieee) {
ff32e16e 3617 float_raise(float_flag_invalid, status);
14c9a07e
PM
3618 return packFloat16(aSign, 0, 0);
3619 }
3620 return commonNaNToFloat16(
ff32e16e 3621 float64ToCommonNaN(a, status), status);
14c9a07e
PM
3622 }
3623 /* Infinity */
3624 if (!ieee) {
ff32e16e 3625 float_raise(float_flag_invalid, status);
14c9a07e
PM
3626 return packFloat16(aSign, 0x1f, 0x3ff);
3627 }
3628 return packFloat16(aSign, 0x1f, 0);
3629 }
3630 shift64RightJamming(aSig, 29, &aSig);
3631 zSig = aSig;
3632 if (aExp == 0 && zSig == 0) {
3633 return packFloat16(aSign, 0, 0);
3634 }
3635 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3636 * even if the input is denormal; however this is harmless because
3637 * the largest possible single-precision denormal is still smaller
3638 * than the smallest representable half-precision denormal, and so we
3639 * will end up ignoring aSig and returning via the "always return zero"
3640 * codepath.
3641 */
3642 zSig |= 0x00800000;
3643 aExp -= 0x3F1;
3644
ff32e16e 3645 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
14c9a07e
PM
3646}
3647
158142c2
FB
3648/*----------------------------------------------------------------------------
3649| Returns the result of converting the double-precision floating-point value
3650| `a' to the extended double-precision floating-point format. The conversion
3651| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3652| Arithmetic.
3653*----------------------------------------------------------------------------*/
3654
e5a41ffa 3655floatx80 float64_to_floatx80(float64 a, float_status *status)
158142c2
FB
3656{
3657 flag aSign;
0c48262d 3658 int aExp;
bb98fe42 3659 uint64_t aSig;
158142c2 3660
ff32e16e 3661 a = float64_squash_input_denormal(a, status);
158142c2
FB
3662 aSig = extractFloat64Frac( a );
3663 aExp = extractFloat64Exp( a );
3664 aSign = extractFloat64Sign( a );
3665 if ( aExp == 0x7FF ) {
ff32e16e
PM
3666 if (aSig) {
3667 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3668 }
158142c2
FB
3669 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3670 }
3671 if ( aExp == 0 ) {
3672 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3673 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3674 }
3675 return
3676 packFloatx80(
3677 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3678
3679}
3680
158142c2
FB
3681/*----------------------------------------------------------------------------
3682| Returns the result of converting the double-precision floating-point value
3683| `a' to the quadruple-precision floating-point format. The conversion is
3684| performed according to the IEC/IEEE Standard for Binary Floating-Point
3685| Arithmetic.
3686*----------------------------------------------------------------------------*/
3687
e5a41ffa 3688float128 float64_to_float128(float64 a, float_status *status)
158142c2
FB
3689{
3690 flag aSign;
0c48262d 3691 int aExp;
bb98fe42 3692 uint64_t aSig, zSig0, zSig1;
158142c2 3693
ff32e16e 3694 a = float64_squash_input_denormal(a, status);
158142c2
FB
3695 aSig = extractFloat64Frac( a );
3696 aExp = extractFloat64Exp( a );
3697 aSign = extractFloat64Sign( a );
3698 if ( aExp == 0x7FF ) {
ff32e16e
PM
3699 if (aSig) {
3700 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3701 }
158142c2
FB
3702 return packFloat128( aSign, 0x7FFF, 0, 0 );
3703 }
3704 if ( aExp == 0 ) {
3705 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3706 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3707 --aExp;
3708 }
3709 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3710 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3711
3712}
3713
158142c2
FB
3714/*----------------------------------------------------------------------------
3715| Rounds the double-precision floating-point value `a' to an integer, and
3716| returns the result as a double-precision floating-point value. The
3717| operation is performed according to the IEC/IEEE Standard for Binary
3718| Floating-Point Arithmetic.
3719*----------------------------------------------------------------------------*/
3720
e5a41ffa 3721float64 float64_round_to_int(float64 a, float_status *status)
158142c2
FB
3722{
3723 flag aSign;
0c48262d 3724 int aExp;
bb98fe42 3725 uint64_t lastBitMask, roundBitsMask;
bb98fe42 3726 uint64_t z;
ff32e16e 3727 a = float64_squash_input_denormal(a, status);
158142c2
FB
3728
3729 aExp = extractFloat64Exp( a );
3730 if ( 0x433 <= aExp ) {
3731 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
ff32e16e 3732 return propagateFloat64NaN(a, a, status);
158142c2
FB
3733 }
3734 return a;
3735 }
3736 if ( aExp < 0x3FF ) {
bb98fe42 3737 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
a2f2d288 3738 status->float_exception_flags |= float_flag_inexact;
158142c2 3739 aSign = extractFloat64Sign( a );
a2f2d288 3740 switch (status->float_rounding_mode) {
158142c2
FB
3741 case float_round_nearest_even:
3742 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3743 return packFloat64( aSign, 0x3FF, 0 );
3744 }
3745 break;
f9288a76
PM
3746 case float_round_ties_away:
3747 if (aExp == 0x3FE) {
3748 return packFloat64(aSign, 0x3ff, 0);
3749 }
3750 break;
158142c2 3751 case float_round_down:
f090c9d4 3752 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3753 case float_round_up:
f090c9d4
PB
3754 return make_float64(
3755 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3756 }
3757 return packFloat64( aSign, 0, 0 );
3758 }
3759 lastBitMask = 1;
3760 lastBitMask <<= 0x433 - aExp;
3761 roundBitsMask = lastBitMask - 1;
f090c9d4 3762 z = float64_val(a);
a2f2d288 3763 switch (status->float_rounding_mode) {
dc355b76
PM
3764 case float_round_nearest_even:
3765 z += lastBitMask >> 1;
3766 if ((z & roundBitsMask) == 0) {
3767 z &= ~lastBitMask;
3768 }
3769 break;
f9288a76
PM
3770 case float_round_ties_away:
3771 z += lastBitMask >> 1;
3772 break;
dc355b76
PM
3773 case float_round_to_zero:
3774 break;
3775 case float_round_up:
3776 if (!extractFloat64Sign(make_float64(z))) {
3777 z += roundBitsMask;
3778 }
3779 break;
3780 case float_round_down:
3781 if (extractFloat64Sign(make_float64(z))) {
158142c2
FB
3782 z += roundBitsMask;
3783 }
dc355b76
PM
3784 break;
3785 default:
3786 abort();
158142c2
FB
3787 }
3788 z &= ~ roundBitsMask;
a2f2d288
PM
3789 if (z != float64_val(a)) {
3790 status->float_exception_flags |= float_flag_inexact;
3791 }
f090c9d4 3792 return make_float64(z);
158142c2
FB
3793
3794}
3795
e5a41ffa 3796float64 float64_trunc_to_int(float64 a, float_status *status)
e6e5906b
PB
3797{
3798 int oldmode;
3799 float64 res;
a2f2d288
PM
3800 oldmode = status->float_rounding_mode;
3801 status->float_rounding_mode = float_round_to_zero;
ff32e16e 3802 res = float64_round_to_int(a, status);
a2f2d288 3803 status->float_rounding_mode = oldmode;
e6e5906b
PB
3804 return res;
3805}
3806
158142c2
FB
3807/*----------------------------------------------------------------------------
3808| Returns the result of adding the absolute values of the double-precision
3809| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3810| before being returned. `zSign' is ignored if the result is a NaN.
3811| The addition is performed according to the IEC/IEEE Standard for Binary
3812| Floating-Point Arithmetic.
3813*----------------------------------------------------------------------------*/
3814
e5a41ffa
PM
3815static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3816 float_status *status)
158142c2 3817{
0c48262d 3818 int aExp, bExp, zExp;
bb98fe42 3819 uint64_t aSig, bSig, zSig;
0c48262d 3820 int expDiff;
158142c2
FB
3821
3822 aSig = extractFloat64Frac( a );
3823 aExp = extractFloat64Exp( a );
3824 bSig = extractFloat64Frac( b );
3825 bExp = extractFloat64Exp( b );
3826 expDiff = aExp - bExp;
3827 aSig <<= 9;
3828 bSig <<= 9;
3829 if ( 0 < expDiff ) {
3830 if ( aExp == 0x7FF ) {
ff32e16e
PM
3831 if (aSig) {
3832 return propagateFloat64NaN(a, b, status);
3833 }
158142c2
FB
3834 return a;
3835 }
3836 if ( bExp == 0 ) {
3837 --expDiff;
3838 }
3839 else {
3840 bSig |= LIT64( 0x2000000000000000 );
3841 }
3842 shift64RightJamming( bSig, expDiff, &bSig );
3843 zExp = aExp;
3844 }
3845 else if ( expDiff < 0 ) {
3846 if ( bExp == 0x7FF ) {
ff32e16e
PM
3847 if (bSig) {
3848 return propagateFloat64NaN(a, b, status);
3849 }
158142c2
FB
3850 return packFloat64( zSign, 0x7FF, 0 );
3851 }
3852 if ( aExp == 0 ) {
3853 ++expDiff;
3854 }
3855 else {
3856 aSig |= LIT64( 0x2000000000000000 );
3857 }
3858 shift64RightJamming( aSig, - expDiff, &aSig );
3859 zExp = bExp;
3860 }
3861 else {
3862 if ( aExp == 0x7FF ) {
ff32e16e
PM
3863 if (aSig | bSig) {
3864 return propagateFloat64NaN(a, b, status);
3865 }
158142c2
FB
3866 return a;
3867 }
fe76d976 3868 if ( aExp == 0 ) {
a2f2d288 3869 if (status->flush_to_zero) {
e6afc87f 3870 if (aSig | bSig) {
ff32e16e 3871 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
3872 }
3873 return packFloat64(zSign, 0, 0);
3874 }
fe76d976
PB
3875 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3876 }
158142c2
FB
3877 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3878 zExp = aExp;
3879 goto roundAndPack;
3880 }
3881 aSig |= LIT64( 0x2000000000000000 );
3882 zSig = ( aSig + bSig )<<1;
3883 --zExp;
bb98fe42 3884 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3885 zSig = aSig + bSig;
3886 ++zExp;
3887 }
3888 roundAndPack:
ff32e16e 3889 return roundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
3890
3891}
3892
3893/*----------------------------------------------------------------------------
3894| Returns the result of subtracting the absolute values of the double-
3895| precision floating-point values `a' and `b'. If `zSign' is 1, the
3896| difference is negated before being returned. `zSign' is ignored if the
3897| result is a NaN. The subtraction is performed according to the IEC/IEEE
3898| Standard for Binary Floating-Point Arithmetic.
3899*----------------------------------------------------------------------------*/
3900
e5a41ffa
PM
3901static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3902 float_status *status)
158142c2 3903{
0c48262d 3904 int aExp, bExp, zExp;
bb98fe42 3905 uint64_t aSig, bSig, zSig;
0c48262d 3906 int expDiff;
158142c2
FB
3907
3908 aSig = extractFloat64Frac( a );
3909 aExp = extractFloat64Exp( a );
3910 bSig = extractFloat64Frac( b );
3911 bExp = extractFloat64Exp( b );
3912 expDiff = aExp - bExp;
3913 aSig <<= 10;
3914 bSig <<= 10;
3915 if ( 0 < expDiff ) goto aExpBigger;
3916 if ( expDiff < 0 ) goto bExpBigger;
3917 if ( aExp == 0x7FF ) {
ff32e16e
PM
3918 if (aSig | bSig) {
3919 return propagateFloat64NaN(a, b, status);
3920 }
3921 float_raise(float_flag_invalid, status);
af39bc8c 3922 return float64_default_nan(status);
158142c2
FB
3923 }
3924 if ( aExp == 0 ) {
3925 aExp = 1;
3926 bExp = 1;
3927 }
3928 if ( bSig < aSig ) goto aBigger;
3929 if ( aSig < bSig ) goto bBigger;
a2f2d288 3930 return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
3931 bExpBigger:
3932 if ( bExp == 0x7FF ) {
ff32e16e
PM
3933 if (bSig) {
3934 return propagateFloat64NaN(a, b, status);
3935 }
158142c2
FB
3936 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3937 }
3938 if ( aExp == 0 ) {
3939 ++expDiff;
3940 }
3941 else {
3942 aSig |= LIT64( 0x4000000000000000 );
3943 }
3944 shift64RightJamming( aSig, - expDiff, &aSig );
3945 bSig |= LIT64( 0x4000000000000000 );
3946 bBigger:
3947 zSig = bSig - aSig;
3948 zExp = bExp;
3949 zSign ^= 1;
3950 goto normalizeRoundAndPack;
3951 aExpBigger:
3952 if ( aExp == 0x7FF ) {
ff32e16e
PM
3953 if (aSig) {
3954 return propagateFloat64NaN(a, b, status);
3955 }
158142c2
FB
3956 return a;
3957 }
3958 if ( bExp == 0 ) {
3959 --expDiff;
3960 }
3961 else {
3962 bSig |= LIT64( 0x4000000000000000 );
3963 }
3964 shift64RightJamming( bSig, expDiff, &bSig );
3965 aSig |= LIT64( 0x4000000000000000 );
3966 aBigger:
3967 zSig = aSig - bSig;
3968 zExp = aExp;
3969 normalizeRoundAndPack:
3970 --zExp;
ff32e16e 3971 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
3972
3973}
3974
3975/*----------------------------------------------------------------------------
3976| Returns the result of adding the double-precision floating-point values `a'
3977| and `b'. The operation is performed according to the IEC/IEEE Standard for
3978| Binary Floating-Point Arithmetic.
3979*----------------------------------------------------------------------------*/
3980
e5a41ffa 3981float64 float64_add(float64 a, float64 b, float_status *status)
158142c2
FB
3982{
3983 flag aSign, bSign;
ff32e16e
PM
3984 a = float64_squash_input_denormal(a, status);
3985 b = float64_squash_input_denormal(b, status);
158142c2
FB
3986
3987 aSign = extractFloat64Sign( a );
3988 bSign = extractFloat64Sign( b );
3989 if ( aSign == bSign ) {
ff32e16e 3990 return addFloat64Sigs(a, b, aSign, status);
158142c2
FB
3991 }
3992 else {
ff32e16e 3993 return subFloat64Sigs(a, b, aSign, status);
158142c2
FB
3994 }
3995
3996}
3997
3998/*----------------------------------------------------------------------------
3999| Returns the result of subtracting the double-precision floating-point values
4000| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4001| for Binary Floating-Point Arithmetic.
4002*----------------------------------------------------------------------------*/
4003
e5a41ffa 4004float64 float64_sub(float64 a, float64 b, float_status *status)
158142c2
FB
4005{
4006 flag aSign, bSign;
ff32e16e
PM
4007 a = float64_squash_input_denormal(a, status);
4008 b = float64_squash_input_denormal(b, status);
158142c2
FB
4009
4010 aSign = extractFloat64Sign( a );
4011 bSign = extractFloat64Sign( b );
4012 if ( aSign == bSign ) {
ff32e16e 4013 return subFloat64Sigs(a, b, aSign, status);
158142c2
FB
4014 }
4015 else {
ff32e16e 4016 return addFloat64Sigs(a, b, aSign, status);
158142c2
FB
4017 }
4018
4019}
4020
4021/*----------------------------------------------------------------------------
4022| Returns the result of multiplying the double-precision floating-point values
4023| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4024| for Binary Floating-Point Arithmetic.
4025*----------------------------------------------------------------------------*/
4026
e5a41ffa 4027float64 float64_mul(float64 a, float64 b, float_status *status)
158142c2
FB
4028{
4029 flag aSign, bSign, zSign;
0c48262d 4030 int aExp, bExp, zExp;
bb98fe42 4031 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 4032
ff32e16e
PM
4033 a = float64_squash_input_denormal(a, status);
4034 b = float64_squash_input_denormal(b, status);
37d18660 4035
158142c2
FB
4036 aSig = extractFloat64Frac( a );
4037 aExp = extractFloat64Exp( a );
4038 aSign = extractFloat64Sign( a );
4039 bSig = extractFloat64Frac( b );
4040 bExp = extractFloat64Exp( b );
4041 bSign = extractFloat64Sign( b );
4042 zSign = aSign ^ bSign;
4043 if ( aExp == 0x7FF ) {
4044 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 4045 return propagateFloat64NaN(a, b, status);
158142c2
FB
4046 }
4047 if ( ( bExp | bSig ) == 0 ) {
ff32e16e 4048 float_raise(float_flag_invalid, status);
af39bc8c 4049 return float64_default_nan(status);
158142c2
FB
4050 }
4051 return packFloat64( zSign, 0x7FF, 0 );
4052 }
4053 if ( bExp == 0x7FF ) {
ff32e16e
PM
4054 if (bSig) {
4055 return propagateFloat64NaN(a, b, status);
4056 }
158142c2 4057 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 4058 float_raise(float_flag_invalid, status);
af39bc8c 4059 return float64_default_nan(status);
158142c2
FB
4060 }
4061 return packFloat64( zSign, 0x7FF, 0 );
4062 }
4063 if ( aExp == 0 ) {
4064 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4065 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4066 }
4067 if ( bExp == 0 ) {
4068 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4069 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4070 }
4071 zExp = aExp + bExp - 0x3FF;
4072 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4073 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4074 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4075 zSig0 |= ( zSig1 != 0 );
bb98fe42 4076 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
4077 zSig0 <<= 1;
4078 --zExp;
4079 }
ff32e16e 4080 return roundAndPackFloat64(zSign, zExp, zSig0, status);
158142c2
FB
4081
4082}
4083
4084/*----------------------------------------------------------------------------
4085| Returns the result of dividing the double-precision floating-point value `a'
4086| by the corresponding value `b'. The operation is performed according to
4087| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4088*----------------------------------------------------------------------------*/
4089
e5a41ffa 4090float64 float64_div(float64 a, float64 b, float_status *status)
158142c2
FB
4091{
4092 flag aSign, bSign, zSign;
0c48262d 4093 int aExp, bExp, zExp;
bb98fe42
AF
4094 uint64_t aSig, bSig, zSig;
4095 uint64_t rem0, rem1;
4096 uint64_t term0, term1;
ff32e16e
PM
4097 a = float64_squash_input_denormal(a, status);
4098 b = float64_squash_input_denormal(b, status);
158142c2
FB
4099
4100 aSig = extractFloat64Frac( a );
4101 aExp = extractFloat64Exp( a );
4102 aSign = extractFloat64Sign( a );
4103 bSig = extractFloat64Frac( b );
4104 bExp = extractFloat64Exp( b );
4105 bSign = extractFloat64Sign( b );
4106 zSign = aSign ^ bSign;
4107 if ( aExp == 0x7FF ) {
ff32e16e
PM
4108 if (aSig) {
4109 return propagateFloat64NaN(a, b, status);
4110 }
158142c2 4111 if ( bExp == 0x7FF ) {
ff32e16e
PM
4112 if (bSig) {
4113 return propagateFloat64NaN(a, b, status);
4114 }
4115 float_raise(float_flag_invalid, status);
af39bc8c 4116 return float64_default_nan(status);
158142c2
FB
4117 }
4118 return packFloat64( zSign, 0x7FF, 0 );
4119 }
4120 if ( bExp == 0x7FF ) {
ff32e16e
PM
4121 if (bSig) {
4122 return propagateFloat64NaN(a, b, status);
4123 }
158142c2
FB
4124 return packFloat64( zSign, 0, 0 );
4125 }
4126 if ( bExp == 0 ) {
4127 if ( bSig == 0 ) {
4128 if ( ( aExp | aSig ) == 0 ) {
ff32e16e 4129 float_raise(float_flag_invalid, status);
af39bc8c 4130 return float64_default_nan(status);
158142c2 4131 }
ff32e16e 4132 float_raise(float_flag_divbyzero, status);
158142c2
FB
4133 return packFloat64( zSign, 0x7FF, 0 );
4134 }
4135 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4136 }
4137 if ( aExp == 0 ) {
4138 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4139 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4140 }
4141 zExp = aExp - bExp + 0x3FD;
4142 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4143 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4144 if ( bSig <= ( aSig + aSig ) ) {
4145 aSig >>= 1;
4146 ++zExp;
4147 }
4148 zSig = estimateDiv128To64( aSig, 0, bSig );
4149 if ( ( zSig & 0x1FF ) <= 2 ) {
4150 mul64To128( bSig, zSig, &term0, &term1 );
4151 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4152 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4153 --zSig;
4154 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4155 }
4156 zSig |= ( rem1 != 0 );
4157 }
ff32e16e 4158 return roundAndPackFloat64(zSign, zExp, zSig, status);
158142c2
FB
4159
4160}
4161
4162/*----------------------------------------------------------------------------
4163| Returns the remainder of the double-precision floating-point value `a'
4164| with respect to the corresponding value `b'. The operation is performed
4165| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4166*----------------------------------------------------------------------------*/
4167
e5a41ffa 4168float64 float64_rem(float64 a, float64 b, float_status *status)
158142c2 4169{
ed086f3d 4170 flag aSign, zSign;
0c48262d 4171 int aExp, bExp, expDiff;
bb98fe42
AF
4172 uint64_t aSig, bSig;
4173 uint64_t q, alternateASig;
4174 int64_t sigMean;
158142c2 4175
ff32e16e
PM
4176 a = float64_squash_input_denormal(a, status);
4177 b = float64_squash_input_denormal(b, status);
158142c2
FB
4178 aSig = extractFloat64Frac( a );
4179 aExp = extractFloat64Exp( a );
4180 aSign = extractFloat64Sign( a );
4181 bSig = extractFloat64Frac( b );
4182 bExp = extractFloat64Exp( b );
158142c2
FB
4183 if ( aExp == 0x7FF ) {
4184 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 4185 return propagateFloat64NaN(a, b, status);
158142c2 4186 }
ff32e16e 4187 float_raise(float_flag_invalid, status);
af39bc8c 4188 return float64_default_nan(status);
158142c2
FB
4189 }
4190 if ( bExp == 0x7FF ) {
ff32e16e
PM
4191 if (bSig) {
4192 return propagateFloat64NaN(a, b, status);
4193 }
158142c2
FB
4194 return a;
4195 }
4196 if ( bExp == 0 ) {
4197 if ( bSig == 0 ) {
ff32e16e 4198 float_raise(float_flag_invalid, status);
af39bc8c 4199 return float64_default_nan(status);
158142c2
FB
4200 }
4201 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4202 }
4203 if ( aExp == 0 ) {
4204 if ( aSig == 0 ) return a;
4205 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4206 }
4207 expDiff = aExp - bExp;
4208 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4209 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4210 if ( expDiff < 0 ) {
4211 if ( expDiff < -1 ) return a;
4212 aSig >>= 1;
4213 }
4214 q = ( bSig <= aSig );
4215 if ( q ) aSig -= bSig;
4216 expDiff -= 64;
4217 while ( 0 < expDiff ) {
4218 q = estimateDiv128To64( aSig, 0, bSig );
4219 q = ( 2 < q ) ? q - 2 : 0;
4220 aSig = - ( ( bSig>>2 ) * q );
4221 expDiff -= 62;
4222 }
4223 expDiff += 64;
4224 if ( 0 < expDiff ) {
4225 q = estimateDiv128To64( aSig, 0, bSig );
4226 q = ( 2 < q ) ? q - 2 : 0;
4227 q >>= 64 - expDiff;
4228 bSig >>= 2;
4229 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4230 }
4231 else {
4232 aSig >>= 2;
4233 bSig >>= 2;
4234 }
4235 do {
4236 alternateASig = aSig;
4237 ++q;
4238 aSig -= bSig;
bb98fe42 4239 } while ( 0 <= (int64_t) aSig );
158142c2
FB
4240 sigMean = aSig + alternateASig;
4241 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4242 aSig = alternateASig;
4243 }
bb98fe42 4244 zSign = ( (int64_t) aSig < 0 );
158142c2 4245 if ( zSign ) aSig = - aSig;
ff32e16e 4246 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
4247
4248}
4249
369be8f6
PM
4250/*----------------------------------------------------------------------------
4251| Returns the result of multiplying the double-precision floating-point values
4252| `a' and `b' then adding 'c', with no intermediate rounding step after the
4253| multiplication. The operation is performed according to the IEC/IEEE
4254| Standard for Binary Floating-Point Arithmetic 754-2008.
4255| The flags argument allows the caller to select negation of the
4256| addend, the intermediate product, or the final result. (The difference
4257| between this and having the caller do a separate negation is that negating
4258| externally will flip the sign bit on NaNs.)
4259*----------------------------------------------------------------------------*/
4260
e5a41ffa
PM
4261float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4262 float_status *status)
369be8f6
PM
4263{
4264 flag aSign, bSign, cSign, zSign;
0c48262d 4265 int aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
4266 uint64_t aSig, bSig, cSig;
4267 flag pInf, pZero, pSign;
4268 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4269 int shiftcount;
4270 flag signflip, infzero;
4271
ff32e16e
PM
4272 a = float64_squash_input_denormal(a, status);
4273 b = float64_squash_input_denormal(b, status);
4274 c = float64_squash_input_denormal(c, status);
369be8f6
PM
4275 aSig = extractFloat64Frac(a);
4276 aExp = extractFloat64Exp(a);
4277 aSign = extractFloat64Sign(a);
4278 bSig = extractFloat64Frac(b);
4279 bExp = extractFloat64Exp(b);
4280 bSign = extractFloat64Sign(b);
4281 cSig = extractFloat64Frac(c);
4282 cExp = extractFloat64Exp(c);
4283 cSign = extractFloat64Sign(c);
4284
4285 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4286 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4287
4288 /* It is implementation-defined whether the cases of (0,inf,qnan)
4289 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4290 * they return if they do), so we have to hand this information
4291 * off to the target-specific pick-a-NaN routine.
4292 */
4293 if (((aExp == 0x7ff) && aSig) ||
4294 ((bExp == 0x7ff) && bSig) ||
4295 ((cExp == 0x7ff) && cSig)) {
ff32e16e 4296 return propagateFloat64MulAddNaN(a, b, c, infzero, status);
369be8f6
PM
4297 }
4298
4299 if (infzero) {
ff32e16e 4300 float_raise(float_flag_invalid, status);
af39bc8c 4301 return float64_default_nan(status);
369be8f6
PM
4302 }
4303
4304 if (flags & float_muladd_negate_c) {
4305 cSign ^= 1;
4306 }
4307
4308 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4309
4310 /* Work out the sign and type of the product */
4311 pSign = aSign ^ bSign;
4312 if (flags & float_muladd_negate_product) {
4313 pSign ^= 1;
4314 }
4315 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4316 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4317
4318 if (cExp == 0x7ff) {
4319 if (pInf && (pSign ^ cSign)) {
4320 /* addition of opposite-signed infinities => InvalidOperation */
ff32e16e 4321 float_raise(float_flag_invalid, status);
af39bc8c 4322 return float64_default_nan(status);
369be8f6
PM
4323 }
4324 /* Otherwise generate an infinity of the same sign */
4325 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4326 }
4327
4328 if (pInf) {
4329 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4330 }
4331
4332 if (pZero) {
4333 if (cExp == 0) {
4334 if (cSig == 0) {
4335 /* Adding two exact zeroes */
4336 if (pSign == cSign) {
4337 zSign = pSign;
a2f2d288 4338 } else if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
4339 zSign = 1;
4340 } else {
4341 zSign = 0;
4342 }
4343 return packFloat64(zSign ^ signflip, 0, 0);
4344 }
4345 /* Exact zero plus a denorm */
a2f2d288 4346 if (status->flush_to_zero) {
ff32e16e 4347 float_raise(float_flag_output_denormal, status);
369be8f6
PM
4348 return packFloat64(cSign ^ signflip, 0, 0);
4349 }
4350 }
4351 /* Zero plus something non-zero : just return the something */
67d43538
PM
4352 if (flags & float_muladd_halve_result) {
4353 if (cExp == 0) {
4354 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4355 }
4356 /* Subtract one to halve, and one again because roundAndPackFloat64
4357 * wants one less than the true exponent.
4358 */
4359 cExp -= 2;
4360 cSig = (cSig | 0x0010000000000000ULL) << 10;
ff32e16e 4361 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
67d43538 4362 }
a6e7c184 4363 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
4364 }
4365
4366 if (aExp == 0) {
4367 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4368 }
4369 if (bExp == 0) {
4370 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4371 }
4372
4373 /* Calculate the actual result a * b + c */
4374
4375 /* Multiply first; this is easy. */
4376 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4377 * because we want the true exponent, not the "one-less-than"
4378 * flavour that roundAndPackFloat64() takes.
4379 */
4380 pExp = aExp + bExp - 0x3fe;
4381 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4382 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4383 mul64To128(aSig, bSig, &pSig0, &pSig1);
4384 if ((int64_t)(pSig0 << 1) >= 0) {
4385 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4386 pExp--;
4387 }
4388
4389 zSign = pSign ^ signflip;
4390
4391 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4392 * bit in position 126.
4393 */
4394 if (cExp == 0) {
4395 if (!cSig) {
4396 /* Throw out the special case of c being an exact zero now */
4397 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
67d43538
PM
4398 if (flags & float_muladd_halve_result) {
4399 pExp--;
4400 }
369be8f6 4401 return roundAndPackFloat64(zSign, pExp - 1,
ff32e16e 4402 pSig1, status);
369be8f6
PM
4403 }
4404 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4405 }
4406
4407 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4408 * significand of the addend, with the explicit bit in position 126.
4409 */
4410 cSig0 = cSig << (126 - 64 - 52);
4411 cSig1 = 0;
4412 cSig0 |= LIT64(0x4000000000000000);
4413 expDiff = pExp - cExp;
4414
4415 if (pSign == cSign) {
4416 /* Addition */
4417 if (expDiff > 0) {
4418 /* scale c to match p */
4419 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4420 zExp = pExp;
4421 } else if (expDiff < 0) {
4422 /* scale p to match c */
4423 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4424 zExp = cExp;
4425 } else {
4426 /* no scaling needed */
4427 zExp = cExp;
4428 }
4429 /* Add significands and make sure explicit bit ends up in posn 126 */
4430 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4431 if ((int64_t)zSig0 < 0) {
4432 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4433 } else {
4434 zExp--;
4435 }
4436 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
67d43538
PM
4437 if (flags & float_muladd_halve_result) {
4438 zExp--;
4439 }
ff32e16e 4440 return roundAndPackFloat64(zSign, zExp, zSig1, status);
369be8f6
PM
4441 } else {
4442 /* Subtraction */
4443 if (expDiff > 0) {
4444 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4445 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4446 zExp = pExp;
4447 } else if (expDiff < 0) {
4448 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4449 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4450 zExp = cExp;
4451 zSign ^= 1;
4452 } else {
4453 zExp = pExp;
4454 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4455 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4456 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4457 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4458 zSign ^= 1;
4459 } else {
4460 /* Exact zero */
4461 zSign = signflip;
a2f2d288 4462 if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
4463 zSign ^= 1;
4464 }
4465 return packFloat64(zSign, 0, 0);
4466 }
4467 }
4468 --zExp;
4469 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4470 * starting with the significand in a pair of uint64_t.
4471 */
4472 if (zSig0) {
4473 shiftcount = countLeadingZeros64(zSig0) - 1;
4474 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4475 if (zSig1) {
4476 zSig0 |= 1;
4477 }
4478 zExp -= shiftcount;
4479 } else {
e3d142d0
PM
4480 shiftcount = countLeadingZeros64(zSig1);
4481 if (shiftcount == 0) {
4482 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4483 zExp -= 63;
4484 } else {
4485 shiftcount--;
4486 zSig0 = zSig1 << shiftcount;
4487 zExp -= (shiftcount + 64);
4488 }
369be8f6 4489 }
67d43538
PM
4490 if (flags & float_muladd_halve_result) {
4491 zExp--;
4492 }
ff32e16e 4493 return roundAndPackFloat64(zSign, zExp, zSig0, status);
369be8f6
PM
4494 }
4495}
4496
158142c2
FB
4497/*----------------------------------------------------------------------------
4498| Returns the square root of the double-precision floating-point value `a'.
4499| The operation is performed according to the IEC/IEEE Standard for Binary
4500| Floating-Point Arithmetic.
4501*----------------------------------------------------------------------------*/
4502
e5a41ffa 4503float64 float64_sqrt(float64 a, float_status *status)
158142c2
FB
4504{
4505 flag aSign;
0c48262d 4506 int aExp, zExp;
bb98fe42
AF
4507 uint64_t aSig, zSig, doubleZSig;
4508 uint64_t rem0, rem1, term0, term1;
ff32e16e 4509 a = float64_squash_input_denormal(a, status);
158142c2
FB
4510
4511 aSig = extractFloat64Frac( a );
4512 aExp = extractFloat64Exp( a );
4513 aSign = extractFloat64Sign( a );
4514 if ( aExp == 0x7FF ) {
ff32e16e
PM
4515 if (aSig) {
4516 return propagateFloat64NaN(a, a, status);
4517 }
158142c2 4518 if ( ! aSign ) return a;
ff32e16e 4519 float_raise(float_flag_invalid, status);
af39bc8c 4520 return float64_default_nan(status);
158142c2
FB
4521 }
4522 if ( aSign ) {
4523 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 4524 float_raise(float_flag_invalid, status);
af39bc8c 4525 return float64_default_nan(status);
158142c2
FB
4526 }
4527 if ( aExp == 0 ) {
f090c9d4 4528 if ( aSig == 0 ) return float64_zero;
158142c2
FB
4529 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4530 }
4531 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4532 aSig |= LIT64( 0x0010000000000000 );
4533 zSig = estimateSqrt32( aExp, aSig>>21 );
4534 aSig <<= 9 - ( aExp & 1 );
4535 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4536 if ( ( zSig & 0x1FF ) <= 5 ) {
4537 doubleZSig = zSig<<1;
4538 mul64To128( zSig, zSig, &term0, &term1 );
4539 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4540 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4541 --zSig;
4542 doubleZSig -= 2;
4543 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4544 }
4545 zSig |= ( ( rem0 | rem1 ) != 0 );
4546 }
ff32e16e 4547 return roundAndPackFloat64(0, zExp, zSig, status);
158142c2
FB
4548
4549}
4550
374dfc33
AJ
4551/*----------------------------------------------------------------------------
4552| Returns the binary log of the double-precision floating-point value `a'.
4553| The operation is performed according to the IEC/IEEE Standard for Binary
4554| Floating-Point Arithmetic.
4555*----------------------------------------------------------------------------*/
e5a41ffa 4556float64 float64_log2(float64 a, float_status *status)
374dfc33
AJ
4557{
4558 flag aSign, zSign;
0c48262d 4559 int aExp;
bb98fe42 4560 uint64_t aSig, aSig0, aSig1, zSig, i;
ff32e16e 4561 a = float64_squash_input_denormal(a, status);
374dfc33
AJ
4562
4563 aSig = extractFloat64Frac( a );
4564 aExp = extractFloat64Exp( a );
4565 aSign = extractFloat64Sign( a );
4566
4567 if ( aExp == 0 ) {
4568 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4569 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4570 }
4571 if ( aSign ) {
ff32e16e 4572 float_raise(float_flag_invalid, status);
af39bc8c 4573 return float64_default_nan(status);
374dfc33
AJ
4574 }
4575 if ( aExp == 0x7FF ) {
ff32e16e
PM
4576 if (aSig) {
4577 return propagateFloat64NaN(a, float64_zero, status);
4578 }
374dfc33
AJ
4579 return a;
4580 }
4581
4582 aExp -= 0x3FF;
4583 aSig |= LIT64( 0x0010000000000000 );
4584 zSign = aExp < 0;
bb98fe42 4585 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4586 for (i = 1LL << 51; i > 0; i >>= 1) {
4587 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4588 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4589 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4590 aSig >>= 1;
4591 zSig |= i;
4592 }
4593 }
4594
4595 if ( zSign )
4596 zSig = -zSig;
ff32e16e 4597 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
374dfc33
AJ
4598}
4599
158142c2
FB
4600/*----------------------------------------------------------------------------
4601| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4602| corresponding value `b', and 0 otherwise. The invalid exception is raised
4603| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4604| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4605*----------------------------------------------------------------------------*/
4606
e5a41ffa 4607int float64_eq(float64 a, float64 b, float_status *status)
158142c2 4608{
bb98fe42 4609 uint64_t av, bv;
ff32e16e
PM
4610 a = float64_squash_input_denormal(a, status);
4611 b = float64_squash_input_denormal(b, status);
158142c2
FB
4612
4613 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4614 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4615 ) {
ff32e16e 4616 float_raise(float_flag_invalid, status);
158142c2
FB
4617 return 0;
4618 }
f090c9d4 4619 av = float64_val(a);
a1b91bb4 4620 bv = float64_val(b);
bb98fe42 4621 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4622
4623}
4624
4625/*----------------------------------------------------------------------------
4626| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4627| equal to the corresponding value `b', and 0 otherwise. The invalid
4628| exception is raised if either operand is a NaN. The comparison is performed
4629| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4630*----------------------------------------------------------------------------*/
4631
e5a41ffa 4632int float64_le(float64 a, float64 b, float_status *status)
158142c2
FB
4633{
4634 flag aSign, bSign;
bb98fe42 4635 uint64_t av, bv;
ff32e16e
PM
4636 a = float64_squash_input_denormal(a, status);
4637 b = float64_squash_input_denormal(b, status);
158142c2
FB
4638
4639 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4640 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4641 ) {
ff32e16e 4642 float_raise(float_flag_invalid, status);
158142c2
FB
4643 return 0;
4644 }
4645 aSign = extractFloat64Sign( a );
4646 bSign = extractFloat64Sign( b );
f090c9d4 4647 av = float64_val(a);
a1b91bb4 4648 bv = float64_val(b);
bb98fe42 4649 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4650 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4651
4652}
4653
4654/*----------------------------------------------------------------------------
4655| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4656| the corresponding value `b', and 0 otherwise. The invalid exception is
4657| raised if either operand is a NaN. The comparison is performed according
4658| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4659*----------------------------------------------------------------------------*/
4660
e5a41ffa 4661int float64_lt(float64 a, float64 b, float_status *status)
158142c2
FB
4662{
4663 flag aSign, bSign;
bb98fe42 4664 uint64_t av, bv;
158142c2 4665
ff32e16e
PM
4666 a = float64_squash_input_denormal(a, status);
4667 b = float64_squash_input_denormal(b, status);
158142c2
FB
4668 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4669 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4670 ) {
ff32e16e 4671 float_raise(float_flag_invalid, status);
158142c2
FB
4672 return 0;
4673 }
4674 aSign = extractFloat64Sign( a );
4675 bSign = extractFloat64Sign( b );
f090c9d4 4676 av = float64_val(a);
a1b91bb4 4677 bv = float64_val(b);
bb98fe42 4678 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4679 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4680
4681}
4682
67b7861d
AJ
4683/*----------------------------------------------------------------------------
4684| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4685| be compared, and 0 otherwise. The invalid exception is raised if either
4686| operand is a NaN. The comparison is performed according to the IEC/IEEE
4687| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4688*----------------------------------------------------------------------------*/
4689
e5a41ffa 4690int float64_unordered(float64 a, float64 b, float_status *status)
67b7861d 4691{
ff32e16e
PM
4692 a = float64_squash_input_denormal(a, status);
4693 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4694
4695 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4696 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4697 ) {
ff32e16e 4698 float_raise(float_flag_invalid, status);
67b7861d
AJ
4699 return 1;
4700 }
4701 return 0;
4702}
4703
158142c2
FB
4704/*----------------------------------------------------------------------------
4705| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4706| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4707| exception.The comparison is performed according to the IEC/IEEE Standard
4708| for Binary Floating-Point Arithmetic.
158142c2
FB
4709*----------------------------------------------------------------------------*/
4710
e5a41ffa 4711int float64_eq_quiet(float64 a, float64 b, float_status *status)
158142c2 4712{
bb98fe42 4713 uint64_t av, bv;
ff32e16e
PM
4714 a = float64_squash_input_denormal(a, status);
4715 b = float64_squash_input_denormal(b, status);
158142c2
FB
4716
4717 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4718 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4719 ) {
af39bc8c
AM
4720 if (float64_is_signaling_nan(a, status)
4721 || float64_is_signaling_nan(b, status)) {
ff32e16e 4722 float_raise(float_flag_invalid, status);
b689362d 4723 }
158142c2
FB
4724 return 0;
4725 }
f090c9d4 4726 av = float64_val(a);
a1b91bb4 4727 bv = float64_val(b);
bb98fe42 4728 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4729
4730}
4731
4732/*----------------------------------------------------------------------------
4733| Returns 1 if the double-precision floating-point value `a' is less than or
4734| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4735| cause an exception. Otherwise, the comparison is performed according to the
4736| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4737*----------------------------------------------------------------------------*/
4738
e5a41ffa 4739int float64_le_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4740{
4741 flag aSign, bSign;
bb98fe42 4742 uint64_t av, bv;
ff32e16e
PM
4743 a = float64_squash_input_denormal(a, status);
4744 b = float64_squash_input_denormal(b, status);
158142c2
FB
4745
4746 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4747 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4748 ) {
af39bc8c
AM
4749 if (float64_is_signaling_nan(a, status)
4750 || float64_is_signaling_nan(b, status)) {
ff32e16e 4751 float_raise(float_flag_invalid, status);
158142c2
FB
4752 }
4753 return 0;
4754 }
4755 aSign = extractFloat64Sign( a );
4756 bSign = extractFloat64Sign( b );
f090c9d4 4757 av = float64_val(a);
a1b91bb4 4758 bv = float64_val(b);
bb98fe42 4759 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4760 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4761
4762}
4763
4764/*----------------------------------------------------------------------------
4765| Returns 1 if the double-precision floating-point value `a' is less than
4766| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4767| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4768| Standard for Binary Floating-Point Arithmetic.
4769*----------------------------------------------------------------------------*/
4770
e5a41ffa 4771int float64_lt_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4772{
4773 flag aSign, bSign;
bb98fe42 4774 uint64_t av, bv;
ff32e16e
PM
4775 a = float64_squash_input_denormal(a, status);
4776 b = float64_squash_input_denormal(b, status);
158142c2
FB
4777
4778 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4779 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4780 ) {
af39bc8c
AM
4781 if (float64_is_signaling_nan(a, status)
4782 || float64_is_signaling_nan(b, status)) {
ff32e16e 4783 float_raise(float_flag_invalid, status);
158142c2
FB
4784 }
4785 return 0;
4786 }
4787 aSign = extractFloat64Sign( a );
4788 bSign = extractFloat64Sign( b );
f090c9d4 4789 av = float64_val(a);
a1b91bb4 4790 bv = float64_val(b);
bb98fe42 4791 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4792 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4793
4794}
4795
67b7861d
AJ
4796/*----------------------------------------------------------------------------
4797| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4798| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4799| comparison is performed according to the IEC/IEEE Standard for Binary
4800| Floating-Point Arithmetic.
4801*----------------------------------------------------------------------------*/
4802
e5a41ffa 4803int float64_unordered_quiet(float64 a, float64 b, float_status *status)
67b7861d 4804{
ff32e16e
PM
4805 a = float64_squash_input_denormal(a, status);
4806 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4807
4808 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4809 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4810 ) {
af39bc8c
AM
4811 if (float64_is_signaling_nan(a, status)
4812 || float64_is_signaling_nan(b, status)) {
ff32e16e 4813 float_raise(float_flag_invalid, status);
67b7861d
AJ
4814 }
4815 return 1;
4816 }
4817 return 0;
4818}
4819
158142c2
FB
4820/*----------------------------------------------------------------------------
4821| Returns the result of converting the extended double-precision floating-
4822| point value `a' to the 32-bit two's complement integer format. The
4823| conversion is performed according to the IEC/IEEE Standard for Binary
4824| Floating-Point Arithmetic---which means in particular that the conversion
4825| is rounded according to the current rounding mode. If `a' is a NaN, the
4826| largest positive integer is returned. Otherwise, if the conversion
4827| overflows, the largest integer with the same sign as `a' is returned.
4828*----------------------------------------------------------------------------*/
4829
f4014512 4830int32_t floatx80_to_int32(floatx80 a, float_status *status)
158142c2
FB
4831{
4832 flag aSign;
f4014512 4833 int32_t aExp, shiftCount;
bb98fe42 4834 uint64_t aSig;
158142c2 4835
d1eb8f2a
AD
4836 if (floatx80_invalid_encoding(a)) {
4837 float_raise(float_flag_invalid, status);
4838 return 1 << 31;
4839 }
158142c2
FB
4840 aSig = extractFloatx80Frac( a );
4841 aExp = extractFloatx80Exp( a );
4842 aSign = extractFloatx80Sign( a );
bb98fe42 4843 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4844 shiftCount = 0x4037 - aExp;
4845 if ( shiftCount <= 0 ) shiftCount = 1;
4846 shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 4847 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
4848
4849}
4850
4851/*----------------------------------------------------------------------------
4852| Returns the result of converting the extended double-precision floating-
4853| point value `a' to the 32-bit two's complement integer format. The
4854| conversion is performed according to the IEC/IEEE Standard for Binary
4855| Floating-Point Arithmetic, except that the conversion is always rounded
4856| toward zero. If `a' is a NaN, the largest positive integer is returned.
4857| Otherwise, if the conversion overflows, the largest integer with the same
4858| sign as `a' is returned.
4859*----------------------------------------------------------------------------*/
4860
f4014512 4861int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4862{
4863 flag aSign;
f4014512 4864 int32_t aExp, shiftCount;
bb98fe42 4865 uint64_t aSig, savedASig;
b3a6a2e0 4866 int32_t z;
158142c2 4867
d1eb8f2a
AD
4868 if (floatx80_invalid_encoding(a)) {
4869 float_raise(float_flag_invalid, status);
4870 return 1 << 31;
4871 }
158142c2
FB
4872 aSig = extractFloatx80Frac( a );
4873 aExp = extractFloatx80Exp( a );
4874 aSign = extractFloatx80Sign( a );
4875 if ( 0x401E < aExp ) {
bb98fe42 4876 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4877 goto invalid;
4878 }
4879 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4880 if (aExp || aSig) {
4881 status->float_exception_flags |= float_flag_inexact;
4882 }
158142c2
FB
4883 return 0;
4884 }
4885 shiftCount = 0x403E - aExp;
4886 savedASig = aSig;
4887 aSig >>= shiftCount;
4888 z = aSig;
4889 if ( aSign ) z = - z;
4890 if ( ( z < 0 ) ^ aSign ) {
4891 invalid:
ff32e16e 4892 float_raise(float_flag_invalid, status);
bb98fe42 4893 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4894 }
4895 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 4896 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4897 }
4898 return z;
4899
4900}
4901
4902/*----------------------------------------------------------------------------
4903| Returns the result of converting the extended double-precision floating-
4904| point value `a' to the 64-bit two's complement integer format. The
4905| conversion is performed according to the IEC/IEEE Standard for Binary
4906| Floating-Point Arithmetic---which means in particular that the conversion
4907| is rounded according to the current rounding mode. If `a' is a NaN,
4908| the largest positive integer is returned. Otherwise, if the conversion
4909| overflows, the largest integer with the same sign as `a' is returned.
4910*----------------------------------------------------------------------------*/
4911
f42c2224 4912int64_t floatx80_to_int64(floatx80 a, float_status *status)
158142c2
FB
4913{
4914 flag aSign;
f4014512 4915 int32_t aExp, shiftCount;
bb98fe42 4916 uint64_t aSig, aSigExtra;
158142c2 4917
d1eb8f2a
AD
4918 if (floatx80_invalid_encoding(a)) {
4919 float_raise(float_flag_invalid, status);
4920 return 1ULL << 63;
4921 }
158142c2
FB
4922 aSig = extractFloatx80Frac( a );
4923 aExp = extractFloatx80Exp( a );
4924 aSign = extractFloatx80Sign( a );
4925 shiftCount = 0x403E - aExp;
4926 if ( shiftCount <= 0 ) {
4927 if ( shiftCount ) {
ff32e16e 4928 float_raise(float_flag_invalid, status);
158142c2
FB
4929 if ( ! aSign
4930 || ( ( aExp == 0x7FFF )
4931 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4932 ) {
4933 return LIT64( 0x7FFFFFFFFFFFFFFF );
4934 }
bb98fe42 4935 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4936 }
4937 aSigExtra = 0;
4938 }
4939 else {
4940 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4941 }
ff32e16e 4942 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
4943
4944}
4945
4946/*----------------------------------------------------------------------------
4947| Returns the result of converting the extended double-precision floating-
4948| point value `a' to the 64-bit two's complement integer format. The
4949| conversion is performed according to the IEC/IEEE Standard for Binary
4950| Floating-Point Arithmetic, except that the conversion is always rounded
4951| toward zero. If `a' is a NaN, the largest positive integer is returned.
4952| Otherwise, if the conversion overflows, the largest integer with the same
4953| sign as `a' is returned.
4954*----------------------------------------------------------------------------*/
4955
f42c2224 4956int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4957{
4958 flag aSign;
f4014512 4959 int32_t aExp, shiftCount;
bb98fe42 4960 uint64_t aSig;
f42c2224 4961 int64_t z;
158142c2 4962
d1eb8f2a
AD
4963 if (floatx80_invalid_encoding(a)) {
4964 float_raise(float_flag_invalid, status);
4965 return 1ULL << 63;
4966 }
158142c2
FB
4967 aSig = extractFloatx80Frac( a );
4968 aExp = extractFloatx80Exp( a );
4969 aSign = extractFloatx80Sign( a );
4970 shiftCount = aExp - 0x403E;
4971 if ( 0 <= shiftCount ) {
4972 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4973 if ( ( a.high != 0xC03E ) || aSig ) {
ff32e16e 4974 float_raise(float_flag_invalid, status);
158142c2
FB
4975 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4976 return LIT64( 0x7FFFFFFFFFFFFFFF );
4977 }
4978 }
bb98fe42 4979 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4980 }
4981 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4982 if (aExp | aSig) {
4983 status->float_exception_flags |= float_flag_inexact;
4984 }
158142c2
FB
4985 return 0;
4986 }
4987 z = aSig>>( - shiftCount );
bb98fe42 4988 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 4989 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4990 }
4991 if ( aSign ) z = - z;
4992 return z;
4993
4994}
4995
4996/*----------------------------------------------------------------------------
4997| Returns the result of converting the extended double-precision floating-
4998| point value `a' to the single-precision floating-point format. The
4999| conversion is performed according to the IEC/IEEE Standard for Binary
5000| Floating-Point Arithmetic.
5001*----------------------------------------------------------------------------*/
5002
e5a41ffa 5003float32 floatx80_to_float32(floatx80 a, float_status *status)
158142c2
FB
5004{
5005 flag aSign;
f4014512 5006 int32_t aExp;
bb98fe42 5007 uint64_t aSig;
158142c2 5008
d1eb8f2a
AD
5009 if (floatx80_invalid_encoding(a)) {
5010 float_raise(float_flag_invalid, status);
5011 return float32_default_nan(status);
5012 }
158142c2
FB
5013 aSig = extractFloatx80Frac( a );
5014 aExp = extractFloatx80Exp( a );
5015 aSign = extractFloatx80Sign( a );
5016 if ( aExp == 0x7FFF ) {
bb98fe42 5017 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 5018 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
158142c2
FB
5019 }
5020 return packFloat32( aSign, 0xFF, 0 );
5021 }
5022 shift64RightJamming( aSig, 33, &aSig );
5023 if ( aExp || aSig ) aExp -= 0x3F81;
ff32e16e 5024 return roundAndPackFloat32(aSign, aExp, aSig, status);
158142c2
FB
5025
5026}
5027
5028/*----------------------------------------------------------------------------
5029| Returns the result of converting the extended double-precision floating-
5030| point value `a' to the double-precision floating-point format. The
5031| conversion is performed according to the IEC/IEEE Standard for Binary
5032| Floating-Point Arithmetic.
5033*----------------------------------------------------------------------------*/
5034
e5a41ffa 5035float64 floatx80_to_float64(floatx80 a, float_status *status)
158142c2
FB
5036{
5037 flag aSign;
f4014512 5038 int32_t aExp;
bb98fe42 5039 uint64_t aSig, zSig;
158142c2 5040
d1eb8f2a
AD
5041 if (floatx80_invalid_encoding(a)) {
5042 float_raise(float_flag_invalid, status);
5043 return float64_default_nan(status);
5044 }
158142c2
FB
5045 aSig = extractFloatx80Frac( a );
5046 aExp = extractFloatx80Exp( a );
5047 aSign = extractFloatx80Sign( a );
5048 if ( aExp == 0x7FFF ) {
bb98fe42 5049 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 5050 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
158142c2
FB
5051 }
5052 return packFloat64( aSign, 0x7FF, 0 );
5053 }
5054 shift64RightJamming( aSig, 1, &zSig );
5055 if ( aExp || aSig ) aExp -= 0x3C01;
ff32e16e 5056 return roundAndPackFloat64(aSign, aExp, zSig, status);
158142c2
FB
5057
5058}
5059
158142c2
FB
5060/*----------------------------------------------------------------------------
5061| Returns the result of converting the extended double-precision floating-
5062| point value `a' to the quadruple-precision floating-point format. The
5063| conversion is performed according to the IEC/IEEE Standard for Binary
5064| Floating-Point Arithmetic.
5065*----------------------------------------------------------------------------*/
5066
e5a41ffa 5067float128 floatx80_to_float128(floatx80 a, float_status *status)
158142c2
FB
5068{
5069 flag aSign;
0c48262d 5070 int aExp;
bb98fe42 5071 uint64_t aSig, zSig0, zSig1;
158142c2 5072
d1eb8f2a
AD
5073 if (floatx80_invalid_encoding(a)) {
5074 float_raise(float_flag_invalid, status);
5075 return float128_default_nan(status);
5076 }
158142c2
FB
5077 aSig = extractFloatx80Frac( a );
5078 aExp = extractFloatx80Exp( a );
5079 aSign = extractFloatx80Sign( a );
bb98fe42 5080 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
ff32e16e 5081 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
158142c2
FB
5082 }
5083 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5084 return packFloat128( aSign, aExp, zSig0, zSig1 );
5085
5086}
5087
0f721292
LV
5088/*----------------------------------------------------------------------------
5089| Rounds the extended double-precision floating-point value `a'
5090| to the precision provided by floatx80_rounding_precision and returns the
5091| result as an extended double-precision floating-point value.
5092| The operation is performed according to the IEC/IEEE Standard for Binary
5093| Floating-Point Arithmetic.
5094*----------------------------------------------------------------------------*/
5095
5096floatx80 floatx80_round(floatx80 a, float_status *status)
5097{
5098 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5099 extractFloatx80Sign(a),
5100 extractFloatx80Exp(a),
5101 extractFloatx80Frac(a), 0, status);
5102}
5103
158142c2
FB
5104/*----------------------------------------------------------------------------
5105| Rounds the extended double-precision floating-point value `a' to an integer,
5106| and returns the result as an extended double-precision floating-point
5107| value. The operation is performed according to the IEC/IEEE Standard for
5108| Binary Floating-Point Arithmetic.
5109*----------------------------------------------------------------------------*/
5110
e5a41ffa 5111floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
158142c2
FB
5112{
5113 flag aSign;
f4014512 5114 int32_t aExp;
bb98fe42 5115 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5116 floatx80 z;
5117
d1eb8f2a
AD
5118 if (floatx80_invalid_encoding(a)) {
5119 float_raise(float_flag_invalid, status);
5120 return floatx80_default_nan(status);
5121 }
158142c2
FB
5122 aExp = extractFloatx80Exp( a );
5123 if ( 0x403E <= aExp ) {
bb98fe42 5124 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
ff32e16e 5125 return propagateFloatx80NaN(a, a, status);
158142c2
FB
5126 }
5127 return a;
5128 }
5129 if ( aExp < 0x3FFF ) {
5130 if ( ( aExp == 0 )
bb98fe42 5131 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
5132 return a;
5133 }
a2f2d288 5134 status->float_exception_flags |= float_flag_inexact;
158142c2 5135 aSign = extractFloatx80Sign( a );
a2f2d288 5136 switch (status->float_rounding_mode) {
158142c2 5137 case float_round_nearest_even:
bb98fe42 5138 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
5139 ) {
5140 return
5141 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5142 }
5143 break;
f9288a76
PM
5144 case float_round_ties_away:
5145 if (aExp == 0x3FFE) {
5146 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5147 }
5148 break;
158142c2
FB
5149 case float_round_down:
5150 return
5151 aSign ?
5152 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5153 : packFloatx80( 0, 0, 0 );
5154 case float_round_up:
5155 return
5156 aSign ? packFloatx80( 1, 0, 0 )
5157 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5158 }
5159 return packFloatx80( aSign, 0, 0 );
5160 }
5161 lastBitMask = 1;
5162 lastBitMask <<= 0x403E - aExp;
5163 roundBitsMask = lastBitMask - 1;
5164 z = a;
a2f2d288 5165 switch (status->float_rounding_mode) {
dc355b76 5166 case float_round_nearest_even:
158142c2 5167 z.low += lastBitMask>>1;
dc355b76
PM
5168 if ((z.low & roundBitsMask) == 0) {
5169 z.low &= ~lastBitMask;
5170 }
5171 break;
f9288a76
PM
5172 case float_round_ties_away:
5173 z.low += lastBitMask >> 1;
5174 break;
dc355b76
PM
5175 case float_round_to_zero:
5176 break;
5177 case float_round_up:
5178 if (!extractFloatx80Sign(z)) {
5179 z.low += roundBitsMask;
5180 }
5181 break;
5182 case float_round_down:
5183 if (extractFloatx80Sign(z)) {
158142c2
FB
5184 z.low += roundBitsMask;
5185 }
dc355b76
PM
5186 break;
5187 default:
5188 abort();
158142c2
FB
5189 }
5190 z.low &= ~ roundBitsMask;
5191 if ( z.low == 0 ) {
5192 ++z.high;
5193 z.low = LIT64( 0x8000000000000000 );
5194 }
a2f2d288
PM
5195 if (z.low != a.low) {
5196 status->float_exception_flags |= float_flag_inexact;
5197 }
158142c2
FB
5198 return z;
5199
5200}
5201
5202/*----------------------------------------------------------------------------
5203| Returns the result of adding the absolute values of the extended double-
5204| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5205| negated before being returned. `zSign' is ignored if the result is a NaN.
5206| The addition is performed according to the IEC/IEEE Standard for Binary
5207| Floating-Point Arithmetic.
5208*----------------------------------------------------------------------------*/
5209
e5a41ffa
PM
5210static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5211 float_status *status)
158142c2 5212{
f4014512 5213 int32_t aExp, bExp, zExp;
bb98fe42 5214 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 5215 int32_t expDiff;
158142c2
FB
5216
5217 aSig = extractFloatx80Frac( a );
5218 aExp = extractFloatx80Exp( a );
5219 bSig = extractFloatx80Frac( b );
5220 bExp = extractFloatx80Exp( b );
5221 expDiff = aExp - bExp;
5222 if ( 0 < expDiff ) {
5223 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5224 if ((uint64_t)(aSig << 1)) {
5225 return propagateFloatx80NaN(a, b, status);
5226 }
158142c2
FB
5227 return a;
5228 }
5229 if ( bExp == 0 ) --expDiff;
5230 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5231 zExp = aExp;
5232 }
5233 else if ( expDiff < 0 ) {
5234 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5235 if ((uint64_t)(bSig << 1)) {
5236 return propagateFloatx80NaN(a, b, status);
5237 }
158142c2
FB
5238 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5239 }
5240 if ( aExp == 0 ) ++expDiff;
5241 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5242 zExp = bExp;
5243 }
5244 else {
5245 if ( aExp == 0x7FFF ) {
bb98fe42 5246 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5247 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5248 }
5249 return a;
5250 }
5251 zSig1 = 0;
5252 zSig0 = aSig + bSig;
5253 if ( aExp == 0 ) {
5254 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5255 goto roundAndPack;
5256 }
5257 zExp = aExp;
5258 goto shiftRight1;
5259 }
5260 zSig0 = aSig + bSig;
bb98fe42 5261 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
5262 shiftRight1:
5263 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5264 zSig0 |= LIT64( 0x8000000000000000 );
5265 ++zExp;
5266 roundAndPack:
a2f2d288 5267 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5268 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5269}
5270
5271/*----------------------------------------------------------------------------
5272| Returns the result of subtracting the absolute values of the extended
5273| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5274| difference is negated before being returned. `zSign' is ignored if the
5275| result is a NaN. The subtraction is performed according to the IEC/IEEE
5276| Standard for Binary Floating-Point Arithmetic.
5277*----------------------------------------------------------------------------*/
5278
e5a41ffa
PM
5279static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5280 float_status *status)
158142c2 5281{
f4014512 5282 int32_t aExp, bExp, zExp;
bb98fe42 5283 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 5284 int32_t expDiff;
158142c2
FB
5285
5286 aSig = extractFloatx80Frac( a );
5287 aExp = extractFloatx80Exp( a );
5288 bSig = extractFloatx80Frac( b );
5289 bExp = extractFloatx80Exp( b );
5290 expDiff = aExp - bExp;
5291 if ( 0 < expDiff ) goto aExpBigger;
5292 if ( expDiff < 0 ) goto bExpBigger;
5293 if ( aExp == 0x7FFF ) {
bb98fe42 5294 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5295 return propagateFloatx80NaN(a, b, status);
158142c2 5296 }
ff32e16e 5297 float_raise(float_flag_invalid, status);
af39bc8c 5298 return floatx80_default_nan(status);
158142c2
FB
5299 }
5300 if ( aExp == 0 ) {
5301 aExp = 1;
5302 bExp = 1;
5303 }
5304 zSig1 = 0;
5305 if ( bSig < aSig ) goto aBigger;
5306 if ( aSig < bSig ) goto bBigger;
a2f2d288 5307 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
5308 bExpBigger:
5309 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5310 if ((uint64_t)(bSig << 1)) {
5311 return propagateFloatx80NaN(a, b, status);
5312 }
158142c2
FB
5313 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5314 }
5315 if ( aExp == 0 ) ++expDiff;
5316 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5317 bBigger:
5318 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5319 zExp = bExp;
5320 zSign ^= 1;
5321 goto normalizeRoundAndPack;
5322 aExpBigger:
5323 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5324 if ((uint64_t)(aSig << 1)) {
5325 return propagateFloatx80NaN(a, b, status);
5326 }
158142c2
FB
5327 return a;
5328 }
5329 if ( bExp == 0 ) --expDiff;
5330 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5331 aBigger:
5332 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5333 zExp = aExp;
5334 normalizeRoundAndPack:
a2f2d288 5335 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5336 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5337}
5338
5339/*----------------------------------------------------------------------------
5340| Returns the result of adding the extended double-precision floating-point
5341| values `a' and `b'. The operation is performed according to the IEC/IEEE
5342| Standard for Binary Floating-Point Arithmetic.
5343*----------------------------------------------------------------------------*/
5344
e5a41ffa 5345floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5346{
5347 flag aSign, bSign;
5348
d1eb8f2a
AD
5349 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5350 float_raise(float_flag_invalid, status);
5351 return floatx80_default_nan(status);
5352 }
158142c2
FB
5353 aSign = extractFloatx80Sign( a );
5354 bSign = extractFloatx80Sign( b );
5355 if ( aSign == bSign ) {
ff32e16e 5356 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5357 }
5358 else {
ff32e16e 5359 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5360 }
5361
5362}
5363
5364/*----------------------------------------------------------------------------
5365| Returns the result of subtracting the extended double-precision floating-
5366| point values `a' and `b'. The operation is performed according to the
5367| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5368*----------------------------------------------------------------------------*/
5369
e5a41ffa 5370floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5371{
5372 flag aSign, bSign;
5373
d1eb8f2a
AD
5374 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5375 float_raise(float_flag_invalid, status);
5376 return floatx80_default_nan(status);
5377 }
158142c2
FB
5378 aSign = extractFloatx80Sign( a );
5379 bSign = extractFloatx80Sign( b );
5380 if ( aSign == bSign ) {
ff32e16e 5381 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5382 }
5383 else {
ff32e16e 5384 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5385 }
5386
5387}
5388
5389/*----------------------------------------------------------------------------
5390| Returns the result of multiplying the extended double-precision floating-
5391| point values `a' and `b'. The operation is performed according to the
5392| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5393*----------------------------------------------------------------------------*/
5394
e5a41ffa 5395floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5396{
5397 flag aSign, bSign, zSign;
f4014512 5398 int32_t aExp, bExp, zExp;
bb98fe42 5399 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 5400
d1eb8f2a
AD
5401 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5402 float_raise(float_flag_invalid, status);
5403 return floatx80_default_nan(status);
5404 }
158142c2
FB
5405 aSig = extractFloatx80Frac( a );
5406 aExp = extractFloatx80Exp( a );
5407 aSign = extractFloatx80Sign( a );
5408 bSig = extractFloatx80Frac( b );
5409 bExp = extractFloatx80Exp( b );
5410 bSign = extractFloatx80Sign( b );
5411 zSign = aSign ^ bSign;
5412 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5413 if ( (uint64_t) ( aSig<<1 )
5414 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5415 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5416 }
5417 if ( ( bExp | bSig ) == 0 ) goto invalid;
5418 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5419 }
5420 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5421 if ((uint64_t)(bSig << 1)) {
5422 return propagateFloatx80NaN(a, b, status);
5423 }
158142c2
FB
5424 if ( ( aExp | aSig ) == 0 ) {
5425 invalid:
ff32e16e 5426 float_raise(float_flag_invalid, status);
af39bc8c 5427 return floatx80_default_nan(status);
158142c2
FB
5428 }
5429 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5430 }
5431 if ( aExp == 0 ) {
5432 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5433 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5434 }
5435 if ( bExp == 0 ) {
5436 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5437 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5438 }
5439 zExp = aExp + bExp - 0x3FFE;
5440 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 5441 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
5442 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5443 --zExp;
5444 }
a2f2d288 5445 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5446 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5447}
5448
5449/*----------------------------------------------------------------------------
5450| Returns the result of dividing the extended double-precision floating-point
5451| value `a' by the corresponding value `b'. The operation is performed
5452| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5453*----------------------------------------------------------------------------*/
5454
e5a41ffa 5455floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5456{
5457 flag aSign, bSign, zSign;
f4014512 5458 int32_t aExp, bExp, zExp;
bb98fe42
AF
5459 uint64_t aSig, bSig, zSig0, zSig1;
5460 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2 5461
d1eb8f2a
AD
5462 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5463 float_raise(float_flag_invalid, status);
5464 return floatx80_default_nan(status);
5465 }
158142c2
FB
5466 aSig = extractFloatx80Frac( a );
5467 aExp = extractFloatx80Exp( a );
5468 aSign = extractFloatx80Sign( a );
5469 bSig = extractFloatx80Frac( b );
5470 bExp = extractFloatx80Exp( b );
5471 bSign = extractFloatx80Sign( b );
5472 zSign = aSign ^ bSign;
5473 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5474 if ((uint64_t)(aSig << 1)) {
5475 return propagateFloatx80NaN(a, b, status);
5476 }
158142c2 5477 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5478 if ((uint64_t)(bSig << 1)) {
5479 return propagateFloatx80NaN(a, b, status);
5480 }
158142c2
FB
5481 goto invalid;
5482 }
5483 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5484 }
5485 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5486 if ((uint64_t)(bSig << 1)) {
5487 return propagateFloatx80NaN(a, b, status);
5488 }
158142c2
FB
5489 return packFloatx80( zSign, 0, 0 );
5490 }
5491 if ( bExp == 0 ) {
5492 if ( bSig == 0 ) {
5493 if ( ( aExp | aSig ) == 0 ) {
5494 invalid:
ff32e16e 5495 float_raise(float_flag_invalid, status);
af39bc8c 5496 return floatx80_default_nan(status);
158142c2 5497 }
ff32e16e 5498 float_raise(float_flag_divbyzero, status);
158142c2
FB
5499 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5500 }
5501 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5502 }
5503 if ( aExp == 0 ) {
5504 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5505 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5506 }
5507 zExp = aExp - bExp + 0x3FFE;
5508 rem1 = 0;
5509 if ( bSig <= aSig ) {
5510 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5511 ++zExp;
5512 }
5513 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5514 mul64To128( bSig, zSig0, &term0, &term1 );
5515 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 5516 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5517 --zSig0;
5518 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5519 }
5520 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 5521 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
5522 mul64To128( bSig, zSig1, &term1, &term2 );
5523 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 5524 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5525 --zSig1;
5526 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5527 }
5528 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5529 }
a2f2d288 5530 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5531 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5532}
5533
5534/*----------------------------------------------------------------------------
5535| Returns the remainder of the extended double-precision floating-point value
5536| `a' with respect to the corresponding value `b'. The operation is performed
5537| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5538*----------------------------------------------------------------------------*/
5539
e5a41ffa 5540floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
158142c2 5541{
ed086f3d 5542 flag aSign, zSign;
f4014512 5543 int32_t aExp, bExp, expDiff;
bb98fe42
AF
5544 uint64_t aSig0, aSig1, bSig;
5545 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2 5546
d1eb8f2a
AD
5547 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5548 float_raise(float_flag_invalid, status);
5549 return floatx80_default_nan(status);
5550 }
158142c2
FB
5551 aSig0 = extractFloatx80Frac( a );
5552 aExp = extractFloatx80Exp( a );
5553 aSign = extractFloatx80Sign( a );
5554 bSig = extractFloatx80Frac( b );
5555 bExp = extractFloatx80Exp( b );
158142c2 5556 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5557 if ( (uint64_t) ( aSig0<<1 )
5558 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5559 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5560 }
5561 goto invalid;
5562 }
5563 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5564 if ((uint64_t)(bSig << 1)) {
5565 return propagateFloatx80NaN(a, b, status);
5566 }
158142c2
FB
5567 return a;
5568 }
5569 if ( bExp == 0 ) {
5570 if ( bSig == 0 ) {
5571 invalid:
ff32e16e 5572 float_raise(float_flag_invalid, status);
af39bc8c 5573 return floatx80_default_nan(status);
158142c2
FB
5574 }
5575 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5576 }
5577 if ( aExp == 0 ) {
bb98fe42 5578 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5579 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5580 }
5581 bSig |= LIT64( 0x8000000000000000 );
5582 zSign = aSign;
5583 expDiff = aExp - bExp;
5584 aSig1 = 0;
5585 if ( expDiff < 0 ) {
5586 if ( expDiff < -1 ) return a;
5587 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5588 expDiff = 0;
5589 }
5590 q = ( bSig <= aSig0 );
5591 if ( q ) aSig0 -= bSig;
5592 expDiff -= 64;
5593 while ( 0 < expDiff ) {
5594 q = estimateDiv128To64( aSig0, aSig1, bSig );
5595 q = ( 2 < q ) ? q - 2 : 0;
5596 mul64To128( bSig, q, &term0, &term1 );
5597 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5598 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5599 expDiff -= 62;
5600 }
5601 expDiff += 64;
5602 if ( 0 < expDiff ) {
5603 q = estimateDiv128To64( aSig0, aSig1, bSig );
5604 q = ( 2 < q ) ? q - 2 : 0;
5605 q >>= 64 - expDiff;
5606 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5607 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5608 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5609 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5610 ++q;
5611 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5612 }
5613 }
5614 else {
5615 term1 = 0;
5616 term0 = bSig;
5617 }
5618 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5619 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5620 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5621 && ( q & 1 ) )
5622 ) {
5623 aSig0 = alternateASig0;
5624 aSig1 = alternateASig1;
5625 zSign = ! zSign;
5626 }
5627 return
5628 normalizeRoundAndPackFloatx80(
ff32e16e 5629 80, zSign, bExp + expDiff, aSig0, aSig1, status);
158142c2
FB
5630
5631}
5632
5633/*----------------------------------------------------------------------------
5634| Returns the square root of the extended double-precision floating-point
5635| value `a'. The operation is performed according to the IEC/IEEE Standard
5636| for Binary Floating-Point Arithmetic.
5637*----------------------------------------------------------------------------*/
5638
e5a41ffa 5639floatx80 floatx80_sqrt(floatx80 a, float_status *status)
158142c2
FB
5640{
5641 flag aSign;
f4014512 5642 int32_t aExp, zExp;
bb98fe42
AF
5643 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5644 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2 5645
d1eb8f2a
AD
5646 if (floatx80_invalid_encoding(a)) {
5647 float_raise(float_flag_invalid, status);
5648 return floatx80_default_nan(status);
5649 }
158142c2
FB
5650 aSig0 = extractFloatx80Frac( a );
5651 aExp = extractFloatx80Exp( a );
5652 aSign = extractFloatx80Sign( a );
5653 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5654 if ((uint64_t)(aSig0 << 1)) {
5655 return propagateFloatx80NaN(a, a, status);
5656 }
158142c2
FB
5657 if ( ! aSign ) return a;
5658 goto invalid;
5659 }
5660 if ( aSign ) {
5661 if ( ( aExp | aSig0 ) == 0 ) return a;
5662 invalid:
ff32e16e 5663 float_raise(float_flag_invalid, status);
af39bc8c 5664 return floatx80_default_nan(status);
158142c2
FB
5665 }
5666 if ( aExp == 0 ) {
5667 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5668 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5669 }
5670 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5671 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5672 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5673 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5674 doubleZSig0 = zSig0<<1;
5675 mul64To128( zSig0, zSig0, &term0, &term1 );
5676 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5677 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5678 --zSig0;
5679 doubleZSig0 -= 2;
5680 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5681 }
5682 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5683 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5684 if ( zSig1 == 0 ) zSig1 = 1;
5685 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5686 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5687 mul64To128( zSig1, zSig1, &term2, &term3 );
5688 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5689 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5690 --zSig1;
5691 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5692 term3 |= 1;
5693 term2 |= doubleZSig0;
5694 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5695 }
5696 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5697 }
5698 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5699 zSig0 |= doubleZSig0;
a2f2d288
PM
5700 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5701 0, zExp, zSig0, zSig1, status);
158142c2
FB
5702}
5703
5704/*----------------------------------------------------------------------------
b689362d
AJ
5705| Returns 1 if the extended double-precision floating-point value `a' is equal
5706| to the corresponding value `b', and 0 otherwise. The invalid exception is
5707| raised if either operand is a NaN. Otherwise, the comparison is performed
5708| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5709*----------------------------------------------------------------------------*/
5710
e5a41ffa 5711int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5712{
5713
d1eb8f2a
AD
5714 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5715 || (extractFloatx80Exp(a) == 0x7FFF
5716 && (uint64_t) (extractFloatx80Frac(a) << 1))
5717 || (extractFloatx80Exp(b) == 0x7FFF
5718 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5719 ) {
ff32e16e 5720 float_raise(float_flag_invalid, status);
158142c2
FB
5721 return 0;
5722 }
5723 return
5724 ( a.low == b.low )
5725 && ( ( a.high == b.high )
5726 || ( ( a.low == 0 )
bb98fe42 5727 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5728 );
5729
5730}
5731
5732/*----------------------------------------------------------------------------
5733| Returns 1 if the extended double-precision floating-point value `a' is
5734| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5735| invalid exception is raised if either operand is a NaN. The comparison is
5736| performed according to the IEC/IEEE Standard for Binary Floating-Point
5737| Arithmetic.
158142c2
FB
5738*----------------------------------------------------------------------------*/
5739
e5a41ffa 5740int floatx80_le(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5741{
5742 flag aSign, bSign;
5743
d1eb8f2a
AD
5744 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5745 || (extractFloatx80Exp(a) == 0x7FFF
5746 && (uint64_t) (extractFloatx80Frac(a) << 1))
5747 || (extractFloatx80Exp(b) == 0x7FFF
5748 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5749 ) {
ff32e16e 5750 float_raise(float_flag_invalid, status);
158142c2
FB
5751 return 0;
5752 }
5753 aSign = extractFloatx80Sign( a );
5754 bSign = extractFloatx80Sign( b );
5755 if ( aSign != bSign ) {
5756 return
5757 aSign
bb98fe42 5758 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5759 == 0 );
5760 }
5761 return
5762 aSign ? le128( b.high, b.low, a.high, a.low )
5763 : le128( a.high, a.low, b.high, b.low );
5764
5765}
5766
5767/*----------------------------------------------------------------------------
5768| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5769| less than the corresponding value `b', and 0 otherwise. The invalid
5770| exception is raised if either operand is a NaN. The comparison is performed
5771| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5772*----------------------------------------------------------------------------*/
5773
e5a41ffa 5774int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5775{
5776 flag aSign, bSign;
5777
d1eb8f2a
AD
5778 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5779 || (extractFloatx80Exp(a) == 0x7FFF
5780 && (uint64_t) (extractFloatx80Frac(a) << 1))
5781 || (extractFloatx80Exp(b) == 0x7FFF
5782 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5783 ) {
ff32e16e 5784 float_raise(float_flag_invalid, status);
158142c2
FB
5785 return 0;
5786 }
5787 aSign = extractFloatx80Sign( a );
5788 bSign = extractFloatx80Sign( b );
5789 if ( aSign != bSign ) {
5790 return
5791 aSign
bb98fe42 5792 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5793 != 0 );
5794 }
5795 return
5796 aSign ? lt128( b.high, b.low, a.high, a.low )
5797 : lt128( a.high, a.low, b.high, b.low );
5798
5799}
5800
67b7861d
AJ
5801/*----------------------------------------------------------------------------
5802| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5803| cannot be compared, and 0 otherwise. The invalid exception is raised if
5804| either operand is a NaN. The comparison is performed according to the
5805| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d 5806*----------------------------------------------------------------------------*/
e5a41ffa 5807int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
67b7861d 5808{
d1eb8f2a
AD
5809 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5810 || (extractFloatx80Exp(a) == 0x7FFF
5811 && (uint64_t) (extractFloatx80Frac(a) << 1))
5812 || (extractFloatx80Exp(b) == 0x7FFF
5813 && (uint64_t) (extractFloatx80Frac(b) << 1))
67b7861d 5814 ) {
ff32e16e 5815 float_raise(float_flag_invalid, status);
67b7861d
AJ
5816 return 1;
5817 }
5818 return 0;
5819}
5820
158142c2 5821/*----------------------------------------------------------------------------
b689362d 5822| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5823| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5824| cause an exception. The comparison is performed according to the IEC/IEEE
5825| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5826*----------------------------------------------------------------------------*/
5827
e5a41ffa 5828int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5829{
5830
d1eb8f2a
AD
5831 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5832 float_raise(float_flag_invalid, status);
5833 return 0;
5834 }
158142c2 5835 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5836 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5837 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5838 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5839 ) {
af39bc8c
AM
5840 if (floatx80_is_signaling_nan(a, status)
5841 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5842 float_raise(float_flag_invalid, status);
b689362d 5843 }
158142c2
FB
5844 return 0;
5845 }
5846 return
5847 ( a.low == b.low )
5848 && ( ( a.high == b.high )
5849 || ( ( a.low == 0 )
bb98fe42 5850 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5851 );
5852
5853}
5854
5855/*----------------------------------------------------------------------------
5856| Returns 1 if the extended double-precision floating-point value `a' is less
5857| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5858| do not cause an exception. Otherwise, the comparison is performed according
5859| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5860*----------------------------------------------------------------------------*/
5861
e5a41ffa 5862int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5863{
5864 flag aSign, bSign;
5865
d1eb8f2a
AD
5866 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5867 float_raise(float_flag_invalid, status);
5868 return 0;
5869 }
158142c2 5870 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5871 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5872 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5873 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5874 ) {
af39bc8c
AM
5875 if (floatx80_is_signaling_nan(a, status)
5876 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5877 float_raise(float_flag_invalid, status);
158142c2
FB
5878 }
5879 return 0;
5880 }
5881 aSign = extractFloatx80Sign( a );
5882 bSign = extractFloatx80Sign( b );
5883 if ( aSign != bSign ) {
5884 return
5885 aSign
bb98fe42 5886 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5887 == 0 );
5888 }
5889 return
5890 aSign ? le128( b.high, b.low, a.high, a.low )
5891 : le128( a.high, a.low, b.high, b.low );
5892
5893}
5894
5895/*----------------------------------------------------------------------------
5896| Returns 1 if the extended double-precision floating-point value `a' is less
5897| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5898| an exception. Otherwise, the comparison is performed according to the
5899| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5900*----------------------------------------------------------------------------*/
5901
e5a41ffa 5902int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5903{
5904 flag aSign, bSign;
5905
d1eb8f2a
AD
5906 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5907 float_raise(float_flag_invalid, status);
5908 return 0;
5909 }
158142c2 5910 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5911 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5912 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5913 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5914 ) {
af39bc8c
AM
5915 if (floatx80_is_signaling_nan(a, status)
5916 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5917 float_raise(float_flag_invalid, status);
158142c2
FB
5918 }
5919 return 0;
5920 }
5921 aSign = extractFloatx80Sign( a );
5922 bSign = extractFloatx80Sign( b );
5923 if ( aSign != bSign ) {
5924 return
5925 aSign
bb98fe42 5926 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5927 != 0 );
5928 }
5929 return
5930 aSign ? lt128( b.high, b.low, a.high, a.low )
5931 : lt128( a.high, a.low, b.high, b.low );
5932
5933}
5934
67b7861d
AJ
5935/*----------------------------------------------------------------------------
5936| Returns 1 if the extended double-precision floating-point values `a' and `b'
5937| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5938| The comparison is performed according to the IEC/IEEE Standard for Binary
5939| Floating-Point Arithmetic.
5940*----------------------------------------------------------------------------*/
e5a41ffa 5941int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
67b7861d 5942{
d1eb8f2a
AD
5943 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5944 float_raise(float_flag_invalid, status);
5945 return 1;
5946 }
67b7861d
AJ
5947 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5948 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5949 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5950 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5951 ) {
af39bc8c
AM
5952 if (floatx80_is_signaling_nan(a, status)
5953 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5954 float_raise(float_flag_invalid, status);
67b7861d
AJ
5955 }
5956 return 1;
5957 }
5958 return 0;
5959}
5960
158142c2
FB
5961/*----------------------------------------------------------------------------
5962| Returns the result of converting the quadruple-precision floating-point
5963| value `a' to the 32-bit two's complement integer format. The conversion
5964| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5965| Arithmetic---which means in particular that the conversion is rounded
5966| according to the current rounding mode. If `a' is a NaN, the largest
5967| positive integer is returned. Otherwise, if the conversion overflows, the
5968| largest integer with the same sign as `a' is returned.
5969*----------------------------------------------------------------------------*/
5970
f4014512 5971int32_t float128_to_int32(float128 a, float_status *status)
158142c2
FB
5972{
5973 flag aSign;
f4014512 5974 int32_t aExp, shiftCount;
bb98fe42 5975 uint64_t aSig0, aSig1;
158142c2
FB
5976
5977 aSig1 = extractFloat128Frac1( a );
5978 aSig0 = extractFloat128Frac0( a );
5979 aExp = extractFloat128Exp( a );
5980 aSign = extractFloat128Sign( a );
5981 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5982 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5983 aSig0 |= ( aSig1 != 0 );
5984 shiftCount = 0x4028 - aExp;
5985 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
ff32e16e 5986 return roundAndPackInt32(aSign, aSig0, status);
158142c2
FB
5987
5988}
5989
5990/*----------------------------------------------------------------------------
5991| Returns the result of converting the quadruple-precision floating-point
5992| value `a' to the 32-bit two's complement integer format. The conversion
5993| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5994| Arithmetic, except that the conversion is always rounded toward zero. If
5995| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5996| conversion overflows, the largest integer with the same sign as `a' is
5997| returned.
5998*----------------------------------------------------------------------------*/
5999
f4014512 6000int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
158142c2
FB
6001{
6002 flag aSign;
f4014512 6003 int32_t aExp, shiftCount;
bb98fe42 6004 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 6005 int32_t z;
158142c2
FB
6006
6007 aSig1 = extractFloat128Frac1( a );
6008 aSig0 = extractFloat128Frac0( a );
6009 aExp = extractFloat128Exp( a );
6010 aSign = extractFloat128Sign( a );
6011 aSig0 |= ( aSig1 != 0 );
6012 if ( 0x401E < aExp ) {
6013 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6014 goto invalid;
6015 }
6016 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
6017 if (aExp || aSig0) {
6018 status->float_exception_flags |= float_flag_inexact;
6019 }
158142c2
FB
6020 return 0;
6021 }
6022 aSig0 |= LIT64( 0x0001000000000000 );
6023 shiftCount = 0x402F - aExp;
6024 savedASig = aSig0;
6025 aSig0 >>= shiftCount;
6026 z = aSig0;
6027 if ( aSign ) z = - z;
6028 if ( ( z < 0 ) ^ aSign ) {
6029 invalid:
ff32e16e 6030 float_raise(float_flag_invalid, status);
bb98fe42 6031 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
6032 }
6033 if ( ( aSig0<<shiftCount ) != savedASig ) {
a2f2d288 6034 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6035 }
6036 return z;
6037
6038}
6039
6040/*----------------------------------------------------------------------------
6041| Returns the result of converting the quadruple-precision floating-point
6042| value `a' to the 64-bit two's complement integer format. The conversion
6043| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6044| Arithmetic---which means in particular that the conversion is rounded
6045| according to the current rounding mode. If `a' is a NaN, the largest
6046| positive integer is returned. Otherwise, if the conversion overflows, the
6047| largest integer with the same sign as `a' is returned.
6048*----------------------------------------------------------------------------*/
6049
f42c2224 6050int64_t float128_to_int64(float128 a, float_status *status)
158142c2
FB
6051{
6052 flag aSign;
f4014512 6053 int32_t aExp, shiftCount;
bb98fe42 6054 uint64_t aSig0, aSig1;
158142c2
FB
6055
6056 aSig1 = extractFloat128Frac1( a );
6057 aSig0 = extractFloat128Frac0( a );
6058 aExp = extractFloat128Exp( a );
6059 aSign = extractFloat128Sign( a );
6060 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6061 shiftCount = 0x402F - aExp;
6062 if ( shiftCount <= 0 ) {
6063 if ( 0x403E < aExp ) {
ff32e16e 6064 float_raise(float_flag_invalid, status);
158142c2
FB
6065 if ( ! aSign
6066 || ( ( aExp == 0x7FFF )
6067 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6068 )
6069 ) {
6070 return LIT64( 0x7FFFFFFFFFFFFFFF );
6071 }
bb98fe42 6072 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
6073 }
6074 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6075 }
6076 else {
6077 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6078 }
ff32e16e 6079 return roundAndPackInt64(aSign, aSig0, aSig1, status);
158142c2
FB
6080
6081}
6082
6083/*----------------------------------------------------------------------------
6084| Returns the result of converting the quadruple-precision floating-point
6085| value `a' to the 64-bit two's complement integer format. The conversion
6086| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6087| Arithmetic, except that the conversion is always rounded toward zero.
6088| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6089| the conversion overflows, the largest integer with the same sign as `a' is
6090| returned.
| The invalid exception is raised on overflow and for NaNs; the inexact
| exception is raised whenever nonzero fraction bits are discarded.
6091*----------------------------------------------------------------------------*/
6092
f42c2224 6093int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
158142c2
FB
6094{
6095 flag aSign;
f4014512 6096 int32_t aExp, shiftCount;
bb98fe42 6097 uint64_t aSig0, aSig1;
f42c2224 6098 int64_t z;
158142c2
FB
6099
6100 aSig1 = extractFloat128Frac1( a );
6101 aSig0 = extractFloat128Frac0( a );
6102 aExp = extractFloat128Exp( a );
6103 aSign = extractFloat128Sign( a );
/* Make the implicit leading bit explicit for nonzero exponents. */
6104 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
/* Positive shiftCount: significand moves left; otherwise it moves right. */
6105 shiftCount = aExp - 0x402F;
6106 if ( 0 < shiftCount ) {
/* Exponent 0x403E and above: only one negative value is still in range. */
6107 if ( 0x403E <= aExp ) {
6108 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
/* Sign set, exponent 0x403E, high fraction word zero and the top 15 bits of
 * the low word zero: the integer part is exactly the most negative int64. */
6109 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
6110 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
a2f2d288
PM
/* Any remaining low bits are a discarded fraction. */
6111 if (aSig1) {
6112 status->float_exception_flags |= float_flag_inexact;
6113 }
158142c2
FB
6114 }
6115 else {
ff32e16e 6116 float_raise(float_flag_invalid, status);
158142c2
FB
/* Non-negative inputs and NaNs saturate to the largest positive integer. */
6117 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6118 return LIT64( 0x7FFFFFFFFFFFFFFF );
6119 }
6120 }
/* Shared by the in-range case above and by negative overflow. */
bb98fe42 6121 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
6122 }
/* Combine the shifted high word with the bits moved up from the low word. */
6123 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
/* Bits of the low word that stay below the integer part were truncated. */
bb98fe42 6124 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
a2f2d288 6125 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6126 }
6127 }
6128 else {
/* Exponent below 0x3FFF: result is 0, inexact unless `a' is a zero. */
6129 if ( aExp < 0x3FFF ) {
6130 if ( aExp | aSig0 | aSig1 ) {
a2f2d288 6131 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6132 }
6133 return 0;
6134 }
/* The integer part lies entirely within the high significand word. */
6135 z = aSig0>>( - shiftCount );
/* Inexact if the low word or any shifted-out high-word bit is nonzero. */
6136 if ( aSig1
bb98fe42 6137 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
a2f2d288 6138 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6139 }
6140 }
6141 if ( aSign ) z = - z;
6142 return z;
6143
6144}
6145
2e6d8568
BR
6146/*----------------------------------------------------------------------------
6147| Returns the result of converting the quadruple-precision floating-point value
6148| `a' to the 64-bit unsigned integer format. The conversion is
6149| performed according to the IEC/IEEE Standard for Binary Floating-Point
6150| Arithmetic---which means in particular that the conversion is rounded
6151| according to the current rounding mode. If `a' is a NaN, the largest
6152| positive integer is returned. If the conversion overflows, the
6153| largest unsigned integer is returned. If 'a' is negative, the value is
6154| rounded and zero is returned; negative values that do not round to zero
6155| will raise the inexact exception.
6156*----------------------------------------------------------------------------*/
6157
6158uint64_t float128_to_uint64(float128 a, float_status *status)
6159{
6160 flag aSign;
6161 int aExp;
6162 int shiftCount;
6163 uint64_t aSig0, aSig1;
6164
6165 aSig0 = extractFloat128Frac0(a);
6166 aSig1 = extractFloat128Frac1(a);
6167 aExp = extractFloat128Exp(a);
6168 aSign = extractFloat128Sign(a);
6169 if (aSign && (aExp > 0x3FFE)) {
6170 float_raise(float_flag_invalid, status);
6171 if (float128_is_any_nan(a)) {
6172 return LIT64(0xFFFFFFFFFFFFFFFF);
6173 } else {
6174 return 0;
6175 }
6176 }
6177 if (aExp) {
6178 aSig0 |= LIT64(0x0001000000000000);
6179 }
6180 shiftCount = 0x402F - aExp;
6181 if (shiftCount <= 0) {
6182 if (0x403E < aExp) {
6183 float_raise(float_flag_invalid, status);
6184 return LIT64(0xFFFFFFFFFFFFFFFF);
6185 }
6186 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6187 } else {
6188 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6189 }
6190 return roundAndPackUint64(aSign, aSig0, aSig1, status);
6191}
6192
6193uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6194{
6195 uint64_t v;
6196 signed char current_rounding_mode = status->float_rounding_mode;
6197
6198 set_float_rounding_mode(float_round_to_zero, status);
6199 v = float128_to_uint64(a, status);
6200 set_float_rounding_mode(current_rounding_mode, status);
6201
6202 return v;
6203}
6204
158142c2
FB
6205/*----------------------------------------------------------------------------
6206| Returns the result of converting the quadruple-precision floating-point
fd425037
BR
6207| value `a' to the 32-bit unsigned integer format. The conversion
6208| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6209| Arithmetic except that the conversion is always rounded toward zero.
6210| If `a' is a NaN, the largest positive integer is returned. Otherwise,
6211| if the conversion overflows, the largest unsigned integer is returned.
6212| If 'a' is negative, the value is rounded and zero is returned; negative
6213| values that do not round to zero will raise the inexact exception.
6214*----------------------------------------------------------------------------*/
6215
6216uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6217{
6218 uint64_t v;
6219 uint32_t res;
6220 int old_exc_flags = get_float_exception_flags(status);
6221
6222 v = float128_to_uint64_round_to_zero(a, status);
6223 if (v > 0xffffffff) {
6224 res = 0xffffffff;
6225 } else {
6226 return v;
6227 }
6228 set_float_exception_flags(old_exc_flags, status);
6229 float_raise(float_flag_invalid, status);
6230 return res;
6231}
6232
6233/*----------------------------------------------------------------------------
6234| Returns the result of converting the quadruple-precision floating-point
158142c2
FB
6235| value `a' to the single-precision floating-point format. The conversion
6236| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6237| Arithmetic.
6238*----------------------------------------------------------------------------*/
6239
e5a41ffa 6240float32 float128_to_float32(float128 a, float_status *status)
158142c2
FB
6241{
6242 flag aSign;
f4014512 6243 int32_t aExp;
bb98fe42
AF
6244 uint64_t aSig0, aSig1;
6245 uint32_t zSig;
158142c2
FB
6246
6247 aSig1 = extractFloat128Frac1( a );
6248 aSig0 = extractFloat128Frac0( a );
6249 aExp = extractFloat128Exp( a );
6250 aSign = extractFloat128Sign( a );
6251 if ( aExp == 0x7FFF ) {
6252 if ( aSig0 | aSig1 ) {
ff32e16e 6253 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
158142c2
FB
6254 }
6255 return packFloat32( aSign, 0xFF, 0 );
6256 }
6257 aSig0 |= ( aSig1 != 0 );
6258 shift64RightJamming( aSig0, 18, &aSig0 );
6259 zSig = aSig0;
6260 if ( aExp || zSig ) {
6261 zSig |= 0x40000000;
6262 aExp -= 0x3F81;
6263 }
ff32e16e 6264 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
6265
6266}
6267
6268/*----------------------------------------------------------------------------
6269| Returns the result of converting the quadruple-precision floating-point
6270| value `a' to the double-precision floating-point format. The conversion
6271| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6272| Arithmetic.
6273*----------------------------------------------------------------------------*/
6274
e5a41ffa 6275float64 float128_to_float64(float128 a, float_status *status)
158142c2
FB
6276{
6277 flag aSign;
f4014512 6278 int32_t aExp;
bb98fe42 6279 uint64_t aSig0, aSig1;
158142c2
FB
6280
6281 aSig1 = extractFloat128Frac1( a );
6282 aSig0 = extractFloat128Frac0( a );
6283 aExp = extractFloat128Exp( a );
6284 aSign = extractFloat128Sign( a );
6285 if ( aExp == 0x7FFF ) {
6286 if ( aSig0 | aSig1 ) {
ff32e16e 6287 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
158142c2
FB
6288 }
6289 return packFloat64( aSign, 0x7FF, 0 );
6290 }
6291 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6292 aSig0 |= ( aSig1 != 0 );
6293 if ( aExp || aSig0 ) {
6294 aSig0 |= LIT64( 0x4000000000000000 );
6295 aExp -= 0x3C01;
6296 }
ff32e16e 6297 return roundAndPackFloat64(aSign, aExp, aSig0, status);
158142c2
FB
6298
6299}
6300
158142c2
FB
6301/*----------------------------------------------------------------------------
6302| Returns the result of converting the quadruple-precision floating-point
6303| value `a' to the extended double-precision floating-point format. The
6304| conversion is performed according to the IEC/IEEE Standard for Binary
6305| Floating-Point Arithmetic.
6306*----------------------------------------------------------------------------*/
6307
e5a41ffa 6308floatx80 float128_to_floatx80(float128 a, float_status *status)
158142c2
FB
6309{
6310 flag aSign;
f4014512 6311 int32_t aExp;
bb98fe42 6312 uint64_t aSig0, aSig1;
158142c2
FB
6313
6314 aSig1 = extractFloat128Frac1( a );
6315 aSig0 = extractFloat128Frac0( a );
6316 aExp = extractFloat128Exp( a );
6317 aSign = extractFloat128Sign( a );
6318 if ( aExp == 0x7FFF ) {
6319 if ( aSig0 | aSig1 ) {
ff32e16e 6320 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
158142c2
FB
6321 }
6322 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6323 }
6324 if ( aExp == 0 ) {
6325 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6326 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6327 }
6328 else {
6329 aSig0 |= LIT64( 0x0001000000000000 );
6330 }
6331 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
ff32e16e 6332 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
158142c2
FB
6333
6334}
6335
158142c2
FB
6336/*----------------------------------------------------------------------------
6337| Rounds the quadruple-precision floating-point value `a' to an integer, and
6338| returns the result as a quadruple-precision floating-point value. The
6339| operation is performed according to the IEC/IEEE Standard for Binary
6340| Floating-Point Arithmetic.
| The rounding direction is taken from status->float_rounding_mode, and the
| inexact exception is raised whenever the result differs from `a'.
6341*----------------------------------------------------------------------------*/
6342
e5a41ffa 6343float128 float128_round_to_int(float128 a, float_status *status)
158142c2
FB
6344{
6345 flag aSign;
f4014512 6346 int32_t aExp;
bb98fe42 6347 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
6348 float128 z;
6349
6350 aExp = extractFloat128Exp( a );
/* Case 1: exponent 0x402F or above -- all rounding happens in the low word. */
6351 if ( 0x402F <= aExp ) {
/* Exponent 0x406F or above: returned unchanged, except that NaNs are
 * passed through propagateFloat128NaN. */
6352 if ( 0x406F <= aExp ) {
6353 if ( ( aExp == 0x7FFF )
6354 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6355 ) {
ff32e16e 6356 return propagateFloat128NaN(a, a, status);
158142c2
FB
6357 }
6358 return a;
6359 }
/* The shift is split in two so that a total shift of 64 (aExp == 0x402F)
 * yields 0 instead of shifting by the full word width; lastBitMask == 0 is
 * tested below and handled by incrementing z.high instead. */
6360 lastBitMask = 1;
6361 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6362 roundBitsMask = lastBitMask - 1;
6363 z = a;
a2f2d288 6364 switch (status->float_rounding_mode) {
dc355b76 6365 case float_round_nearest_even:
158142c2
FB
6366 if ( lastBitMask ) {
/* Add half a unit, then clear the last bit if the result was a tie. */
6367 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6368 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6369 }
6370 else {
/* The top bit of z.low is the half-unit bit in this configuration. */
bb98fe42 6371 if ( (int64_t) z.low < 0 ) {
158142c2 6372 ++z.high;
bb98fe42 6373 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
6374 }
6375 }
dc355b76 6376 break;
f9288a76
PM
6377 case float_round_ties_away:
6378 if (lastBitMask) {
6379 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6380 } else {
6381 if ((int64_t) z.low < 0) {
6382 ++z.high;
6383 }
6384 }
6385 break;
dc355b76
PM
6386 case float_round_to_zero:
6387 break;
/* Directed modes: add roundBitsMask when the sign matches the direction. */
6388 case float_round_up:
6389 if (!extractFloat128Sign(z)) {
6390 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6391 }
6392 break;
6393 case float_round_down:
6394 if (extractFloat128Sign(z)) {
6395 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 6396 }
dc355b76
PM
6397 break;
6398 default:
6399 abort();
158142c2
FB
6400 }
/* Clear every bit below the rounding position. */
6401 z.low &= ~ roundBitsMask;
6402 }
6403 else {
/* Case 2: exponent below 0x3FFF -- the result is a zero or +/-1. */
6404 if ( aExp < 0x3FFF ) {
/* Zeros of either sign are returned unchanged and raise nothing. */
bb98fe42 6405 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
a2f2d288 6406 status->float_exception_flags |= float_flag_inexact;
158142c2 6407 aSign = extractFloat128Sign( a );
/* Modes without a case here fall through to the signed zero below. */
a2f2d288 6408 switch (status->float_rounding_mode) {
158142c2
FB
6409 case float_round_nearest_even:
/* Exponent 0x3FFE with a nonzero fraction rounds to +/-1; a zero fraction
 * falls through to the signed zero. */
6410 if ( ( aExp == 0x3FFE )
6411 && ( extractFloat128Frac0( a )
6412 | extractFloat128Frac1( a ) )
6413 ) {
6414 return packFloat128( aSign, 0x3FFF, 0, 0 );
6415 }
6416 break;
f9288a76
PM
6417 case float_round_ties_away:
6418 if (aExp == 0x3FFE) {
6419 return packFloat128(aSign, 0x3FFF, 0, 0);
6420 }
6421 break;
158142c2
FB
6422 case float_round_down:
6423 return
6424 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6425 : packFloat128( 0, 0, 0, 0 );
6426 case float_round_up:
6427 return
6428 aSign ? packFloat128( 1, 0, 0, 0 )
6429 : packFloat128( 0, 0x3FFF, 0, 0 );
6430 }
6431 return packFloat128( aSign, 0, 0, 0 );
6432 }
/* Case 3: the rounding position lies in the high word; the whole low word
 * is below it and is cleared in the result. */
6433 lastBitMask = 1;
6434 lastBitMask <<= 0x402F - aExp;
6435 roundBitsMask = lastBitMask - 1;
6436 z.low = 0;
6437 z.high = a.high;
a2f2d288 6438 switch (status->float_rounding_mode) {
dc355b76 6439 case float_round_nearest_even:
158142c2
FB
6440 z.high += lastBitMask>>1;
/* Exact tie (no remaining round bits and empty low word): clear last bit. */
6441 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6442 z.high &= ~ lastBitMask;
6443 }
dc355b76 6444 break;
f9288a76
PM
6445 case float_round_ties_away:
6446 z.high += lastBitMask>>1;
6447 break;
dc355b76
PM
6448 case float_round_to_zero:
6449 break;
6450 case float_round_up:
6451 if (!extractFloat128Sign(z)) {
158142c2
FB
/* A nonzero low word is folded in as a sticky bit before the add. */
6452 z.high |= ( a.low != 0 );
6453 z.high += roundBitsMask;
6454 }
dc355b76
PM
6455 break;
6456 case float_round_down:
6457 if (extractFloat128Sign(z)) {
6458 z.high |= (a.low != 0);
6459 z.high += roundBitsMask;
6460 }
6461 break;
6462 default:
6463 abort();
158142c2
FB
6464 }
6465 z.high &= ~ roundBitsMask;
6466 }
/* Cases 1 and 3 end here: report inexact if rounding changed the value. */
6467 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
a2f2d288 6468 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6469 }
6470 return z;
6471
6472}
6473
6474/*----------------------------------------------------------------------------
6475| Returns the result of adding the absolute values of the quadruple-precision
6476| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6477| before being returned. `zSign' is ignored if the result is a NaN.
6478| The addition is performed according to the IEC/IEEE Standard for Binary
6479| Floating-Point Arithmetic.
6480*----------------------------------------------------------------------------*/
6481
e5a41ffa
PM
6482static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6483 float_status *status)
158142c2 6484{
f4014512 6485 int32_t aExp, bExp, zExp;
bb98fe42 6486 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
f4014512 6487 int32_t expDiff;
158142c2
FB
6488
6489 aSig1 = extractFloat128Frac1( a );
6490 aSig0 = extractFloat128Frac0( a );
6491 aExp = extractFloat128Exp( a );
6492 bSig1 = extractFloat128Frac1( b );
6493 bSig0 = extractFloat128Frac0( b );
6494 bExp = extractFloat128Exp( b );
6495 expDiff = aExp - bExp;
/* a has the larger exponent: align b to it. */
6496 if ( 0 < expDiff ) {
6497 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6498 if (aSig0 | aSig1) {
6499 return propagateFloat128NaN(a, b, status);
6500 }
158142c2
FB
/* a is an infinity and b is finite: the result is a itself. */
6501 return a;
6502 }
/* A zero exponent field gets no implicit bit and one less shift. */
6503 if ( bExp == 0 ) {
6504 --expDiff;
6505 }
6506 else {
6507 bSig0 |= LIT64( 0x0001000000000000 );
6508 }
/* Shifted-out bits of b are collected in zSig2 for rounding. */
6509 shift128ExtraRightJamming(
6510 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6511 zExp = aExp;
6512 }
/* b has the larger exponent: align a to it (mirror of the case above). */
6513 else if ( expDiff < 0 ) {
6514 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6515 if (bSig0 | bSig1) {
6516 return propagateFloat128NaN(a, b, status);
6517 }
158142c2
FB
6518 return packFloat128( zSign, 0x7FFF, 0, 0 );
6519 }
6520 if ( aExp == 0 ) {
6521 ++expDiff;
6522 }
6523 else {
6524 aSig0 |= LIT64( 0x0001000000000000 );
6525 }
6526 shift128ExtraRightJamming(
6527 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6528 zExp = bExp;
6529 }
/* Equal exponents: no alignment is needed. */
6530 else {
6531 if ( aExp == 0x7FFF ) {
6532 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6533 return propagateFloat128NaN(a, b, status);
158142c2
FB
6534 }
6535 return a;
6536 }
6537 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* Both exponent fields are zero: the raw fraction sum is packed directly
 * with a zero exponent field. */
fe76d976 6538 if ( aExp == 0 ) {
/* With flush_to_zero set, a signed zero is returned instead, and a nonzero
 * sum raises float_flag_output_denormal. */
a2f2d288 6539 if (status->flush_to_zero) {
e6afc87f 6540 if (zSig0 | zSig1) {
ff32e16e 6541 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
6542 }
6543 return packFloat128(zSign, 0, 0, 0);
6544 }
fe76d976
PB
6545 return packFloat128( zSign, 0, zSig0, zSig1 );
6546 }
158142c2
FB
6547 zSig2 = 0;
/* The two implicit bits (0x0001000000000000 each) add up to this bit. */
6548 zSig0 |= LIT64( 0x0002000000000000 );
6549 zExp = aExp;
6550 goto shiftRight1;
6551 }
/* Shared tail for the unequal-exponent cases.  Only one implicit bit still
 * has to be added here; OR-ing it into aSig0 works in both cases because
 * only the sum of the two significands is used below. */
6552 aSig0 |= LIT64( 0x0001000000000000 );
6553 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* If the sum did not carry into bit 49, round with the decremented exponent;
 * otherwise restore the exponent and shift the sum right by one. */
6554 --zExp;
6555 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6556 ++zExp;
6557 shiftRight1:
6558 shift128ExtraRightJamming(
6559 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6560 roundAndPack:
ff32e16e 6561 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6562
6563}
6564
6565/*----------------------------------------------------------------------------
6566| Returns the result of subtracting the absolute values of the quadruple-
6567| precision floating-point values `a' and `b'. If `zSign' is 1, the
6568| difference is negated before being returned. `zSign' is ignored if the
6569| result is a NaN. The subtraction is performed according to the IEC/IEEE
6570| Standard for Binary Floating-Point Arithmetic.
6571*----------------------------------------------------------------------------*/
6572
e5a41ffa
PM
6573static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6574 float_status *status)
158142c2 6575{
f4014512 6576 int32_t aExp, bExp, zExp;
bb98fe42 6577 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
f4014512 6578 int32_t expDiff;
158142c2
FB
6579
6580 aSig1 = extractFloat128Frac1( a );
6581 aSig0 = extractFloat128Frac0( a );
6582 aExp = extractFloat128Exp( a );
6583 bSig1 = extractFloat128Frac1( b );
6584 bSig0 = extractFloat128Frac0( b );
6585 bExp = extractFloat128Exp( b );
6586 expDiff = aExp - bExp;
/* Shift both fractions left by 14 bits; the implicit bit is therefore
 * 0x4000000000000000 in this function, and the 14 is undone in the exponent
 * passed to normalizeRoundAndPackFloat128 at the end. */
6587 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6588 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6589 if ( 0 < expDiff ) goto aExpBigger;
6590 if ( expDiff < 0 ) goto bExpBigger;
/* Equal exponents from here on. */
6591 if ( aExp == 0x7FFF ) {
6592 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6593 return propagateFloat128NaN(a, b, status);
158142c2 6594 }
/* Infinity minus infinity: invalid operation, default NaN. */
ff32e16e 6595 float_raise(float_flag_invalid, status);
af39bc8c 6596 return float128_default_nan(status);
158142c2
FB
6597 }
/* Both exponent fields are zero: continue with an exponent of 1 and no
 * implicit bits. */
6598 if ( aExp == 0 ) {
6599 aExp = 1;
6600 bExp = 1;
6601 }
/* Compare the 128-bit significands to find the larger magnitude. */
6602 if ( bSig0 < aSig0 ) goto aBigger;
6603 if ( aSig0 < bSig0 ) goto bBigger;
6604 if ( bSig1 < aSig1 ) goto aBigger;
6605 if ( aSig1 < bSig1 ) goto bBigger;
a2f2d288
PM
/* Equal magnitudes: a zero whose sign bit is set only in round-down mode. */
6606 return packFloat128(status->float_rounding_mode == float_round_down,
6607 0, 0, 0);
158142c2
FB
6608 bExpBigger:
6609 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6610 if (bSig0 | bSig1) {
6611 return propagateFloat128NaN(a, b, status);
6612 }
158142c2
FB
/* Finite minus infinity: infinity with the opposite sign. */
6613 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6614 }
/* A zero exponent field gets no implicit bit and one less shift. */
6615 if ( aExp == 0 ) {
6616 ++expDiff;
6617 }
6618 else {
6619 aSig0 |= LIT64( 0x4000000000000000 );
6620 }
6621 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6622 bSig0 |= LIT64( 0x4000000000000000 );
6623 bBigger:
/* |b| > |a|: compute b - a and flip the result sign. */
6624 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6625 zExp = bExp;
6626 zSign ^= 1;
6627 goto normalizeRoundAndPack;
6628 aExpBigger:
6629 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6630 if (aSig0 | aSig1) {
6631 return propagateFloat128NaN(a, b, status);
6632 }
158142c2
FB
6633 return a;
6634 }
6635 if ( bExp == 0 ) {
6636 --expDiff;
6637 }
6638 else {
6639 bSig0 |= LIT64( 0x4000000000000000 );
6640 }
6641 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6642 aSig0 |= LIT64( 0x4000000000000000 );
6643 aBigger:
/* |a| > |b|: compute a - b and keep zSign. */
6644 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6645 zExp = aExp;
6646 normalizeRoundAndPack:
6647 --zExp;
ff32e16e
PM
/* zExp - 14 compensates for the 14-bit left shift applied on entry. */
6648 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6649 status);
158142c2
FB
6650
6651}
6652
6653/*----------------------------------------------------------------------------
6654| Returns the result of adding the quadruple-precision floating-point values
6655| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6656| for Binary Floating-Point Arithmetic.
6657*----------------------------------------------------------------------------*/
6658
e5a41ffa 6659float128 float128_add(float128 a, float128 b, float_status *status)
158142c2
FB
6660{
6661 flag aSign, bSign;
6662
6663 aSign = extractFloat128Sign( a );
6664 bSign = extractFloat128Sign( b );
6665 if ( aSign == bSign ) {
ff32e16e 6666 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6667 }
6668 else {
ff32e16e 6669 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6670 }
6671
6672}
6673
6674/*----------------------------------------------------------------------------
6675| Returns the result of subtracting the quadruple-precision floating-point
6676| values `a' and `b'. The operation is performed according to the IEC/IEEE
6677| Standard for Binary Floating-Point Arithmetic.
6678*----------------------------------------------------------------------------*/
6679
e5a41ffa 6680float128 float128_sub(float128 a, float128 b, float_status *status)
158142c2
FB
6681{
6682 flag aSign, bSign;
6683
6684 aSign = extractFloat128Sign( a );
6685 bSign = extractFloat128Sign( b );
6686 if ( aSign == bSign ) {
ff32e16e 6687 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6688 }
6689 else {
ff32e16e 6690 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6691 }
6692
6693}
6694
6695/*----------------------------------------------------------------------------
6696| Returns the result of multiplying the quadruple-precision floating-point
6697| values `a' and `b'. The operation is performed according to the IEC/IEEE
6698| Standard for Binary Floating-Point Arithmetic.
6699*----------------------------------------------------------------------------*/
6700
e5a41ffa 6701float128 float128_mul(float128 a, float128 b, float_status *status)
158142c2
FB
6702{
6703 flag aSign, bSign, zSign;
f4014512 6704 int32_t aExp, bExp, zExp;
bb98fe42 6705 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6706
6707 aSig1 = extractFloat128Frac1( a );
6708 aSig0 = extractFloat128Frac0( a );
6709 aExp = extractFloat128Exp( a );
6710 aSign = extractFloat128Sign( a );
6711 bSig1 = extractFloat128Frac1( b );
6712 bSig0 = extractFloat128Frac0( b );
6713 bExp = extractFloat128Exp( b );
6714 bSign = extractFloat128Sign( b );
6715 zSign = aSign ^ bSign;
6716 if ( aExp == 0x7FFF ) {
6717 if ( ( aSig0 | aSig1 )
6718 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6719 return propagateFloat128NaN(a, b, status);
158142c2
FB
6720 }
6721 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6722 return packFloat128( zSign, 0x7FFF, 0, 0 );
6723 }
6724 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6725 if (bSig0 | bSig1) {
6726 return propagateFloat128NaN(a, b, status);
6727 }
158142c2
FB
6728 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6729 invalid:
ff32e16e 6730 float_raise(float_flag_invalid, status);
af39bc8c 6731 return float128_default_nan(status);
158142c2
FB
6732 }
6733 return packFloat128( zSign, 0x7FFF, 0, 0 );
6734 }
6735 if ( aExp == 0 ) {
6736 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6737 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6738 }
6739 if ( bExp == 0 ) {
6740 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6741 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6742 }
6743 zExp = aExp + bExp - 0x4000;
6744 aSig0 |= LIT64( 0x0001000000000000 );
6745 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6746 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6747 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6748 zSig2 |= ( zSig3 != 0 );
6749 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6750 shift128ExtraRightJamming(
6751 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6752 ++zExp;
6753 }
ff32e16e 6754 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6755
6756}
6757
6758/*----------------------------------------------------------------------------
6759| Returns the result of dividing the quadruple-precision floating-point value
6760| `a' by the corresponding value `b'. The operation is performed according to
6761| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6762*----------------------------------------------------------------------------*/
6763
e5a41ffa 6764float128 float128_div(float128 a, float128 b, float_status *status)
158142c2
FB
6765{
6766 flag aSign, bSign, zSign;
f4014512 6767 int32_t aExp, bExp, zExp;
bb98fe42
AF
6768 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6769 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6770
6771 aSig1 = extractFloat128Frac1( a );
6772 aSig0 = extractFloat128Frac0( a );
6773 aExp = extractFloat128Exp( a );
6774 aSign = extractFloat128Sign( a );
6775 bSig1 = extractFloat128Frac1( b );
6776 bSig0 = extractFloat128Frac0( b );
6777 bExp = extractFloat128Exp( b );
6778 bSign = extractFloat128Sign( b );
6779 zSign = aSign ^ bSign;
6780 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6781 if (aSig0 | aSig1) {
6782 return propagateFloat128NaN(a, b, status);
6783 }
158142c2 6784 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6785 if (bSig0 | bSig1) {
6786 return propagateFloat128NaN(a, b, status);
6787 }
158142c2
FB
6788 goto invalid;
6789 }
6790 return packFloat128( zSign, 0x7FFF, 0, 0 );
6791 }
6792 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6793 if (bSig0 | bSig1) {
6794 return propagateFloat128NaN(a, b, status);
6795 }
158142c2
FB
6796 return packFloat128( zSign, 0, 0, 0 );
6797 }
6798 if ( bExp == 0 ) {
6799 if ( ( bSig0 | bSig1 ) == 0 ) {
6800 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6801 invalid:
ff32e16e 6802 float_raise(float_flag_invalid, status);
af39bc8c 6803 return float128_default_nan(status);
158142c2 6804 }
ff32e16e 6805 float_raise(float_flag_divbyzero, status);
158142c2
FB
6806 return packFloat128( zSign, 0x7FFF, 0, 0 );
6807 }
6808 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6809 }
6810 if ( aExp == 0 ) {
6811 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6812 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6813 }
6814 zExp = aExp - bExp + 0x3FFD;
6815 shortShift128Left(
6816 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6817 shortShift128Left(
6818 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6819 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6820 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6821 ++zExp;
6822 }
6823 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6824 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6825 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6826 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6827 --zSig0;
6828 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6829 }
6830 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6831 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6832 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6833 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6834 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6835 --zSig1;
6836 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6837 }
6838 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6839 }
6840 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
ff32e16e 6841 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6842
6843}
6844
6845/*----------------------------------------------------------------------------
6846| Returns the remainder of the quadruple-precision floating-point value `a'
6847| with respect to the corresponding value `b'. The operation is performed
6848| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6849*----------------------------------------------------------------------------*/
6850
e5a41ffa 6851float128 float128_rem(float128 a, float128 b, float_status *status)
158142c2 6852{
ed086f3d 6853 flag aSign, zSign;
f4014512 6854 int32_t aExp, bExp, expDiff;
bb98fe42
AF
6855 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6856 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6857 int64_t sigMean0;
158142c2
FB
6858
6859 aSig1 = extractFloat128Frac1( a );
6860 aSig0 = extractFloat128Frac0( a );
6861 aExp = extractFloat128Exp( a );
6862 aSign = extractFloat128Sign( a );
6863 bSig1 = extractFloat128Frac1( b );
6864 bSig0 = extractFloat128Frac0( b );
6865 bExp = extractFloat128Exp( b );
158142c2
FB
6866 if ( aExp == 0x7FFF ) {
6867 if ( ( aSig0 | aSig1 )
6868 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6869 return propagateFloat128NaN(a, b, status);
158142c2
FB
6870 }
6871 goto invalid;
6872 }
6873 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6874 if (bSig0 | bSig1) {
6875 return propagateFloat128NaN(a, b, status);
6876 }
158142c2
FB
6877 return a;
6878 }
6879 if ( bExp == 0 ) {
6880 if ( ( bSig0 | bSig1 ) == 0 ) {
6881 invalid:
ff32e16e 6882 float_raise(float_flag_invalid, status);
af39bc8c 6883 return float128_default_nan(status);
158142c2
FB
6884 }
6885 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6886 }
6887 if ( aExp == 0 ) {
6888 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6889 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6890 }
6891 expDiff = aExp - bExp;
6892 if ( expDiff < -1 ) return a;
6893 shortShift128Left(
6894 aSig0 | LIT64( 0x0001000000000000 ),
6895 aSig1,
6896 15 - ( expDiff < 0 ),
6897 &aSig0,
6898 &aSig1
6899 );
6900 shortShift128Left(
6901 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6902 q = le128( bSig0, bSig1, aSig0, aSig1 );
6903 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6904 expDiff -= 64;
6905 while ( 0 < expDiff ) {
6906 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6907 q = ( 4 < q ) ? q - 4 : 0;
6908 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6909 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6910 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6911 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6912 expDiff -= 61;
6913 }
6914 if ( -64 < expDiff ) {
6915 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6916 q = ( 4 < q ) ? q - 4 : 0;
6917 q >>= - expDiff;
6918 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6919 expDiff += 52;
6920 if ( expDiff < 0 ) {
6921 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6922 }
6923 else {
6924 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6925 }
6926 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6927 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6928 }
6929 else {
6930 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6931 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6932 }
6933 do {
6934 alternateASig0 = aSig0;
6935 alternateASig1 = aSig1;
6936 ++q;
6937 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6938 } while ( 0 <= (int64_t) aSig0 );
158142c2 6939 add128(
bb98fe42 6940 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6941 if ( ( sigMean0 < 0 )
6942 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6943 aSig0 = alternateASig0;
6944 aSig1 = alternateASig1;
6945 }
bb98fe42 6946 zSign = ( (int64_t) aSig0 < 0 );
158142c2 6947 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
ff32e16e
PM
6948 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6949 status);
158142c2
FB
6950}
6951
6952/*----------------------------------------------------------------------------
6953| Returns the square root of the quadruple-precision floating-point value `a'.
6954| The operation is performed according to the IEC/IEEE Standard for Binary
6955| Floating-Point Arithmetic.
6956*----------------------------------------------------------------------------*/
6957
e5a41ffa 6958float128 float128_sqrt(float128 a, float_status *status)
158142c2
FB
6959{
6960 flag aSign;
f4014512 6961 int32_t aExp, zExp;
bb98fe42
AF
6962 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6963 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6964
6965 aSig1 = extractFloat128Frac1( a );
6966 aSig0 = extractFloat128Frac0( a );
6967 aExp = extractFloat128Exp( a );
6968 aSign = extractFloat128Sign( a );
6969 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6970 if (aSig0 | aSig1) {
6971 return propagateFloat128NaN(a, a, status);
6972 }
158142c2
FB
6973 if ( ! aSign ) return a;
6974 goto invalid;
6975 }
6976 if ( aSign ) {
6977 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6978 invalid:
ff32e16e 6979 float_raise(float_flag_invalid, status);
af39bc8c 6980 return float128_default_nan(status);
158142c2
FB
6981 }
6982 if ( aExp == 0 ) {
6983 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6984 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6985 }
6986 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6987 aSig0 |= LIT64( 0x0001000000000000 );
6988 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6989 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6990 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6991 doubleZSig0 = zSig0<<1;
6992 mul64To128( zSig0, zSig0, &term0, &term1 );
6993 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6994 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6995 --zSig0;
6996 doubleZSig0 -= 2;
6997 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6998 }
6999 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7000 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7001 if ( zSig1 == 0 ) zSig1 = 1;
7002 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7003 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7004 mul64To128( zSig1, zSig1, &term2, &term3 );
7005 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 7006 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
7007 --zSig1;
7008 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7009 term3 |= 1;
7010 term2 |= doubleZSig0;
7011 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7012 }
7013 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7014 }
7015 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
ff32e16e 7016 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
7017
7018}
7019
7020/*----------------------------------------------------------------------------
7021| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
7022| the corresponding value `b', and 0 otherwise. The invalid exception is
7023| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
7024| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7025*----------------------------------------------------------------------------*/
7026
e5a41ffa 7027int float128_eq(float128 a, float128 b, float_status *status)
158142c2
FB
7028{
7029
7030 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7031 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7032 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7033 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7034 ) {
ff32e16e 7035 float_raise(float_flag_invalid, status);
158142c2
FB
7036 return 0;
7037 }
7038 return
7039 ( a.low == b.low )
7040 && ( ( a.high == b.high )
7041 || ( ( a.low == 0 )
bb98fe42 7042 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
7043 );
7044
7045}
7046
7047/*----------------------------------------------------------------------------
7048| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
7049| or equal to the corresponding value `b', and 0 otherwise. The invalid
7050| exception is raised if either operand is a NaN. The comparison is performed
7051| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
7052*----------------------------------------------------------------------------*/
7053
e5a41ffa 7054int float128_le(float128 a, float128 b, float_status *status)
158142c2
FB
7055{
7056 flag aSign, bSign;
7057
7058 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7059 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7060 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7061 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7062 ) {
ff32e16e 7063 float_raise(float_flag_invalid, status);
158142c2
FB
7064 return 0;
7065 }
7066 aSign = extractFloat128Sign( a );
7067 bSign = extractFloat128Sign( b );
7068 if ( aSign != bSign ) {
7069 return
7070 aSign
bb98fe42 7071 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7072 == 0 );
7073 }
7074 return
7075 aSign ? le128( b.high, b.low, a.high, a.low )
7076 : le128( a.high, a.low, b.high, b.low );
7077
7078}
7079
7080/*----------------------------------------------------------------------------
7081| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
7082| the corresponding value `b', and 0 otherwise. The invalid exception is
7083| raised if either operand is a NaN. The comparison is performed according
7084| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
7085*----------------------------------------------------------------------------*/
7086
e5a41ffa 7087int float128_lt(float128 a, float128 b, float_status *status)
158142c2
FB
7088{
7089 flag aSign, bSign;
7090
7091 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7092 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7093 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7094 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7095 ) {
ff32e16e 7096 float_raise(float_flag_invalid, status);
158142c2
FB
7097 return 0;
7098 }
7099 aSign = extractFloat128Sign( a );
7100 bSign = extractFloat128Sign( b );
7101 if ( aSign != bSign ) {
7102 return
7103 aSign
bb98fe42 7104 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7105 != 0 );
7106 }
7107 return
7108 aSign ? lt128( b.high, b.low, a.high, a.low )
7109 : lt128( a.high, a.low, b.high, b.low );
7110
7111}
7112
67b7861d
AJ
7113/*----------------------------------------------------------------------------
7114| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
7115| be compared, and 0 otherwise. The invalid exception is raised if either
7116| operand is a NaN. The comparison is performed according to the IEC/IEEE
7117| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
7118*----------------------------------------------------------------------------*/
7119
e5a41ffa 7120int float128_unordered(float128 a, float128 b, float_status *status)
67b7861d
AJ
7121{
7122 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7123 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7124 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7125 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7126 ) {
ff32e16e 7127 float_raise(float_flag_invalid, status);
67b7861d
AJ
7128 return 1;
7129 }
7130 return 0;
7131}
7132
158142c2
FB
7133/*----------------------------------------------------------------------------
7134| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
7135| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7136| exception. The comparison is performed according to the IEC/IEEE Standard
7137| for Binary Floating-Point Arithmetic.
158142c2
FB
7138*----------------------------------------------------------------------------*/
7139
e5a41ffa 7140int float128_eq_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
7141{
7142
7143 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7144 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7145 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7146 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7147 ) {
af39bc8c
AM
7148 if (float128_is_signaling_nan(a, status)
7149 || float128_is_signaling_nan(b, status)) {
ff32e16e 7150 float_raise(float_flag_invalid, status);
b689362d 7151 }
158142c2
FB
7152 return 0;
7153 }
7154 return
7155 ( a.low == b.low )
7156 && ( ( a.high == b.high )
7157 || ( ( a.low == 0 )
bb98fe42 7158 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
7159 );
7160
7161}
7162
7163/*----------------------------------------------------------------------------
7164| Returns 1 if the quadruple-precision floating-point value `a' is less than
7165| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7166| cause an exception. Otherwise, the comparison is performed according to the
7167| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7168*----------------------------------------------------------------------------*/
7169
e5a41ffa 7170int float128_le_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
7171{
7172 flag aSign, bSign;
7173
7174 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7175 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7176 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7177 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7178 ) {
af39bc8c
AM
7179 if (float128_is_signaling_nan(a, status)
7180 || float128_is_signaling_nan(b, status)) {
ff32e16e 7181 float_raise(float_flag_invalid, status);
158142c2
FB
7182 }
7183 return 0;
7184 }
7185 aSign = extractFloat128Sign( a );
7186 bSign = extractFloat128Sign( b );
7187 if ( aSign != bSign ) {
7188 return
7189 aSign
bb98fe42 7190 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7191 == 0 );
7192 }
7193 return
7194 aSign ? le128( b.high, b.low, a.high, a.low )
7195 : le128( a.high, a.low, b.high, b.low );
7196
7197}
7198
7199/*----------------------------------------------------------------------------
7200| Returns 1 if the quadruple-precision floating-point value `a' is less than
7201| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7202| exception. Otherwise, the comparison is performed according to the IEC/IEEE
7203| Standard for Binary Floating-Point Arithmetic.
7204*----------------------------------------------------------------------------*/
7205
e5a41ffa 7206int float128_lt_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
7207{
7208 flag aSign, bSign;
7209
7210 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7211 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7212 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7213 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7214 ) {
af39bc8c
AM
7215 if (float128_is_signaling_nan(a, status)
7216 || float128_is_signaling_nan(b, status)) {
ff32e16e 7217 float_raise(float_flag_invalid, status);
158142c2
FB
7218 }
7219 return 0;
7220 }
7221 aSign = extractFloat128Sign( a );
7222 bSign = extractFloat128Sign( b );
7223 if ( aSign != bSign ) {
7224 return
7225 aSign
bb98fe42 7226 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7227 != 0 );
7228 }
7229 return
7230 aSign ? lt128( b.high, b.low, a.high, a.low )
7231 : lt128( a.high, a.low, b.high, b.low );
7232
7233}
7234
67b7861d
AJ
7235/*----------------------------------------------------------------------------
7236| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7237| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7238| comparison is performed according to the IEC/IEEE Standard for Binary
7239| Floating-Point Arithmetic.
7240*----------------------------------------------------------------------------*/
7241
e5a41ffa 7242int float128_unordered_quiet(float128 a, float128 b, float_status *status)
67b7861d
AJ
7243{
7244 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7245 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7246 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7247 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7248 ) {
af39bc8c
AM
7249 if (float128_is_signaling_nan(a, status)
7250 || float128_is_signaling_nan(b, status)) {
ff32e16e 7251 float_raise(float_flag_invalid, status);
67b7861d
AJ
7252 }
7253 return 1;
7254 }
7255 return 0;
7256}
7257
1d6bda35 7258/* misc functions */
e5a41ffa 7259float32 uint32_to_float32(uint32_t a, float_status *status)
1d6bda35 7260{
ff32e16e 7261 return int64_to_float32(a, status);
1d6bda35
FB
7262}
7263
e5a41ffa 7264float64 uint32_to_float64(uint32_t a, float_status *status)
1d6bda35 7265{
ff32e16e 7266 return int64_to_float64(a, status);
1d6bda35
FB
7267}
7268
3a87d009 7269uint32_t float32_to_uint32(float32 a, float_status *status)
1d6bda35
FB
7270{
7271 int64_t v;
3a87d009 7272 uint32_t res;
34e1c27b 7273 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7274
ff32e16e 7275 v = float32_to_int64(a, status);
1d6bda35
FB
7276 if (v < 0) {
7277 res = 0;
1d6bda35
FB
7278 } else if (v > 0xffffffff) {
7279 res = 0xffffffff;
1d6bda35 7280 } else {
34e1c27b 7281 return v;
1d6bda35 7282 }
34e1c27b 7283 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7284 float_raise(float_flag_invalid, status);
1d6bda35
FB
7285 return res;
7286}
7287
3a87d009 7288uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
1d6bda35
FB
7289{
7290 int64_t v;
3a87d009 7291 uint32_t res;
34e1c27b 7292 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7293
ff32e16e 7294 v = float32_to_int64_round_to_zero(a, status);
1d6bda35
FB
7295 if (v < 0) {
7296 res = 0;
1d6bda35
FB
7297 } else if (v > 0xffffffff) {
7298 res = 0xffffffff;
1d6bda35 7299 } else {
34e1c27b 7300 return v;
1d6bda35 7301 }
34e1c27b 7302 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7303 float_raise(float_flag_invalid, status);
1d6bda35
FB
7304 return res;
7305}
7306
0bb721d7 7307int16_t float32_to_int16(float32 a, float_status *status)
f581bf54
WN
7308{
7309 int32_t v;
0bb721d7 7310 int16_t res;
f581bf54
WN
7311 int old_exc_flags = get_float_exception_flags(status);
7312
ff32e16e 7313 v = float32_to_int32(a, status);
f581bf54
WN
7314 if (v < -0x8000) {
7315 res = -0x8000;
7316 } else if (v > 0x7fff) {
7317 res = 0x7fff;
7318 } else {
7319 return v;
7320 }
7321
7322 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7323 float_raise(float_flag_invalid, status);
f581bf54
WN
7324 return res;
7325}
7326
0bb721d7 7327uint16_t float32_to_uint16(float32 a, float_status *status)
f581bf54
WN
7328{
7329 int32_t v;
0bb721d7 7330 uint16_t res;
f581bf54
WN
7331 int old_exc_flags = get_float_exception_flags(status);
7332
ff32e16e 7333 v = float32_to_int32(a, status);
f581bf54
WN
7334 if (v < 0) {
7335 res = 0;
7336 } else if (v > 0xffff) {
7337 res = 0xffff;
7338 } else {
7339 return v;
7340 }
7341
7342 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7343 float_raise(float_flag_invalid, status);
f581bf54
WN
7344 return res;
7345}
7346
0bb721d7 7347uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
7348{
7349 int64_t v;
0bb721d7 7350 uint16_t res;
34e1c27b 7351 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7352
ff32e16e 7353 v = float32_to_int64_round_to_zero(a, status);
cbcef455
PM
7354 if (v < 0) {
7355 res = 0;
cbcef455
PM
7356 } else if (v > 0xffff) {
7357 res = 0xffff;
cbcef455 7358 } else {
34e1c27b 7359 return v;
cbcef455 7360 }
34e1c27b 7361 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7362 float_raise(float_flag_invalid, status);
cbcef455
PM
7363 return res;
7364}
7365
3a87d009 7366uint32_t float64_to_uint32(float64 a, float_status *status)
1d6bda35 7367{
5e7f654f 7368 uint64_t v;
3a87d009 7369 uint32_t res;
5e7f654f 7370 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7371
ff32e16e 7372 v = float64_to_uint64(a, status);
5e7f654f 7373 if (v > 0xffffffff) {
1d6bda35 7374 res = 0xffffffff;
1d6bda35 7375 } else {
5e7f654f 7376 return v;
1d6bda35 7377 }
5e7f654f 7378 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7379 float_raise(float_flag_invalid, status);
1d6bda35
FB
7380 return res;
7381}
7382
3a87d009 7383uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
1d6bda35 7384{
fd728f2f 7385 uint64_t v;
3a87d009 7386 uint32_t res;
fd728f2f 7387 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7388
ff32e16e 7389 v = float64_to_uint64_round_to_zero(a, status);
fd728f2f 7390 if (v > 0xffffffff) {
1d6bda35 7391 res = 0xffffffff;
1d6bda35 7392 } else {
fd728f2f 7393 return v;
1d6bda35 7394 }
fd728f2f 7395 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7396 float_raise(float_flag_invalid, status);
1d6bda35
FB
7397 return res;
7398}
7399
0bb721d7 7400int16_t float64_to_int16(float64 a, float_status *status)
f581bf54
WN
7401{
7402 int64_t v;
0bb721d7 7403 int16_t res;
f581bf54
WN
7404 int old_exc_flags = get_float_exception_flags(status);
7405
ff32e16e 7406 v = float64_to_int32(a, status);
f581bf54
WN
7407 if (v < -0x8000) {
7408 res = -0x8000;
7409 } else if (v > 0x7fff) {
7410 res = 0x7fff;
7411 } else {
7412 return v;
7413 }
7414
7415 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7416 float_raise(float_flag_invalid, status);
f581bf54
WN
7417 return res;
7418}
7419
0bb721d7 7420uint16_t float64_to_uint16(float64 a, float_status *status)
f581bf54
WN
7421{
7422 int64_t v;
0bb721d7 7423 uint16_t res;
f581bf54
WN
7424 int old_exc_flags = get_float_exception_flags(status);
7425
ff32e16e 7426 v = float64_to_int32(a, status);
f581bf54
WN
7427 if (v < 0) {
7428 res = 0;
7429 } else if (v > 0xffff) {
7430 res = 0xffff;
7431 } else {
7432 return v;
7433 }
7434
7435 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7436 float_raise(float_flag_invalid, status);
f581bf54
WN
7437 return res;
7438}
7439
0bb721d7 7440uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
7441{
7442 int64_t v;
0bb721d7 7443 uint16_t res;
34e1c27b 7444 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7445
ff32e16e 7446 v = float64_to_int64_round_to_zero(a, status);
cbcef455
PM
7447 if (v < 0) {
7448 res = 0;
cbcef455
PM
7449 } else if (v > 0xffff) {
7450 res = 0xffff;
cbcef455 7451 } else {
34e1c27b 7452 return v;
cbcef455 7453 }
34e1c27b 7454 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7455 float_raise(float_flag_invalid, status);
cbcef455
PM
7456 return res;
7457}
7458
fb3ea83a
TM
7459/*----------------------------------------------------------------------------
7460| Returns the result of converting the double-precision floating-point value
7461| `a' to the 64-bit unsigned integer format. The conversion is
7462| performed according to the IEC/IEEE Standard for Binary Floating-Point
7463| Arithmetic---which means in particular that the conversion is rounded
7464| according to the current rounding mode. If `a' is a NaN, the largest
7465| positive integer is returned. If the conversion overflows, the
7466| largest unsigned integer is returned. If 'a' is negative, the value is
7467| rounded and zero is returned; negative values that do not round to zero
7468| will raise the inexact exception.
7469*----------------------------------------------------------------------------*/
75d62a58 7470
e5a41ffa 7471uint64_t float64_to_uint64(float64 a, float_status *status)
fb3ea83a
TM
7472{
7473 flag aSign;
0c48262d 7474 int aExp;
07d792d2 7475 int shiftCount;
fb3ea83a 7476 uint64_t aSig, aSigExtra;
ff32e16e 7477 a = float64_squash_input_denormal(a, status);
75d62a58 7478
fb3ea83a
TM
7479 aSig = extractFloat64Frac(a);
7480 aExp = extractFloat64Exp(a);
7481 aSign = extractFloat64Sign(a);
7482 if (aSign && (aExp > 1022)) {
ff32e16e 7483 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7484 if (float64_is_any_nan(a)) {
7485 return LIT64(0xFFFFFFFFFFFFFFFF);
7486 } else {
7487 return 0;
7488 }
7489 }
7490 if (aExp) {
7491 aSig |= LIT64(0x0010000000000000);
7492 }
7493 shiftCount = 0x433 - aExp;
7494 if (shiftCount <= 0) {
7495 if (0x43E < aExp) {
ff32e16e 7496 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7497 return LIT64(0xFFFFFFFFFFFFFFFF);
7498 }
7499 aSigExtra = 0;
7500 aSig <<= -shiftCount;
7501 } else {
7502 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7503 }
ff32e16e 7504 return roundAndPackUint64(aSign, aSig, aSigExtra, status);
75d62a58
JM
7505}
7506
e5a41ffa 7507uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
75d62a58 7508{
a2f2d288 7509 signed char current_rounding_mode = status->float_rounding_mode;
ff32e16e 7510 set_float_rounding_mode(float_round_to_zero, status);
d000b477 7511 uint64_t v = float64_to_uint64(a, status);
ff32e16e 7512 set_float_rounding_mode(current_rounding_mode, status);
0a87a310 7513 return v;
75d62a58
JM
7514}
7515
1d6bda35 7516#define COMPARE(s, nan_exp) \
e5a41ffa
PM
7517static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7518 int is_quiet, float_status *status) \
1d6bda35
FB
7519{ \
7520 flag aSign, bSign; \
bb98fe42 7521 uint ## s ## _t av, bv; \
ff32e16e
PM
7522 a = float ## s ## _squash_input_denormal(a, status); \
7523 b = float ## s ## _squash_input_denormal(b, status); \
1d6bda35
FB
7524 \
7525 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7526 extractFloat ## s ## Frac( a ) ) || \
7527 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7528 extractFloat ## s ## Frac( b ) )) { \
7529 if (!is_quiet || \
af39bc8c
AM
7530 float ## s ## _is_signaling_nan(a, status) || \
7531 float ## s ## _is_signaling_nan(b, status)) { \
ff32e16e 7532 float_raise(float_flag_invalid, status); \
1d6bda35
FB
7533 } \
7534 return float_relation_unordered; \
7535 } \
7536 aSign = extractFloat ## s ## Sign( a ); \
7537 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 7538 av = float ## s ## _val(a); \
cd8a2533 7539 bv = float ## s ## _val(b); \
1d6bda35 7540 if ( aSign != bSign ) { \
bb98fe42 7541 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
7542 /* zero case */ \
7543 return float_relation_equal; \
7544 } else { \
7545 return 1 - (2 * aSign); \
7546 } \
7547 } else { \
f090c9d4 7548 if (av == bv) { \
1d6bda35
FB
7549 return float_relation_equal; \
7550 } else { \
f090c9d4 7551 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
7552 } \
7553 } \
7554} \
7555 \
e5a41ffa 7556int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
1d6bda35 7557{ \
ff32e16e 7558 return float ## s ## _compare_internal(a, b, 0, status); \
1d6bda35
FB
7559} \
7560 \
e5a41ffa
PM
7561int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7562 float_status *status) \
1d6bda35 7563{ \
ff32e16e 7564 return float ## s ## _compare_internal(a, b, 1, status); \
1d6bda35
FB
7565}
7566
7567COMPARE(32, 0xff)
7568COMPARE(64, 0x7ff)
9ee6e8bb 7569
e5a41ffa
PM
7570static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7571 int is_quiet, float_status *status)
f6714d36
AJ
7572{
7573 flag aSign, bSign;
7574
d1eb8f2a
AD
7575 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7576 float_raise(float_flag_invalid, status);
7577 return float_relation_unordered;
7578 }
f6714d36
AJ
7579 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7580 ( extractFloatx80Frac( a )<<1 ) ) ||
7581 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7582 ( extractFloatx80Frac( b )<<1 ) )) {
7583 if (!is_quiet ||
af39bc8c
AM
7584 floatx80_is_signaling_nan(a, status) ||
7585 floatx80_is_signaling_nan(b, status)) {
ff32e16e 7586 float_raise(float_flag_invalid, status);
f6714d36
AJ
7587 }
7588 return float_relation_unordered;
7589 }
7590 aSign = extractFloatx80Sign( a );
7591 bSign = extractFloatx80Sign( b );
7592 if ( aSign != bSign ) {
7593
7594 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7595 ( ( a.low | b.low ) == 0 ) ) {
7596 /* zero case */
7597 return float_relation_equal;
7598 } else {
7599 return 1 - (2 * aSign);
7600 }
7601 } else {
7602 if (a.low == b.low && a.high == b.high) {
7603 return float_relation_equal;
7604 } else {
7605 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7606 }
7607 }
7608}
7609
e5a41ffa 7610int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
f6714d36 7611{
ff32e16e 7612 return floatx80_compare_internal(a, b, 0, status);
f6714d36
AJ
7613}
7614
e5a41ffa 7615int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
f6714d36 7616{
ff32e16e 7617 return floatx80_compare_internal(a, b, 1, status);
f6714d36
AJ
7618}
7619
e5a41ffa
PM
7620static inline int float128_compare_internal(float128 a, float128 b,
7621 int is_quiet, float_status *status)
1f587329
BS
7622{
7623 flag aSign, bSign;
7624
7625 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7626 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7627 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7628 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7629 if (!is_quiet ||
af39bc8c
AM
7630 float128_is_signaling_nan(a, status) ||
7631 float128_is_signaling_nan(b, status)) {
ff32e16e 7632 float_raise(float_flag_invalid, status);
1f587329
BS
7633 }
7634 return float_relation_unordered;
7635 }
7636 aSign = extractFloat128Sign( a );
7637 bSign = extractFloat128Sign( b );
7638 if ( aSign != bSign ) {
7639 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7640 /* zero case */
7641 return float_relation_equal;
7642 } else {
7643 return 1 - (2 * aSign);
7644 }
7645 } else {
7646 if (a.low == b.low && a.high == b.high) {
7647 return float_relation_equal;
7648 } else {
7649 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7650 }
7651 }
7652}
7653
e5a41ffa 7654int float128_compare(float128 a, float128 b, float_status *status)
1f587329 7655{
ff32e16e 7656 return float128_compare_internal(a, b, 0, status);
1f587329
BS
7657}
7658
e5a41ffa 7659int float128_compare_quiet(float128 a, float128 b, float_status *status)
1f587329 7660{
ff32e16e 7661 return float128_compare_internal(a, b, 1, status);
1f587329
BS
7662}
7663
274f1b04
PM
7664/* min() and max() functions. These can't be implemented as
7665 * 'compare and pick one input' because that would mishandle
7666 * NaNs and +0 vs -0.
e17ab310
WN
7667 *
7668 * minnum() and maxnum() functions. These are similar to the min()
7669 * and max() functions but if one of the arguments is a QNaN and
7670 * the other is numerical then the numerical argument is returned.
7671 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7672 * and maxNum() operations. min() and max() are the typical min/max
7673 * semantics provided by many CPUs which predate that specification.
2d31e060
LA
7674 *
7675 * minnummag() and maxnummag() functions correspond to minNumMag()
7676 * and maxNumMag() from IEEE 754-2008.
274f1b04 7677 */
e70614ea 7678#define MINMAX(s) \
a49db98d 7679static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
2d31e060 7680 int ismin, int isieee, \
e5a41ffa
PM
7681 int ismag, \
7682 float_status *status) \
274f1b04
PM
7683{ \
7684 flag aSign, bSign; \
2d31e060 7685 uint ## s ## _t av, bv, aav, abv; \
ff32e16e
PM
7686 a = float ## s ## _squash_input_denormal(a, status); \
7687 b = float ## s ## _squash_input_denormal(b, status); \
274f1b04
PM
7688 if (float ## s ## _is_any_nan(a) || \
7689 float ## s ## _is_any_nan(b)) { \
e17ab310 7690 if (isieee) { \
af39bc8c 7691 if (float ## s ## _is_quiet_nan(a, status) && \
e17ab310
WN
7692 !float ## s ##_is_any_nan(b)) { \
7693 return b; \
af39bc8c
AM
7694 } else if (float ## s ## _is_quiet_nan(b, status) && \
7695 !float ## s ## _is_any_nan(a)) { \
e17ab310
WN
7696 return a; \
7697 } \
7698 } \
ff32e16e 7699 return propagateFloat ## s ## NaN(a, b, status); \
274f1b04
PM
7700 } \
7701 aSign = extractFloat ## s ## Sign(a); \
7702 bSign = extractFloat ## s ## Sign(b); \
7703 av = float ## s ## _val(a); \
7704 bv = float ## s ## _val(b); \
2d31e060
LA
7705 if (ismag) { \
7706 aav = float ## s ## _abs(av); \
7707 abv = float ## s ## _abs(bv); \
7708 if (aav != abv) { \
7709 if (ismin) { \
7710 return (aav < abv) ? a : b; \
7711 } else { \
7712 return (aav < abv) ? b : a; \
7713 } \
7714 } \
7715 } \
274f1b04
PM
7716 if (aSign != bSign) { \
7717 if (ismin) { \
7718 return aSign ? a : b; \
7719 } else { \
7720 return aSign ? b : a; \
7721 } \
7722 } else { \
7723 if (ismin) { \
7724 return (aSign ^ (av < bv)) ? a : b; \
7725 } else { \
7726 return (aSign ^ (av < bv)) ? b : a; \
7727 } \
7728 } \
7729} \
7730 \
e5a41ffa
PM
7731float ## s float ## s ## _min(float ## s a, float ## s b, \
7732 float_status *status) \
274f1b04 7733{ \
ff32e16e 7734 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
274f1b04
PM
7735} \
7736 \
e5a41ffa
PM
7737float ## s float ## s ## _max(float ## s a, float ## s b, \
7738 float_status *status) \
274f1b04 7739{ \
ff32e16e 7740 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
e17ab310
WN
7741} \
7742 \
e5a41ffa
PM
7743float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7744 float_status *status) \
e17ab310 7745{ \
ff32e16e 7746 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
e17ab310
WN
7747} \
7748 \
e5a41ffa
PM
7749float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7750 float_status *status) \
e17ab310 7751{ \
ff32e16e 7752 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
2d31e060
LA
7753} \
7754 \
e5a41ffa
PM
7755float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7756 float_status *status) \
2d31e060 7757{ \
ff32e16e 7758 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
2d31e060
LA
7759} \
7760 \
e5a41ffa
PM
7761float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7762 float_status *status) \
2d31e060 7763{ \
ff32e16e 7764 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
274f1b04
PM
7765}
7766
e70614ea
WN
7767MINMAX(32)
7768MINMAX(64)
274f1b04
PM
7769
7770
9ee6e8bb 7771/* Multiply A by 2 raised to the power N. */
e5a41ffa 7772float32 float32_scalbn(float32 a, int n, float_status *status)
9ee6e8bb
PB
7773{
7774 flag aSign;
326b9e98 7775 int16_t aExp;
bb98fe42 7776 uint32_t aSig;
9ee6e8bb 7777
ff32e16e 7778 a = float32_squash_input_denormal(a, status);
9ee6e8bb
PB
7779 aSig = extractFloat32Frac( a );
7780 aExp = extractFloat32Exp( a );
7781 aSign = extractFloat32Sign( a );
7782
7783 if ( aExp == 0xFF ) {
326b9e98 7784 if ( aSig ) {
ff32e16e 7785 return propagateFloat32NaN(a, a, status);
326b9e98 7786 }
9ee6e8bb
PB
7787 return a;
7788 }
3c85c37f 7789 if (aExp != 0) {
69397542 7790 aSig |= 0x00800000;
3c85c37f 7791 } else if (aSig == 0) {
69397542 7792 return a;
3c85c37f
PM
7793 } else {
7794 aExp++;
7795 }
69397542 7796
326b9e98
AJ
7797 if (n > 0x200) {
7798 n = 0x200;
7799 } else if (n < -0x200) {
7800 n = -0x200;
7801 }
7802
69397542
PB
7803 aExp += n - 1;
7804 aSig <<= 7;
ff32e16e 7805 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
9ee6e8bb
PB
7806}
7807
e5a41ffa 7808float64 float64_scalbn(float64 a, int n, float_status *status)
9ee6e8bb
PB
7809{
7810 flag aSign;
326b9e98 7811 int16_t aExp;
bb98fe42 7812 uint64_t aSig;
9ee6e8bb 7813
ff32e16e 7814 a = float64_squash_input_denormal(a, status);
9ee6e8bb
PB
7815 aSig = extractFloat64Frac( a );
7816 aExp = extractFloat64Exp( a );
7817 aSign = extractFloat64Sign( a );
7818
7819 if ( aExp == 0x7FF ) {
326b9e98 7820 if ( aSig ) {
ff32e16e 7821 return propagateFloat64NaN(a, a, status);
326b9e98 7822 }
9ee6e8bb
PB
7823 return a;
7824 }
3c85c37f 7825 if (aExp != 0) {
69397542 7826 aSig |= LIT64( 0x0010000000000000 );
3c85c37f 7827 } else if (aSig == 0) {
69397542 7828 return a;
3c85c37f
PM
7829 } else {
7830 aExp++;
7831 }
69397542 7832
326b9e98
AJ
7833 if (n > 0x1000) {
7834 n = 0x1000;
7835 } else if (n < -0x1000) {
7836 n = -0x1000;
7837 }
7838
69397542
PB
7839 aExp += n - 1;
7840 aSig <<= 10;
ff32e16e 7841 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
9ee6e8bb
PB
7842}
7843
e5a41ffa 7844floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
9ee6e8bb
PB
7845{
7846 flag aSign;
326b9e98 7847 int32_t aExp;
bb98fe42 7848 uint64_t aSig;
9ee6e8bb 7849
d1eb8f2a
AD
7850 if (floatx80_invalid_encoding(a)) {
7851 float_raise(float_flag_invalid, status);
7852 return floatx80_default_nan(status);
7853 }
9ee6e8bb
PB
7854 aSig = extractFloatx80Frac( a );
7855 aExp = extractFloatx80Exp( a );
7856 aSign = extractFloatx80Sign( a );
7857
326b9e98
AJ
7858 if ( aExp == 0x7FFF ) {
7859 if ( aSig<<1 ) {
ff32e16e 7860 return propagateFloatx80NaN(a, a, status);
326b9e98 7861 }
9ee6e8bb
PB
7862 return a;
7863 }
326b9e98 7864
3c85c37f
PM
7865 if (aExp == 0) {
7866 if (aSig == 0) {
7867 return a;
7868 }
7869 aExp++;
7870 }
69397542 7871
326b9e98
AJ
7872 if (n > 0x10000) {
7873 n = 0x10000;
7874 } else if (n < -0x10000) {
7875 n = -0x10000;
7876 }
7877
9ee6e8bb 7878 aExp += n;
a2f2d288
PM
7879 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7880 aSign, aExp, aSig, 0, status);
9ee6e8bb 7881}
9ee6e8bb 7882
e5a41ffa 7883float128 float128_scalbn(float128 a, int n, float_status *status)
9ee6e8bb
PB
7884{
7885 flag aSign;
326b9e98 7886 int32_t aExp;
bb98fe42 7887 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7888
7889 aSig1 = extractFloat128Frac1( a );
7890 aSig0 = extractFloat128Frac0( a );
7891 aExp = extractFloat128Exp( a );
7892 aSign = extractFloat128Sign( a );
7893 if ( aExp == 0x7FFF ) {
326b9e98 7894 if ( aSig0 | aSig1 ) {
ff32e16e 7895 return propagateFloat128NaN(a, a, status);
326b9e98 7896 }
9ee6e8bb
PB
7897 return a;
7898 }
3c85c37f 7899 if (aExp != 0) {
69397542 7900 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7901 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7902 return a;
3c85c37f
PM
7903 } else {
7904 aExp++;
7905 }
69397542 7906
326b9e98
AJ
7907 if (n > 0x10000) {
7908 n = 0x10000;
7909 } else if (n < -0x10000) {
7910 n = -0x10000;
7911 }
7912
69397542
PB
7913 aExp += n - 1;
7914 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
ff32e16e 7915 , status);
9ee6e8bb
PB
7916
7917}