]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
fpu/softfloat: re-factor round_to_int
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
16017c48
PM
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
8d725fac 16 */
158142c2 17
a7d1ac78
PM
18/*
19===============================================================================
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
158142c2
FB
22
23Written by John R. Hauser. This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704. Funding was partially provided by the
26National Science Foundation under grant MIP-9311980. The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek. More information
a7d1ac78 30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
158142c2
FB
31arithmetic/SoftFloat.html'.
32
a7d1ac78
PM
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
158142c2
FB
38
39Derivative works are acceptable, even for commercial purposes, so long as
a7d1ac78
PM
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
158142c2 43
a7d1ac78
PM
44===============================================================================
45*/
158142c2 46
16017c48
PM
47/* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78/* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
2ac8bd03
PM
82/* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
d38ea87a 85#include "qemu/osdep.h"
6fff2167 86#include "qemu/bitops.h"
6b4c305c 87#include "fpu/softfloat.h"
158142c2 88
dc355b76 89/* We only need stdlib for abort() */
dc355b76 90
158142c2
FB
91/*----------------------------------------------------------------------------
92| Primitive arithmetic functions, including multi-word arithmetic, and
93| division and square root approximations. (Can be specialized to target if
94| desired.)
95*----------------------------------------------------------------------------*/
96#include "softfloat-macros.h"
97
98/*----------------------------------------------------------------------------
99| Functions and definitions to determine: (1) whether tininess for underflow
100| is detected before or after rounding by default, (2) what (if anything)
101| happens when exceptions are raised, (3) how signaling NaNs are distinguished
102| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103| are propagated from function inputs to output. These details are target-
104| specific.
105*----------------------------------------------------------------------------*/
106#include "softfloat-specialize.h"
107
bb4d4bb3
PM
108/*----------------------------------------------------------------------------
109| Returns the fraction bits of the half-precision floating-point value `a'.
110*----------------------------------------------------------------------------*/
111
a49db98d 112static inline uint32_t extractFloat16Frac(float16 a)
bb4d4bb3
PM
113{
114 return float16_val(a) & 0x3ff;
115}
116
117/*----------------------------------------------------------------------------
118| Returns the exponent bits of the half-precision floating-point value `a'.
119*----------------------------------------------------------------------------*/
120
0c48262d 121static inline int extractFloat16Exp(float16 a)
bb4d4bb3
PM
122{
123 return (float16_val(a) >> 10) & 0x1f;
124}
125
126/*----------------------------------------------------------------------------
127| Returns the sign bit of the single-precision floating-point value `a'.
128*----------------------------------------------------------------------------*/
129
a49db98d 130static inline flag extractFloat16Sign(float16 a)
bb4d4bb3
PM
131{
132 return float16_val(a)>>15;
133}
134
d97544c9
AB
135/*----------------------------------------------------------------------------
136| Returns the fraction bits of the single-precision floating-point value `a'.
137*----------------------------------------------------------------------------*/
138
139static inline uint32_t extractFloat32Frac(float32 a)
140{
141 return float32_val(a) & 0x007FFFFF;
142}
143
144/*----------------------------------------------------------------------------
145| Returns the exponent bits of the single-precision floating-point value `a'.
146*----------------------------------------------------------------------------*/
147
148static inline int extractFloat32Exp(float32 a)
149{
150 return (float32_val(a) >> 23) & 0xFF;
151}
152
153/*----------------------------------------------------------------------------
154| Returns the sign bit of the single-precision floating-point value `a'.
155*----------------------------------------------------------------------------*/
156
157static inline flag extractFloat32Sign(float32 a)
158{
159 return float32_val(a) >> 31;
160}
161
162/*----------------------------------------------------------------------------
163| Returns the fraction bits of the double-precision floating-point value `a'.
164*----------------------------------------------------------------------------*/
165
166static inline uint64_t extractFloat64Frac(float64 a)
167{
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169}
170
171/*----------------------------------------------------------------------------
172| Returns the exponent bits of the double-precision floating-point value `a'.
173*----------------------------------------------------------------------------*/
174
175static inline int extractFloat64Exp(float64 a)
176{
177 return (float64_val(a) >> 52) & 0x7FF;
178}
179
180/*----------------------------------------------------------------------------
181| Returns the sign bit of the double-precision floating-point value `a'.
182*----------------------------------------------------------------------------*/
183
184static inline flag extractFloat64Sign(float64 a)
185{
186 return float64_val(a) >> 63;
187}
188
a90119b5
AB
/*
 * Classification of a decomposed floating point number.
 *
 * The enumerator order matters: every value from float_class_qnan
 * upwards is some kind of NaN, so "cls >= float_class_qnan" tests for
 * any NaN.  The enum is packed so that it occupies as little space as
 * possible inside FloatParts.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,   /* raw parts, not yet canonicalized */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,           /* all NaNs from here */
    float_class_snan,
    float_class_dnan,           /* result is packed as the default NaN */
    float_class_msnan,          /* maybe silenced when packed */
} FloatClass;
204
205/*
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
210 *
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
214 */
215
216typedef struct {
217 uint64_t frac;
218 int32_t exp;
219 FloatClass cls;
220 bool sign;
221} FloatParts;
222
/*
 * Position of the binary point inside the 64-bit decomposed fraction.
 * The implicit (integer) bit of a normalized fraction sits at that
 * position; the bit above it is used to detect carry-out.
 */
#define DECOMPOSED_BINARY_POINT (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
} FloatFmt;

/*
 * Expand fields based on the size of exponent (E) and fraction (F).
 * Both arguments are parenthesised on every use so that callers may
 * pass arbitrary constant expressions, not just plain literals.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1

/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
273
6fff2167
AB
274/* Unpack a float to parts, but do not canonicalize. */
275static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276{
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
278
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
284 };
285}
286
287static inline FloatParts float16_unpack_raw(float16 f)
288{
289 return unpack_raw(float16_params, f);
290}
291
292static inline FloatParts float32_unpack_raw(float32 f)
293{
294 return unpack_raw(float32_params, f);
295}
296
297static inline FloatParts float64_unpack_raw(float64 f)
298{
299 return unpack_raw(float64_params, f);
300}
301
302/* Pack a float from parts, but do not canonicalize. */
303static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304{
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
308}
309
310static inline float16 float16_pack_raw(FloatParts p)
311{
312 return make_float16(pack_raw(float16_params, p));
313}
314
315static inline float32 float32_pack_raw(FloatParts p)
316{
317 return make_float32(pack_raw(float32_params, p));
318}
319
320static inline float64 float64_pack_raw(FloatParts p)
321{
322 return make_float64(pack_raw(float64_params, p));
323}
324
325/* Canonicalize EXP and FRAC, setting CLS. */
326static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
328{
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
332 } else {
333#ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
335#else
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
339 } else {
340 part.cls = float_class_qnan;
341 }
342#endif
343 }
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
350 part.frac = 0;
351 } else {
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355 part.frac <<= shift;
356 }
357 } else {
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361 }
362 return part;
363}
364
365/* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369 */
370
371static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
373{
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
379 uint64_t frac, inc;
380 int exp, flags = 0;
381 bool overflow_norm;
382
383 frac = p.frac;
384 exp = p.exp;
385
386 switch (p.cls) {
387 case float_class_normal:
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392 break;
393 case float_round_ties_away:
394 overflow_norm = false;
395 inc = frac_lsbm1;
396 break;
397 case float_round_to_zero:
398 overflow_norm = true;
399 inc = 0;
400 break;
401 case float_round_up:
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
404 break;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
408 break;
409 default:
410 g_assert_not_reached();
411 }
412
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
417 frac += inc;
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419 frac >>= 1;
420 exp++;
421 }
422 }
423 frac >>= frac_shift;
424
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
427 if (overflow_norm) {
428 exp = exp_max - 1;
429 frac = -1;
430 } else {
431 p.cls = float_class_inf;
432 goto do_inf;
433 }
434 }
435 } else if (s->flush_to_zero) {
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
438 goto do_zero;
439 } else {
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
442 || (exp < 0)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
450 ? frac_lsbm1 : 0);
451 }
452 flags |= float_flag_inexact;
453 frac += inc;
454 }
455
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457 frac >>= frac_shift;
458
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
461 }
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
464 }
465 }
466 break;
467
468 case float_class_zero:
469 do_zero:
470 exp = 0;
471 frac = 0;
472 break;
473
474 case float_class_inf:
475 do_inf:
476 exp = exp_max;
477 frac = 0;
478 break;
479
480 case float_class_qnan:
481 case float_class_snan:
482 exp = exp_max;
483 break;
484
485 default:
486 g_assert_not_reached();
487 }
488
489 float_raise(flags, s);
490 p.exp = exp;
491 p.frac = frac;
492 return p;
493}
494
495static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496{
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
498}
499
500static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501{
502 switch (p.cls) {
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
507 default:
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
510 }
511}
512
513static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514{
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
516}
517
518static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519{
520 switch (p.cls) {
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
525 default:
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
528 }
529}
530
531static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532{
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
534}
535
536static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537{
538 switch (p.cls) {
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
543 default:
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
546 }
547}
548
549/* Simple helpers for checking if what NaN we have */
550static bool is_nan(FloatClass c)
551{
552 return unlikely(c >= float_class_qnan);
553}
554static bool is_snan(FloatClass c)
555{
556 return c == float_class_snan;
557}
558static bool is_qnan(FloatClass c)
559{
560 return c == float_class_qnan;
561}
562
dbe4d53a
AB
563static FloatParts return_nan(FloatParts a, float_status *s)
564{
565 switch (a.cls) {
566 case float_class_snan:
567 s->float_exception_flags |= float_flag_invalid;
568 a.cls = float_class_msnan;
569 /* fall through */
570 case float_class_qnan:
571 if (s->default_nan_mode) {
572 a.cls = float_class_dnan;
573 }
574 break;
575
576 default:
577 g_assert_not_reached();
578 }
579 return a;
580}
581
6fff2167
AB
582static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
583{
584 if (is_snan(a.cls) || is_snan(b.cls)) {
585 s->float_exception_flags |= float_flag_invalid;
586 }
587
588 if (s->default_nan_mode) {
589 a.cls = float_class_dnan;
590 } else {
591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592 is_qnan(b.cls), is_snan(b.cls),
593 a.frac > b.frac ||
594 (a.frac == b.frac && a.sign < b.sign))) {
595 a = b;
596 }
597 a.cls = float_class_msnan;
598 }
599 return a;
600}
601
d446830a
AB
602static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603 bool inf_zero, float_status *s)
604{
605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606 s->float_exception_flags |= float_flag_invalid;
607 }
608
609 if (s->default_nan_mode) {
610 a.cls = float_class_dnan;
611 } else {
612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613 is_qnan(b.cls), is_snan(b.cls),
614 is_qnan(c.cls), is_snan(c.cls),
615 inf_zero, s)) {
616 case 0:
617 break;
618 case 1:
619 a = b;
620 break;
621 case 2:
622 a = c;
623 break;
624 case 3:
625 a.cls = float_class_dnan;
626 return a;
627 default:
628 g_assert_not_reached();
629 }
630
631 a.cls = float_class_msnan;
632 }
633 return a;
634}
635
6fff2167
AB
636/*
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
640 * Arithmetic.
641 */
642
643static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644 float_status *s)
645{
646 bool a_sign = a.sign;
647 bool b_sign = b.sign ^ subtract;
648
649 if (a_sign != b_sign) {
650 /* Subtraction */
651
652 if (a.cls == float_class_normal && b.cls == float_class_normal) {
653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655 a.frac = a.frac - b.frac;
656 } else {
657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658 a.frac = b.frac - a.frac;
659 a.exp = b.exp;
660 a_sign ^= 1;
661 }
662
663 if (a.frac == 0) {
664 a.cls = float_class_zero;
665 a.sign = s->float_rounding_mode == float_round_down;
666 } else {
667 int shift = clz64(a.frac) - 1;
668 a.frac = a.frac << shift;
669 a.exp = a.exp - shift;
670 a.sign = a_sign;
671 }
672 return a;
673 }
674 if (is_nan(a.cls) || is_nan(b.cls)) {
675 return pick_nan(a, b, s);
676 }
677 if (a.cls == float_class_inf) {
678 if (b.cls == float_class_inf) {
679 float_raise(float_flag_invalid, s);
680 a.cls = float_class_dnan;
681 }
682 return a;
683 }
684 if (a.cls == float_class_zero && b.cls == float_class_zero) {
685 a.sign = s->float_rounding_mode == float_round_down;
686 return a;
687 }
688 if (a.cls == float_class_zero || b.cls == float_class_inf) {
689 b.sign = a_sign ^ 1;
690 return b;
691 }
692 if (b.cls == float_class_zero) {
693 return a;
694 }
695 } else {
696 /* Addition */
697 if (a.cls == float_class_normal && b.cls == float_class_normal) {
698 if (a.exp > b.exp) {
699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700 } else if (a.exp < b.exp) {
701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702 a.exp = b.exp;
703 }
704 a.frac += b.frac;
705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706 a.frac >>= 1;
707 a.exp += 1;
708 }
709 return a;
710 }
711 if (is_nan(a.cls) || is_nan(b.cls)) {
712 return pick_nan(a, b, s);
713 }
714 if (a.cls == float_class_inf || b.cls == float_class_zero) {
715 return a;
716 }
717 if (b.cls == float_class_inf || a.cls == float_class_zero) {
718 b.sign = b_sign;
719 return b;
720 }
721 }
722 g_assert_not_reached();
723}
724
725/*
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729 */
730
731float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
732 float_status *status)
733{
734 FloatParts pa = float16_unpack_canonical(a, status);
735 FloatParts pb = float16_unpack_canonical(b, status);
736 FloatParts pr = addsub_floats(pa, pb, false, status);
737
738 return float16_round_pack_canonical(pr, status);
739}
740
741float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742 float_status *status)
743{
744 FloatParts pa = float32_unpack_canonical(a, status);
745 FloatParts pb = float32_unpack_canonical(b, status);
746 FloatParts pr = addsub_floats(pa, pb, false, status);
747
748 return float32_round_pack_canonical(pr, status);
749}
750
751float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752 float_status *status)
753{
754 FloatParts pa = float64_unpack_canonical(a, status);
755 FloatParts pb = float64_unpack_canonical(b, status);
756 FloatParts pr = addsub_floats(pa, pb, false, status);
757
758 return float64_round_pack_canonical(pr, status);
759}
760
761float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762 float_status *status)
763{
764 FloatParts pa = float16_unpack_canonical(a, status);
765 FloatParts pb = float16_unpack_canonical(b, status);
766 FloatParts pr = addsub_floats(pa, pb, true, status);
767
768 return float16_round_pack_canonical(pr, status);
769}
770
771float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772 float_status *status)
773{
774 FloatParts pa = float32_unpack_canonical(a, status);
775 FloatParts pb = float32_unpack_canonical(b, status);
776 FloatParts pr = addsub_floats(pa, pb, true, status);
777
778 return float32_round_pack_canonical(pr, status);
779}
780
781float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782 float_status *status)
783{
784 FloatParts pa = float64_unpack_canonical(a, status);
785 FloatParts pb = float64_unpack_canonical(b, status);
786 FloatParts pr = addsub_floats(pa, pb, true, status);
787
788 return float64_round_pack_canonical(pr, status);
789}
790
74d707e2
AB
791/*
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
795 */
796
797static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
798{
799 bool sign = a.sign ^ b.sign;
800
801 if (a.cls == float_class_normal && b.cls == float_class_normal) {
802 uint64_t hi, lo;
803 int exp = a.exp + b.exp;
804
805 mul64To128(a.frac, b.frac, &hi, &lo);
806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807 if (lo & DECOMPOSED_OVERFLOW_BIT) {
808 shift64RightJamming(lo, 1, &lo);
809 exp += 1;
810 }
811
812 /* Re-use a */
813 a.exp = exp;
814 a.sign = sign;
815 a.frac = lo;
816 return a;
817 }
818 /* handle all the NaN cases */
819 if (is_nan(a.cls) || is_nan(b.cls)) {
820 return pick_nan(a, b, s);
821 }
822 /* Inf * Zero == NaN */
823 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824 (a.cls == float_class_zero && b.cls == float_class_inf)) {
825 s->float_exception_flags |= float_flag_invalid;
826 a.cls = float_class_dnan;
827 a.sign = sign;
828 return a;
829 }
830 /* Multiply by 0 or Inf */
831 if (a.cls == float_class_inf || a.cls == float_class_zero) {
832 a.sign = sign;
833 return a;
834 }
835 if (b.cls == float_class_inf || b.cls == float_class_zero) {
836 b.sign = sign;
837 return b;
838 }
839 g_assert_not_reached();
840}
841
842float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843 float_status *status)
844{
845 FloatParts pa = float16_unpack_canonical(a, status);
846 FloatParts pb = float16_unpack_canonical(b, status);
847 FloatParts pr = mul_floats(pa, pb, status);
848
849 return float16_round_pack_canonical(pr, status);
850}
851
852float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853 float_status *status)
854{
855 FloatParts pa = float32_unpack_canonical(a, status);
856 FloatParts pb = float32_unpack_canonical(b, status);
857 FloatParts pr = mul_floats(pa, pb, status);
858
859 return float32_round_pack_canonical(pr, status);
860}
861
862float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863 float_status *status)
864{
865 FloatParts pa = float64_unpack_canonical(a, status);
866 FloatParts pb = float64_unpack_canonical(b, status);
867 FloatParts pr = mul_floats(pa, pb, status);
868
869 return float64_round_pack_canonical(pr, status);
870}
871
d446830a
AB
872/*
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
881 * NaNs.)
882 */
883
884static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885 int flags, float_status *s)
886{
887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888 ((1 << float_class_inf) | (1 << float_class_zero));
889 bool p_sign;
890 bool sign_flip = flags & float_muladd_negate_result;
891 FloatClass p_class;
892 uint64_t hi, lo;
893 int p_exp;
894
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
899 */
900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901 return pick_nan_muladd(a, b, c, inf_zero, s);
902 }
903
904 if (inf_zero) {
905 s->float_exception_flags |= float_flag_invalid;
906 a.cls = float_class_dnan;
907 return a;
908 }
909
910 if (flags & float_muladd_negate_c) {
911 c.sign ^= 1;
912 }
913
914 p_sign = a.sign ^ b.sign;
915
916 if (flags & float_muladd_negate_product) {
917 p_sign ^= 1;
918 }
919
920 if (a.cls == float_class_inf || b.cls == float_class_inf) {
921 p_class = float_class_inf;
922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923 p_class = float_class_zero;
924 } else {
925 p_class = float_class_normal;
926 }
927
928 if (c.cls == float_class_inf) {
929 if (p_class == float_class_inf && p_sign != c.sign) {
930 s->float_exception_flags |= float_flag_invalid;
931 a.cls = float_class_dnan;
932 } else {
933 a.cls = float_class_inf;
934 a.sign = c.sign ^ sign_flip;
935 }
936 return a;
937 }
938
939 if (p_class == float_class_inf) {
940 a.cls = float_class_inf;
941 a.sign = p_sign ^ sign_flip;
942 return a;
943 }
944
945 if (p_class == float_class_zero) {
946 if (c.cls == float_class_zero) {
947 if (p_sign != c.sign) {
948 p_sign = s->float_rounding_mode == float_round_down;
949 }
950 c.sign = p_sign;
951 } else if (flags & float_muladd_halve_result) {
952 c.exp -= 1;
953 }
954 c.sign ^= sign_flip;
955 return c;
956 }
957
958 /* a & b should be normals now... */
959 assert(a.cls == float_class_normal &&
960 b.cls == float_class_normal);
961
962 p_exp = a.exp + b.exp;
963
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965 * result.
966 */
967 mul64To128(a.frac, b.frac, &hi, &lo);
968 /* binary point now at bit 124 */
969
970 /* check for overflow */
971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972 shift128RightJamming(hi, lo, 1, &hi, &lo);
973 p_exp += 1;
974 }
975
976 /* + add/sub */
977 if (c.cls == float_class_zero) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980 } else {
981 int exp_diff = p_exp - c.exp;
982 if (p_sign == c.sign) {
983 /* Addition */
984 if (exp_diff <= 0) {
985 shift128RightJamming(hi, lo,
986 DECOMPOSED_BINARY_POINT - exp_diff,
987 &hi, &lo);
988 lo += c.frac;
989 p_exp = c.exp;
990 } else {
991 uint64_t c_hi, c_lo;
992 /* shift c to the same binary point as the product (124) */
993 c_hi = c.frac >> 2;
994 c_lo = 0;
995 shift128RightJamming(c_hi, c_lo,
996 exp_diff,
997 &c_hi, &c_lo);
998 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001 }
1002
1003 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004 shift64RightJamming(lo, 1, &lo);
1005 p_exp += 1;
1006 }
1007
1008 } else {
1009 /* Subtraction */
1010 uint64_t c_hi, c_lo;
1011 /* make C binary point match product at bit 124 */
1012 c_hi = c.frac >> 2;
1013 c_lo = 0;
1014
1015 if (exp_diff <= 0) {
1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017 if (exp_diff == 0
1018 &&
1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021 } else {
1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023 p_sign ^= 1;
1024 p_exp = c.exp;
1025 }
1026 } else {
1027 shift128RightJamming(c_hi, c_lo,
1028 exp_diff,
1029 &c_hi, &c_lo);
1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031 }
1032
1033 if (hi == 0 && lo == 0) {
1034 a.cls = float_class_zero;
1035 a.sign = s->float_rounding_mode == float_round_down;
1036 a.sign ^= sign_flip;
1037 return a;
1038 } else {
1039 int shift;
1040 if (hi != 0) {
1041 shift = clz64(hi);
1042 } else {
1043 shift = clz64(lo) + 64;
1044 }
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1051 shift -= 1;
1052 if (shift >= 64) {
1053 lo = lo << (shift - 64);
1054 } else {
1055 hi = (hi << shift) | (lo >> (64 - shift));
1056 lo = hi | ((lo << shift) != 0);
1057 }
1058 p_exp -= shift - 2;
1059 }
1060 }
1061 }
1062
1063 if (flags & float_muladd_halve_result) {
1064 p_exp -= 1;
1065 }
1066
1067 /* finally prepare our result */
1068 a.cls = float_class_normal;
1069 a.sign = p_sign ^ sign_flip;
1070 a.exp = p_exp;
1071 a.frac = lo;
1072
1073 return a;
1074}
1075
1076float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077 int flags, float_status *status)
1078{
1079 FloatParts pa = float16_unpack_canonical(a, status);
1080 FloatParts pb = float16_unpack_canonical(b, status);
1081 FloatParts pc = float16_unpack_canonical(c, status);
1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083
1084 return float16_round_pack_canonical(pr, status);
1085}
1086
1087float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088 int flags, float_status *status)
1089{
1090 FloatParts pa = float32_unpack_canonical(a, status);
1091 FloatParts pb = float32_unpack_canonical(b, status);
1092 FloatParts pc = float32_unpack_canonical(c, status);
1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094
1095 return float32_round_pack_canonical(pr, status);
1096}
1097
1098float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099 int flags, float_status *status)
1100{
1101 FloatParts pa = float64_unpack_canonical(a, status);
1102 FloatParts pb = float64_unpack_canonical(b, status);
1103 FloatParts pc = float64_unpack_canonical(c, status);
1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105
1106 return float64_round_pack_canonical(pr, status);
1107}
1108
cf07323d
AB
1109/*
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 */
1114
1115static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116{
1117 bool sign = a.sign ^ b.sign;
1118
1119 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120 uint64_t temp_lo, temp_hi;
1121 int exp = a.exp - b.exp;
1122 if (a.frac < b.frac) {
1123 exp -= 1;
1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125 &temp_hi, &temp_lo);
1126 } else {
1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128 &temp_hi, &temp_lo);
1129 }
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133 a.sign = sign;
1134 a.exp = exp;
1135 return a;
1136 }
1137 /* handle all the NaN cases */
1138 if (is_nan(a.cls) || is_nan(b.cls)) {
1139 return pick_nan(a, b, s);
1140 }
1141 /* 0/0 or Inf/Inf */
1142 if (a.cls == b.cls
1143 &&
1144 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145 s->float_exception_flags |= float_flag_invalid;
1146 a.cls = float_class_dnan;
1147 return a;
1148 }
1149 /* Div 0 => Inf */
1150 if (b.cls == float_class_zero) {
1151 s->float_exception_flags |= float_flag_divbyzero;
1152 a.cls = float_class_inf;
1153 a.sign = sign;
1154 return a;
1155 }
1156 /* Inf / x or 0 / x */
1157 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158 a.sign = sign;
1159 return a;
1160 }
1161 /* Div by Inf */
1162 if (b.cls == float_class_inf) {
1163 a.cls = float_class_zero;
1164 a.sign = sign;
1165 return a;
1166 }
1167 g_assert_not_reached();
1168}
1169
1170float16 float16_div(float16 a, float16 b, float_status *status)
1171{
1172 FloatParts pa = float16_unpack_canonical(a, status);
1173 FloatParts pb = float16_unpack_canonical(b, status);
1174 FloatParts pr = div_floats(pa, pb, status);
1175
1176 return float16_round_pack_canonical(pr, status);
1177}
1178
1179float32 float32_div(float32 a, float32 b, float_status *status)
1180{
1181 FloatParts pa = float32_unpack_canonical(a, status);
1182 FloatParts pb = float32_unpack_canonical(b, status);
1183 FloatParts pr = div_floats(pa, pb, status);
1184
1185 return float32_round_pack_canonical(pr, status);
1186}
1187
1188float64 float64_div(float64 a, float64 b, float_status *status)
1189{
1190 FloatParts pa = float64_unpack_canonical(a, status);
1191 FloatParts pb = float64_unpack_canonical(b, status);
1192 FloatParts pr = div_floats(pa, pb, status);
1193
1194 return float64_round_pack_canonical(pr, status);
1195}
1196
dbe4d53a
AB
1197/*
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1201 * Arithmetic.
1202 */
1203
1204static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205{
1206 if (is_nan(a.cls)) {
1207 return return_nan(a, s);
1208 }
1209
1210 switch (a.cls) {
1211 case float_class_zero:
1212 case float_class_inf:
1213 case float_class_qnan:
1214 /* already "integral" */
1215 break;
1216 case float_class_normal:
1217 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218 /* already integral */
1219 break;
1220 }
1221 if (a.exp < 0) {
1222 bool one;
1223 /* all fractional */
1224 s->float_exception_flags |= float_flag_inexact;
1225 switch (rounding_mode) {
1226 case float_round_nearest_even:
1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228 break;
1229 case float_round_ties_away:
1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231 break;
1232 case float_round_to_zero:
1233 one = false;
1234 break;
1235 case float_round_up:
1236 one = !a.sign;
1237 break;
1238 case float_round_down:
1239 one = a.sign;
1240 break;
1241 default:
1242 g_assert_not_reached();
1243 }
1244
1245 if (one) {
1246 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247 a.exp = 0;
1248 } else {
1249 a.cls = float_class_zero;
1250 }
1251 } else {
1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253 uint64_t frac_lsbm1 = frac_lsb >> 1;
1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255 uint64_t rnd_mask = rnd_even_mask >> 1;
1256 uint64_t inc;
1257
1258 switch (rounding_mode) {
1259 case float_round_nearest_even:
1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261 break;
1262 case float_round_ties_away:
1263 inc = frac_lsbm1;
1264 break;
1265 case float_round_to_zero:
1266 inc = 0;
1267 break;
1268 case float_round_up:
1269 inc = a.sign ? 0 : rnd_mask;
1270 break;
1271 case float_round_down:
1272 inc = a.sign ? rnd_mask : 0;
1273 break;
1274 default:
1275 g_assert_not_reached();
1276 }
1277
1278 if (a.frac & rnd_mask) {
1279 s->float_exception_flags |= float_flag_inexact;
1280 a.frac += inc;
1281 a.frac &= ~rnd_mask;
1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283 a.frac >>= 1;
1284 a.exp++;
1285 }
1286 }
1287 }
1288 break;
1289 default:
1290 g_assert_not_reached();
1291 }
1292 return a;
1293}
1294
1295float16 float16_round_to_int(float16 a, float_status *s)
1296{
1297 FloatParts pa = float16_unpack_canonical(a, s);
1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299 return float16_round_pack_canonical(pr, s);
1300}
1301
1302float32 float32_round_to_int(float32 a, float_status *s)
1303{
1304 FloatParts pa = float32_unpack_canonical(a, s);
1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306 return float32_round_pack_canonical(pr, s);
1307}
1308
1309float64 float64_round_to_int(float64 a, float_status *s)
1310{
1311 FloatParts pa = float64_unpack_canonical(a, s);
1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313 return float64_round_pack_canonical(pr, s);
1314}
1315
1316float64 float64_trunc_to_int(float64 a, float_status *s)
1317{
1318 FloatParts pa = float64_unpack_canonical(a, s);
1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320 return float64_round_pack_canonical(pr, s);
1321}
1322
158142c2
FB
1323/*----------------------------------------------------------------------------
1324| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1325| and 7, and returns the properly rounded 32-bit integer corresponding to the
1326| input. If `zSign' is 1, the input is negated before being converted to an
1327| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
1328| is simply rounded to an integer, with the inexact exception raised if the
1329| input cannot be represented exactly as an integer. However, if the fixed-
1330| point input is too large, the invalid exception is raised and the largest
1331| positive or negative integer is returned.
1332*----------------------------------------------------------------------------*/
1333
f4014512 1334static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
158142c2 1335{
8f506c70 1336 int8_t roundingMode;
158142c2 1337 flag roundNearestEven;
8f506c70 1338 int8_t roundIncrement, roundBits;
760e1416 1339 int32_t z;
158142c2 1340
a2f2d288 1341 roundingMode = status->float_rounding_mode;
158142c2 1342 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1343 switch (roundingMode) {
1344 case float_round_nearest_even:
f9288a76 1345 case float_round_ties_away:
dc355b76
PM
1346 roundIncrement = 0x40;
1347 break;
1348 case float_round_to_zero:
1349 roundIncrement = 0;
1350 break;
1351 case float_round_up:
1352 roundIncrement = zSign ? 0 : 0x7f;
1353 break;
1354 case float_round_down:
1355 roundIncrement = zSign ? 0x7f : 0;
1356 break;
1357 default:
1358 abort();
158142c2
FB
1359 }
1360 roundBits = absZ & 0x7F;
1361 absZ = ( absZ + roundIncrement )>>7;
1362 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1363 z = absZ;
1364 if ( zSign ) z = - z;
1365 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
ff32e16e 1366 float_raise(float_flag_invalid, status);
bb98fe42 1367 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2 1368 }
a2f2d288
PM
1369 if (roundBits) {
1370 status->float_exception_flags |= float_flag_inexact;
1371 }
158142c2
FB
1372 return z;
1373
1374}
1375
1376/*----------------------------------------------------------------------------
1377| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1378| `absZ1', with binary point between bits 63 and 64 (between the input words),
1379| and returns the properly rounded 64-bit integer corresponding to the input.
1380| If `zSign' is 1, the input is negated before being converted to an integer.
1381| Ordinarily, the fixed-point input is simply rounded to an integer, with
1382| the inexact exception raised if the input cannot be represented exactly as
1383| an integer. However, if the fixed-point input is too large, the invalid
1384| exception is raised and the largest positive or negative integer is
1385| returned.
1386*----------------------------------------------------------------------------*/
1387
f42c2224 1388static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
e5a41ffa 1389 float_status *status)
158142c2 1390{
8f506c70 1391 int8_t roundingMode;
158142c2 1392 flag roundNearestEven, increment;
760e1416 1393 int64_t z;
158142c2 1394
a2f2d288 1395 roundingMode = status->float_rounding_mode;
158142c2 1396 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1397 switch (roundingMode) {
1398 case float_round_nearest_even:
f9288a76 1399 case float_round_ties_away:
dc355b76
PM
1400 increment = ((int64_t) absZ1 < 0);
1401 break;
1402 case float_round_to_zero:
1403 increment = 0;
1404 break;
1405 case float_round_up:
1406 increment = !zSign && absZ1;
1407 break;
1408 case float_round_down:
1409 increment = zSign && absZ1;
1410 break;
1411 default:
1412 abort();
158142c2
FB
1413 }
1414 if ( increment ) {
1415 ++absZ0;
1416 if ( absZ0 == 0 ) goto overflow;
bb98fe42 1417 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
1418 }
1419 z = absZ0;
1420 if ( zSign ) z = - z;
1421 if ( z && ( ( z < 0 ) ^ zSign ) ) {
1422 overflow:
ff32e16e 1423 float_raise(float_flag_invalid, status);
158142c2 1424 return
bb98fe42 1425 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
1426 : LIT64( 0x7FFFFFFFFFFFFFFF );
1427 }
a2f2d288
PM
1428 if (absZ1) {
1429 status->float_exception_flags |= float_flag_inexact;
1430 }
158142c2
FB
1431 return z;
1432
1433}
1434
fb3ea83a
TM
1435/*----------------------------------------------------------------------------
1436| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1437| `absZ1', with binary point between bits 63 and 64 (between the input words),
1438| and returns the properly rounded 64-bit unsigned integer corresponding to the
1439| input. Ordinarily, the fixed-point input is simply rounded to an integer,
1440| with the inexact exception raised if the input cannot be represented exactly
1441| as an integer. However, if the fixed-point input is too large, the invalid
1442| exception is raised and the largest unsigned integer is returned.
1443*----------------------------------------------------------------------------*/
1444
f42c2224 1445static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
e5a41ffa 1446 uint64_t absZ1, float_status *status)
fb3ea83a 1447{
8f506c70 1448 int8_t roundingMode;
fb3ea83a
TM
1449 flag roundNearestEven, increment;
1450
a2f2d288 1451 roundingMode = status->float_rounding_mode;
fb3ea83a 1452 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
1453 switch (roundingMode) {
1454 case float_round_nearest_even:
f9288a76 1455 case float_round_ties_away:
dc355b76
PM
1456 increment = ((int64_t)absZ1 < 0);
1457 break;
1458 case float_round_to_zero:
1459 increment = 0;
1460 break;
1461 case float_round_up:
1462 increment = !zSign && absZ1;
1463 break;
1464 case float_round_down:
1465 increment = zSign && absZ1;
1466 break;
1467 default:
1468 abort();
fb3ea83a
TM
1469 }
1470 if (increment) {
1471 ++absZ0;
1472 if (absZ0 == 0) {
ff32e16e 1473 float_raise(float_flag_invalid, status);
fb3ea83a
TM
1474 return LIT64(0xFFFFFFFFFFFFFFFF);
1475 }
1476 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
1477 }
1478
1479 if (zSign && absZ0) {
ff32e16e 1480 float_raise(float_flag_invalid, status);
fb3ea83a
TM
1481 return 0;
1482 }
1483
1484 if (absZ1) {
a2f2d288 1485 status->float_exception_flags |= float_flag_inexact;
fb3ea83a
TM
1486 }
1487 return absZ0;
1488}
1489
37d18660
PM
1490/*----------------------------------------------------------------------------
1491| If `a' is denormal and we are in flush-to-zero mode then set the
1492| input-denormal exception and return zero. Otherwise just return the value.
1493*----------------------------------------------------------------------------*/
e5a41ffa 1494float32 float32_squash_input_denormal(float32 a, float_status *status)
37d18660 1495{
a2f2d288 1496 if (status->flush_inputs_to_zero) {
37d18660 1497 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
ff32e16e 1498 float_raise(float_flag_input_denormal, status);
37d18660
PM
1499 return make_float32(float32_val(a) & 0x80000000);
1500 }
1501 }
1502 return a;
1503}
1504
158142c2
FB
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision value whose denormalized
| significand is `aSig'.  The normalized exponent is stored through `zExpPtr'
| and the normalized significand through `zSigPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* left shift that leaves eight zero bits above the leading one */
    const int8_t lshift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
1522
1523/*----------------------------------------------------------------------------
1524| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1525| single-precision floating-point value, returning the result. After being
1526| shifted into the proper positions, the three fields are simply added
1527| together to form the result. This means that any integer portion of `zSig'
1528| will be added into the exponent. Since a properly normalized significand
1529| will have an integer portion equal to 1, the `zExp' input should be 1 less
1530| than the desired result exponent whenever `zSig' is a complete, normalized
1531| significand.
1532*----------------------------------------------------------------------------*/
1533
0c48262d 1534static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
158142c2
FB
1535{
1536
f090c9d4 1537 return make_float32(
bb98fe42 1538 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
1539
1540}
1541
1542/*----------------------------------------------------------------------------
1543| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1544| and significand `zSig', and returns the proper single-precision floating-
1545| point value corresponding to the abstract input. Ordinarily, the abstract
1546| value is simply rounded and packed into the single-precision format, with
1547| the inexact exception raised if the abstract input cannot be represented
1548| exactly. However, if the abstract value is too large, the overflow and
1549| inexact exceptions are raised and an infinity or maximal finite value is
1550| returned. If the abstract value is too small, the input value is rounded to
1551| a subnormal number, and the underflow and inexact exceptions are raised if
1552| the abstract input cannot be represented exactly as a subnormal single-
1553| precision floating-point number.
1554| The input significand `zSig' has its binary point between bits 30
1555| and 29, which is 7 bits to the left of the usual location. This shifted
1556| significand must be normalized or smaller. If `zSig' is not normalized,
1557| `zExp' must be 0; in that case, the result returned is a subnormal number,
1558| and it must not require rounding. In the usual case that `zSig' is
1559| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1560| The handling of underflow and overflow follows the IEC/IEEE Standard for
1561| Binary Floating-Point Arithmetic.
1562*----------------------------------------------------------------------------*/
1563
0c48262d 1564static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 1565 float_status *status)
158142c2 1566{
8f506c70 1567 int8_t roundingMode;
158142c2 1568 flag roundNearestEven;
8f506c70 1569 int8_t roundIncrement, roundBits;
158142c2
FB
1570 flag isTiny;
1571
a2f2d288 1572 roundingMode = status->float_rounding_mode;
158142c2 1573 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1574 switch (roundingMode) {
1575 case float_round_nearest_even:
f9288a76 1576 case float_round_ties_away:
dc355b76
PM
1577 roundIncrement = 0x40;
1578 break;
1579 case float_round_to_zero:
1580 roundIncrement = 0;
1581 break;
1582 case float_round_up:
1583 roundIncrement = zSign ? 0 : 0x7f;
1584 break;
1585 case float_round_down:
1586 roundIncrement = zSign ? 0x7f : 0;
1587 break;
1588 default:
1589 abort();
1590 break;
158142c2
FB
1591 }
1592 roundBits = zSig & 0x7F;
bb98fe42 1593 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
1594 if ( ( 0xFD < zExp )
1595 || ( ( zExp == 0xFD )
bb98fe42 1596 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 1597 ) {
ff32e16e 1598 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 1599 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
1600 }
1601 if ( zExp < 0 ) {
a2f2d288 1602 if (status->flush_to_zero) {
ff32e16e 1603 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1604 return packFloat32(zSign, 0, 0);
1605 }
158142c2 1606 isTiny =
a2f2d288
PM
1607 (status->float_detect_tininess
1608 == float_tininess_before_rounding)
158142c2
FB
1609 || ( zExp < -1 )
1610 || ( zSig + roundIncrement < 0x80000000 );
1611 shift32RightJamming( zSig, - zExp, &zSig );
1612 zExp = 0;
1613 roundBits = zSig & 0x7F;
ff32e16e
PM
1614 if (isTiny && roundBits) {
1615 float_raise(float_flag_underflow, status);
1616 }
158142c2
FB
1617 }
1618 }
a2f2d288
PM
1619 if (roundBits) {
1620 status->float_exception_flags |= float_flag_inexact;
1621 }
158142c2
FB
1622 zSig = ( zSig + roundIncrement )>>7;
1623 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1624 if ( zSig == 0 ) zExp = 0;
1625 return packFloat32( zSign, zExp, zSig );
1626
1627}
1628
1629/*----------------------------------------------------------------------------
1630| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1631| and significand `zSig', and returns the proper single-precision floating-
1632| point value corresponding to the abstract input. This routine is just like
1633| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
1634| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1635| floating-point exponent.
1636*----------------------------------------------------------------------------*/
1637
1638static float32
0c48262d 1639 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 1640 float_status *status)
158142c2 1641{
8f506c70 1642 int8_t shiftCount;
158142c2
FB
1643
1644 shiftCount = countLeadingZeros32( zSig ) - 1;
ff32e16e
PM
1645 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
1646 status);
158142c2
FB
1647
1648}
1649
37d18660
PM
1650/*----------------------------------------------------------------------------
1651| If `a' is denormal and we are in flush-to-zero mode then set the
1652| input-denormal exception and return zero. Otherwise just return the value.
1653*----------------------------------------------------------------------------*/
e5a41ffa 1654float64 float64_squash_input_denormal(float64 a, float_status *status)
37d18660 1655{
a2f2d288 1656 if (status->flush_inputs_to_zero) {
37d18660 1657 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
ff32e16e 1658 float_raise(float_flag_input_denormal, status);
37d18660
PM
1659 return make_float64(float64_val(a) & (1ULL << 63));
1660 }
1661 }
1662 return a;
1663}
1664
158142c2
FB
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision value whose denormalized
| significand is `aSig'.  The normalized exponent is stored through `zExpPtr'
| and the normalized significand through `zSigPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* left shift that leaves eleven zero bits above the leading one */
    const int8_t lshift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
1682
1683/*----------------------------------------------------------------------------
1684| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1685| double-precision floating-point value, returning the result. After being
1686| shifted into the proper positions, the three fields are simply added
1687| together to form the result. This means that any integer portion of `zSig'
1688| will be added into the exponent. Since a properly normalized significand
1689| will have an integer portion equal to 1, the `zExp' input should be 1 less
1690| than the desired result exponent whenever `zSig' is a complete, normalized
1691| significand.
1692*----------------------------------------------------------------------------*/
1693
0c48262d 1694static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
158142c2
FB
1695{
1696
f090c9d4 1697 return make_float64(
bb98fe42 1698 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
1699
1700}
1701
1702/*----------------------------------------------------------------------------
1703| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1704| and significand `zSig', and returns the proper double-precision floating-
1705| point value corresponding to the abstract input. Ordinarily, the abstract
1706| value is simply rounded and packed into the double-precision format, with
1707| the inexact exception raised if the abstract input cannot be represented
1708| exactly. However, if the abstract value is too large, the overflow and
1709| inexact exceptions are raised and an infinity or maximal finite value is
a7d1ac78
PM
1710| returned. If the abstract value is too small, the input value is rounded to
1711| a subnormal number, and the underflow and inexact exceptions are raised if
1712| the abstract input cannot be represented exactly as a subnormal double-
158142c2
FB
1713| precision floating-point number.
1714| The input significand `zSig' has its binary point between bits 62
1715| and 61, which is 10 bits to the left of the usual location. This shifted
1716| significand must be normalized or smaller. If `zSig' is not normalized,
1717| `zExp' must be 0; in that case, the result returned is a subnormal number,
1718| and it must not require rounding. In the usual case that `zSig' is
1719| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1720| The handling of underflow and overflow follows the IEC/IEEE Standard for
1721| Binary Floating-Point Arithmetic.
1722*----------------------------------------------------------------------------*/
1723
0c48262d 1724static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 1725 float_status *status)
158142c2 1726{
8f506c70 1727 int8_t roundingMode;
158142c2 1728 flag roundNearestEven;
0c48262d 1729 int roundIncrement, roundBits;
158142c2
FB
1730 flag isTiny;
1731
a2f2d288 1732 roundingMode = status->float_rounding_mode;
158142c2 1733 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1734 switch (roundingMode) {
1735 case float_round_nearest_even:
f9288a76 1736 case float_round_ties_away:
dc355b76
PM
1737 roundIncrement = 0x200;
1738 break;
1739 case float_round_to_zero:
1740 roundIncrement = 0;
1741 break;
1742 case float_round_up:
1743 roundIncrement = zSign ? 0 : 0x3ff;
1744 break;
1745 case float_round_down:
1746 roundIncrement = zSign ? 0x3ff : 0;
1747 break;
9ee6f678
BR
1748 case float_round_to_odd:
1749 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1750 break;
dc355b76
PM
1751 default:
1752 abort();
158142c2
FB
1753 }
1754 roundBits = zSig & 0x3FF;
bb98fe42 1755 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
1756 if ( ( 0x7FD < zExp )
1757 || ( ( zExp == 0x7FD )
bb98fe42 1758 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 1759 ) {
9ee6f678
BR
1760 bool overflow_to_inf = roundingMode != float_round_to_odd &&
1761 roundIncrement != 0;
ff32e16e 1762 float_raise(float_flag_overflow | float_flag_inexact, status);
9ee6f678 1763 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
158142c2
FB
1764 }
1765 if ( zExp < 0 ) {
a2f2d288 1766 if (status->flush_to_zero) {
ff32e16e 1767 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1768 return packFloat64(zSign, 0, 0);
1769 }
158142c2 1770 isTiny =
a2f2d288
PM
1771 (status->float_detect_tininess
1772 == float_tininess_before_rounding)
158142c2
FB
1773 || ( zExp < -1 )
1774 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
1775 shift64RightJamming( zSig, - zExp, &zSig );
1776 zExp = 0;
1777 roundBits = zSig & 0x3FF;
ff32e16e
PM
1778 if (isTiny && roundBits) {
1779 float_raise(float_flag_underflow, status);
1780 }
9ee6f678
BR
1781 if (roundingMode == float_round_to_odd) {
1782 /*
1783 * For round-to-odd case, the roundIncrement depends on
1784 * zSig which just changed.
1785 */
1786 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1787 }
158142c2
FB
1788 }
1789 }
a2f2d288
PM
1790 if (roundBits) {
1791 status->float_exception_flags |= float_flag_inexact;
1792 }
158142c2
FB
1793 zSig = ( zSig + roundIncrement )>>10;
1794 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
1795 if ( zSig == 0 ) zExp = 0;
1796 return packFloat64( zSign, zExp, zSig );
1797
1798}
1799
1800/*----------------------------------------------------------------------------
1801| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1802| and significand `zSig', and returns the proper double-precision floating-
1803| point value corresponding to the abstract input. This routine is just like
1804| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
1805| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1806| floating-point exponent.
1807*----------------------------------------------------------------------------*/
1808
1809static float64
0c48262d 1810 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 1811 float_status *status)
158142c2 1812{
8f506c70 1813 int8_t shiftCount;
158142c2
FB
1814
1815 shiftCount = countLeadingZeros64( zSig ) - 1;
ff32e16e
PM
1816 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
1817 status);
158142c2
FB
1818
1819}
1820
158142c2
FB
1821/*----------------------------------------------------------------------------
1822| Returns the fraction bits of the extended double-precision floating-point
1823| value `a'.
1824*----------------------------------------------------------------------------*/
1825
a49db98d 1826static inline uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
1827{
1828
1829 return a.low;
1830
1831}
1832
1833/*----------------------------------------------------------------------------
1834| Returns the exponent bits of the extended double-precision floating-point
1835| value `a'.
1836*----------------------------------------------------------------------------*/
1837
f4014512 1838static inline int32_t extractFloatx80Exp( floatx80 a )
158142c2
FB
1839{
1840
1841 return a.high & 0x7FFF;
1842
1843}
1844
1845/*----------------------------------------------------------------------------
1846| Returns the sign bit of the extended double-precision floating-point value
1847| `a'.
1848*----------------------------------------------------------------------------*/
1849
a49db98d 1850static inline flag extractFloatx80Sign( floatx80 a )
158142c2
FB
1851{
1852
1853 return a.high>>15;
1854
1855}
1856
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision value whose
| denormalized significand is `aSig'.  The normalized exponent is stored
| through `zExpPtr' and the normalized significand through `zSigPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
{
    /* shift the leading one all the way up to bit 63 */
    const int8_t lshift = countLeadingZeros64( aSig );

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
1874
1875/*----------------------------------------------------------------------------
1876| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
1877| extended double-precision floating-point value, returning the result.
1878*----------------------------------------------------------------------------*/
1879
f4014512 1880static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
158142c2
FB
1881{
1882 floatx80 z;
1883
1884 z.low = zSig;
bb98fe42 1885 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
1886 return z;
1887
1888}
1889
1890/*----------------------------------------------------------------------------
1891| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1892| and extended significand formed by the concatenation of `zSig0' and `zSig1',
1893| and returns the proper extended double-precision floating-point value
1894| corresponding to the abstract input. Ordinarily, the abstract value is
1895| rounded and packed into the extended double-precision format, with the
1896| inexact exception raised if the abstract input cannot be represented
1897| exactly. However, if the abstract value is too large, the overflow and
1898| inexact exceptions are raised and an infinity or maximal finite value is
1899| returned. If the abstract value is too small, the input value is rounded to
1900| a subnormal number, and the underflow and inexact exceptions are raised if
1901| the abstract input cannot be represented exactly as a subnormal extended
1902| double-precision floating-point number.
1903| If `roundingPrecision' is 32 or 64, the result is rounded to the same
1904| number of bits as single or double precision, respectively. Otherwise, the
1905| result is rounded to the full precision of the extended double-precision
1906| format.
1907| The input significand must be normalized or smaller. If the input
1908| significand is not normalized, `zExp' must be 0; in that case, the result
1909| returned is a subnormal number, and it must not require rounding. The
1910| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
1911| Floating-Point Arithmetic.
1912*----------------------------------------------------------------------------*/
1913
8f506c70 1914static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
f4014512 1915 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
e5a41ffa 1916 float_status *status)
158142c2 1917{
8f506c70 1918 int8_t roundingMode;
158142c2 1919 flag roundNearestEven, increment, isTiny;
f42c2224 1920 int64_t roundIncrement, roundMask, roundBits;
158142c2 1921
a2f2d288 1922 roundingMode = status->float_rounding_mode;
158142c2
FB
1923 roundNearestEven = ( roundingMode == float_round_nearest_even );
/* Any precision other than 64 or 32 is handled by the full 80-bit path. */
1924 if ( roundingPrecision == 80 ) goto precision80;
/* Reduced precision: roundMask selects the low significand bits that are
 * discarded (11 bits for precision 64, 40 bits for precision 32) and
 * roundIncrement starts as half of the lowest kept unit. */
1925 if ( roundingPrecision == 64 ) {
1926 roundIncrement = LIT64( 0x0000000000000400 );
1927 roundMask = LIT64( 0x00000000000007FF );
1928 }
1929 else if ( roundingPrecision == 32 ) {
1930 roundIncrement = LIT64( 0x0000008000000000 );
1931 roundMask = LIT64( 0x000000FFFFFFFFFF );
1932 }
1933 else {
1934 goto precision80;
1935 }
/* Fold any nonzero bits of zSig1 into bit 0 of zSig0 as a sticky bit. */
1936 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
/* The nearest modes keep the half-unit increment; the directed modes use
 * either zero or the whole mask, depending on the sign. */
1937 switch (roundingMode) {
1938 case float_round_nearest_even:
f9288a76 1939 case float_round_ties_away:
dc355b76
PM
1940 break;
1941 case float_round_to_zero:
1942 roundIncrement = 0;
1943 break;
1944 case float_round_up:
1945 roundIncrement = zSign ? 0 : roundMask;
1946 break;
1947 case float_round_down:
1948 roundIncrement = zSign ? roundMask : 0;
1949 break;
1950 default:
1951 abort();
158142c2
FB
1952 }
1953 roundBits = zSig0 & roundMask;
/* One unsigned comparison that is true when zExp <= 0 or zExp >= 0x7FFE. */
bb98fe42 1954 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
/* Overflow: exponent too large, or at the maximum with the unsigned sum
 * zSig0 + roundIncrement wrapping around. */
1955 if ( ( 0x7FFE < zExp )
1956 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
1957 ) {
1958 goto overflow;
1959 }
1960 if ( zExp <= 0 ) {
/* With flush_to_zero set, return a signed zero and raise output_denormal. */
a2f2d288 1961 if (status->flush_to_zero) {
ff32e16e 1962 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1963 return packFloatx80(zSign, 0, 0);
1964 }
/* isTiny is evaluated on the unshifted significand, before the right
 * shift below changes zSig0. */
158142c2 1965 isTiny =
a2f2d288
PM
1966 (status->float_detect_tininess
1967 == float_tininess_before_rounding)
158142c2
FB
1968 || ( zExp < 0 )
1969 || ( zSig0 <= zSig0 + roundIncrement );
/* Shift right by 1 - zExp, keeping lost bits sticky, and recompute the
 * rounding bits from the shifted significand. */
1970 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
1971 zExp = 0;
1972 roundBits = zSig0 & roundMask;
ff32e16e
PM
1973 if (isTiny && roundBits) {
1974 float_raise(float_flag_underflow, status);
1975 }
a2f2d288
PM
1976 if (roundBits) {
1977 status->float_exception_flags |= float_flag_inexact;
1978 }
158142c2 1979 zSig0 += roundIncrement;
/* If the increment set bit 63 of the significand, the exponent field
 * becomes 1 instead of 0. */
bb98fe42 1980 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
/* On an exact tie in nearest-even mode the mask is widened by one bit so
 * that the lowest kept bit is cleared as well. */
1981 roundIncrement = roundMask + 1;
1982 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1983 roundMask |= roundIncrement;
1984 }
1985 zSig0 &= ~ roundMask;
1986 return packFloatx80( zSign, zExp, zSig0 );
1987 }
1988 }
a2f2d288
PM
1989 if (roundBits) {
1990 status->float_exception_flags |= float_flag_inexact;
1991 }
158142c2
FB
1992 zSig0 += roundIncrement;
/* The addition wrapped past bit 63: bump the exponent and set the
 * significand to 0x8000000000000000. */
1993 if ( zSig0 < roundIncrement ) {
1994 ++zExp;
1995 zSig0 = LIT64( 0x8000000000000000 );
1996 }
/* Same tie handling as in the subnormal branch above. */
1997 roundIncrement = roundMask + 1;
1998 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1999 roundMask |= roundIncrement;
2000 }
2001 zSig0 &= ~ roundMask;
2002 if ( zSig0 == 0 ) zExp = 0;
2003 return packFloatx80( zSign, zExp, zSig0 );
/* Full precision: zSig1 holds the bits below the result.  The nearest modes
 * increment when its top bit is set; the directed modes increment when it is
 * nonzero and the sign matches the rounding direction. */
2004 precision80:
dc355b76
PM
2005 switch (roundingMode) {
2006 case float_round_nearest_even:
f9288a76 2007 case float_round_ties_away:
dc355b76
PM
2008 increment = ((int64_t)zSig1 < 0);
2009 break;
2010 case float_round_to_zero:
2011 increment = 0;
2012 break;
2013 case float_round_up:
2014 increment = !zSign && zSig1;
2015 break;
2016 case float_round_down:
2017 increment = zSign && zSig1;
2018 break;
2019 default:
2020 abort();
158142c2 2021 }
/* Same range test as above: true when zExp <= 0 or zExp >= 0x7FFE. */
bb98fe42 2022 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
2023 if ( ( 0x7FFE < zExp )
2024 || ( ( zExp == 0x7FFE )
2025 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2026 && increment
2027 )
2028 ) {
/* roundMask = 0 makes ~roundMask below an all-ones significand.  The
 * reduced-precision path jumps to `overflow' with its own mask still set. */
2029 roundMask = 0;
2030 overflow:
ff32e16e 2031 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
/* Modes rounding toward zero for this sign return exponent 0x7FFE with
 * significand ~roundMask; all others return exponent 0x7FFF with
 * significand 0x8000000000000000. */
2032 if ( ( roundingMode == float_round_to_zero )
2033 || ( zSign && ( roundingMode == float_round_up ) )
2034 || ( ! zSign && ( roundingMode == float_round_down ) )
2035 ) {
2036 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2037 }
2038 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2039 }
2040 if ( zExp <= 0 ) {
/* NOTE(review): unlike the reduced-precision branch, this branch does not
 * test status->flush_to_zero -- confirm that this is intended. */
2041 isTiny =
a2f2d288
PM
2042 (status->float_detect_tininess
2043 == float_tininess_before_rounding)
158142c2
FB
2044 || ( zExp < 0 )
2045 || ! increment
2046 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2047 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2048 zExp = 0;
ff32e16e
PM
2049 if (isTiny && zSig1) {
2050 float_raise(float_flag_underflow, status);
2051 }
a2f2d288
PM
2052 if (zSig1) {
2053 status->float_exception_flags |= float_flag_inexact;
2054 }
dc355b76
PM
/* zSig1 changed in the shift, so the increment decision is taken again. */
2055 switch (roundingMode) {
2056 case float_round_nearest_even:
f9288a76 2057 case float_round_ties_away:
dc355b76
PM
2058 increment = ((int64_t)zSig1 < 0);
2059 break;
2060 case float_round_to_zero:
2061 increment = 0;
2062 break;
2063 case float_round_up:
2064 increment = !zSign && zSig1;
2065 break;
2066 case float_round_down:
2067 increment = zSign && zSig1;
2068 break;
2069 default:
2070 abort();
158142c2
FB
2071 }
2072 if ( increment ) {
2073 ++zSig0;
/* Clear bit 0 when zSig1 has no bits set below its top bit (an exact tie)
 * and the mode is nearest-even. */
2074 zSig0 &=
bb98fe42
AF
2075 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2076 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
2077 }
2078 return packFloatx80( zSign, zExp, zSig0 );
2079 }
2080 }
a2f2d288
PM
2081 if (zSig1) {
2082 status->float_exception_flags |= float_flag_inexact;
2083 }
158142c2
FB
2084 if ( increment ) {
2085 ++zSig0;
/* zSig0 wrapped to zero: bump the exponent and set the significand to
 * 0x8000000000000000. */
2086 if ( zSig0 == 0 ) {
2087 ++zExp;
2088 zSig0 = LIT64( 0x8000000000000000 );
2089 }
2090 else {
bb98fe42 2091 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
2092 }
2093 }
2094 else {
/* A zero significand with no increment is packed with exponent field 0. */
2095 if ( zSig0 == 0 ) zExp = 0;
2096 }
2097 return packFloatx80( zSign, zExp, zSig0 );
2098
2099}
2100
2101/*----------------------------------------------------------------------------
2102| Takes an abstract floating-point value having sign `zSign', exponent
2103| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2104| and returns the proper extended double-precision floating-point value
2105| corresponding to the abstract input. This routine is just like
2106| `roundAndPackFloatx80' except that the input significand does not have to be
2107| normalized.
2108*----------------------------------------------------------------------------*/
2109
8f506c70 2110static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
f4014512 2111 flag zSign, int32_t zExp,
e5a41ffa
PM
2112 uint64_t zSig0, uint64_t zSig1,
2113 float_status *status)
158142c2 2114{
8f506c70 2115 int8_t shiftCount;
158142c2
FB
2116
2117 if ( zSig0 == 0 ) {
2118 zSig0 = zSig1;
2119 zSig1 = 0;
2120 zExp -= 64;
2121 }
2122 shiftCount = countLeadingZeros64( zSig0 );
2123 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2124 zExp -= shiftCount;
ff32e16e
PM
2125 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2126 zSig0, zSig1, status);
158142c2
FB
2127
2128}
2129
158142c2
FB
2130/*----------------------------------------------------------------------------
2131| Returns the least-significant 64 fraction bits of the quadruple-precision
2132| floating-point value `a'.
2133*----------------------------------------------------------------------------*/
2134
a49db98d 2135static inline uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
2136{
2137
2138 return a.low;
2139
2140}
2141
2142/*----------------------------------------------------------------------------
2143| Returns the most-significant 48 fraction bits of the quadruple-precision
2144| floating-point value `a'.
2145*----------------------------------------------------------------------------*/
2146
a49db98d 2147static inline uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
2148{
2149
2150 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2151
2152}
2153
2154/*----------------------------------------------------------------------------
2155| Returns the exponent bits of the quadruple-precision floating-point value
2156| `a'.
2157*----------------------------------------------------------------------------*/
2158
f4014512 2159static inline int32_t extractFloat128Exp( float128 a )
158142c2
FB
2160{
2161
2162 return ( a.high>>48 ) & 0x7FFF;
2163
2164}
2165
2166/*----------------------------------------------------------------------------
2167| Returns the sign bit of the quadruple-precision floating-point value `a'.
2168*----------------------------------------------------------------------------*/
2169
a49db98d 2170static inline flag extractFloat128Sign( float128 a )
158142c2
FB
2171{
2172
2173 return a.high>>63;
2174
2175}
2176
/*----------------------------------------------------------------------------
| Normalizes the denormalized 128-bit significand `aSig0':`aSig1' of a
| subnormal quadruple-precision value.  The significand is shifted so that
| its leading set bit ends up in bit 48 of the upper word.  The upper and
| lower words of the result are stored through `zSig0Ptr' and `zSig1Ptr',
| and the matching exponent is stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr,
                                       uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading bit is in the upper word: one 128-bit left shift. */
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* Upper word is empty: the lower word supplies the whole significand. */
    shift = countLeadingZeros64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Leading bit lies above bit 48, so the word straddles both halves. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
2217
2218/*----------------------------------------------------------------------------
2219| Packs the sign `zSign', the exponent `zExp', and the significand formed
2220| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2221| floating-point value, returning the result. After being shifted into the
2222| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2223| added together to form the most significant 32 bits of the result. This
2224| means that any integer portion of `zSig0' will be added into the exponent.
2225| Since a properly normalized significand will have an integer portion equal
2226| to 1, the `zExp' input should be 1 less than the desired result exponent
2227| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2228| significand.
2229*----------------------------------------------------------------------------*/
2230
a49db98d 2231static inline float128
f4014512 2232 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
2233{
2234 float128 z;
2235
2236 z.low = zSig1;
bb98fe42 2237 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
2238 return z;
2239
2240}
2241
2242/*----------------------------------------------------------------------------
2243| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2244| and extended significand formed by the concatenation of `zSig0', `zSig1',
2245| and `zSig2', and returns the proper quadruple-precision floating-point value
2246| corresponding to the abstract input. Ordinarily, the abstract value is
2247| simply rounded and packed into the quadruple-precision format, with the
2248| inexact exception raised if the abstract input cannot be represented
2249| exactly. However, if the abstract value is too large, the overflow and
2250| inexact exceptions are raised and an infinity or maximal finite value is
2251| returned. If the abstract value is too small, the input value is rounded to
2252| a subnormal number, and the underflow and inexact exceptions are raised if
2253| the abstract input cannot be represented exactly as a subnormal quadruple-
2254| precision floating-point number.
2255| The input significand must be normalized or smaller. If the input
2256| significand is not normalized, `zExp' must be 0; in that case, the result
2257| returned is a subnormal number, and it must not require rounding. In the
2258| usual case that the input significand is normalized, `zExp' must be 1 less
2259| than the ``true'' floating-point exponent. The handling of underflow and
2260| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2261*----------------------------------------------------------------------------*/
2262
f4014512 2263static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
2264 uint64_t zSig0, uint64_t zSig1,
2265 uint64_t zSig2, float_status *status)
158142c2 2266{
8f506c70 2267 int8_t roundingMode;
158142c2
FB
2268 flag roundNearestEven, increment, isTiny;
2269
a2f2d288 2270 roundingMode = status->float_rounding_mode;
158142c2 2271 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
/* zSig2 holds the bits below the result.  The nearest modes increment when
 * its top bit is set; the directed modes increment when it is nonzero and
 * the sign matches the rounding direction. */
2272 switch (roundingMode) {
2273 case float_round_nearest_even:
f9288a76 2274 case float_round_ties_away:
dc355b76
PM
2275 increment = ((int64_t)zSig2 < 0);
2276 break;
2277 case float_round_to_zero:
2278 increment = 0;
2279 break;
2280 case float_round_up:
2281 increment = !zSign && zSig2;
2282 break;
2283 case float_round_down:
2284 increment = zSign && zSig2;
2285 break;
9ee6f678
BR
/* Round to odd: increment only when the value is inexact and bit 0 of zSig1
 * is clear, so that the increment leaves bit 0 set. */
2286 case float_round_to_odd:
2287 increment = !(zSig1 & 0x1) && zSig2;
2288 break;
dc355b76
PM
2289 default:
2290 abort();
158142c2 2291 }
/* One unsigned comparison that is true when zExp >= 0x7FFD or zExp < 0. */
bb98fe42 2292 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
/* Overflow: exponent too large, or at the maximum with an all-ones
 * significand that is about to be incremented. */
2293 if ( ( 0x7FFD < zExp )
2294 || ( ( zExp == 0x7FFD )
2295 && eq128(
2296 LIT64( 0x0001FFFFFFFFFFFF ),
2297 LIT64( 0xFFFFFFFFFFFFFFFF ),
2298 zSig0,
2299 zSig1
2300 )
2301 && increment
2302 )
2303 ) {
ff32e16e 2304 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
/* These modes return exponent 0x7FFE with an all-ones fraction; every
 * other mode returns exponent 0x7FFF with a zero fraction. */
2305 if ( ( roundingMode == float_round_to_zero )
2306 || ( zSign && ( roundingMode == float_round_up ) )
2307 || ( ! zSign && ( roundingMode == float_round_down ) )
9ee6f678 2308 || (roundingMode == float_round_to_odd)
158142c2
FB
2309 ) {
2310 return
2311 packFloat128(
2312 zSign,
2313 0x7FFE,
2314 LIT64( 0x0000FFFFFFFFFFFF ),
2315 LIT64( 0xFFFFFFFFFFFFFFFF )
2316 );
2317 }
2318 return packFloat128( zSign, 0x7FFF, 0, 0 );
2319 }
2320 if ( zExp < 0 ) {
/* With flush_to_zero set, return a signed zero and raise output_denormal. */
a2f2d288 2321 if (status->flush_to_zero) {
ff32e16e 2322 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2323 return packFloat128(zSign, 0, 0, 0);
2324 }
/* isTiny is evaluated on the unshifted significand, before the right
 * shift below changes zSig0, zSig1 and zSig2. */
158142c2 2325 isTiny =
a2f2d288
PM
2326 (status->float_detect_tininess
2327 == float_tininess_before_rounding)
158142c2
FB
2328 || ( zExp < -1 )
2329 || ! increment
2330 || lt128(
2331 zSig0,
2332 zSig1,
2333 LIT64( 0x0001FFFFFFFFFFFF ),
2334 LIT64( 0xFFFFFFFFFFFFFFFF )
2335 );
2336 shift128ExtraRightJamming(
2337 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2338 zExp = 0;
ff32e16e
PM
2339 if (isTiny && zSig2) {
2340 float_raise(float_flag_underflow, status);
2341 }
dc355b76
PM
/* zSig2 changed in the shift, so the increment decision is taken again. */
2342 switch (roundingMode) {
2343 case float_round_nearest_even:
f9288a76 2344 case float_round_ties_away:
dc355b76
PM
2345 increment = ((int64_t)zSig2 < 0);
2346 break;
2347 case float_round_to_zero:
2348 increment = 0;
2349 break;
2350 case float_round_up:
2351 increment = !zSign && zSig2;
2352 break;
2353 case float_round_down:
2354 increment = zSign && zSig2;
2355 break;
9ee6f678
BR
2356 case float_round_to_odd:
2357 increment = !(zSig1 & 0x1) && zSig2;
2358 break;
dc355b76
PM
2359 default:
2360 abort();
158142c2
FB
2361 }
2362 }
2363 }
a2f2d288
PM
2364 if (zSig2) {
2365 status->float_exception_flags |= float_flag_inexact;
2366 }
158142c2
FB
2367 if ( increment ) {
2368 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
/* Clear bit 0 when zSig2 has no bits set below its top bit (an exact tie)
 * and the mode is nearest-even. */
2369 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2370 }
2371 else {
/* A zero significand with no increment is packed with exponent field 0. */
2372 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2373 }
2374 return packFloat128( zSign, zExp, zSig0, zSig1 );
2375
2376}
2377
2378/*----------------------------------------------------------------------------
2379| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2380| and significand formed by the concatenation of `zSig0' and `zSig1', and
2381| returns the proper quadruple-precision floating-point value corresponding
2382| to the abstract input. This routine is just like `roundAndPackFloat128'
2383| except that the input significand has fewer bits and does not have to be
2384| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
2385| point exponent.
2386*----------------------------------------------------------------------------*/
2387
f4014512 2388static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
2389 uint64_t zSig0, uint64_t zSig1,
2390 float_status *status)
158142c2 2391{
8f506c70 2392 int8_t shiftCount;
bb98fe42 2393 uint64_t zSig2;
158142c2
FB
2394
2395 if ( zSig0 == 0 ) {
2396 zSig0 = zSig1;
2397 zSig1 = 0;
2398 zExp -= 64;
2399 }
2400 shiftCount = countLeadingZeros64( zSig0 ) - 15;
2401 if ( 0 <= shiftCount ) {
2402 zSig2 = 0;
2403 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2404 }
2405 else {
2406 shift128ExtraRightJamming(
2407 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
2408 }
2409 zExp -= shiftCount;
ff32e16e 2410 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
2411
2412}
2413
158142c2
FB
2414/*----------------------------------------------------------------------------
2415| Returns the result of converting the 32-bit two's complement integer `a'
2416| to the single-precision floating-point format. The conversion is performed
2417| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2418*----------------------------------------------------------------------------*/
2419
e5a41ffa 2420float32 int32_to_float32(int32_t a, float_status *status)
158142c2
FB
2421{
2422 flag zSign;
2423
f090c9d4 2424 if ( a == 0 ) return float32_zero;
bb98fe42 2425 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2 2426 zSign = ( a < 0 );
ff32e16e 2427 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
158142c2
FB
2428}
2429
2430/*----------------------------------------------------------------------------
2431| Returns the result of converting the 32-bit two's complement integer `a'
2432| to the double-precision floating-point format. The conversion is performed
2433| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2434*----------------------------------------------------------------------------*/
2435
e5a41ffa 2436float64 int32_to_float64(int32_t a, float_status *status)
158142c2
FB
2437{
2438 flag zSign;
3a87d009 2439 uint32_t absA;
8f506c70 2440 int8_t shiftCount;
bb98fe42 2441 uint64_t zSig;
158142c2 2442
f090c9d4 2443 if ( a == 0 ) return float64_zero;
158142c2
FB
2444 zSign = ( a < 0 );
2445 absA = zSign ? - a : a;
2446 shiftCount = countLeadingZeros32( absA ) + 21;
2447 zSig = absA;
2448 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
2449
2450}
2451
158142c2
FB
2452/*----------------------------------------------------------------------------
2453| Returns the result of converting the 32-bit two's complement integer `a'
2454| to the extended double-precision floating-point format. The conversion
2455| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2456| Arithmetic.
2457*----------------------------------------------------------------------------*/
2458
e5a41ffa 2459floatx80 int32_to_floatx80(int32_t a, float_status *status)
158142c2
FB
2460{
2461 flag zSign;
3a87d009 2462 uint32_t absA;
8f506c70 2463 int8_t shiftCount;
bb98fe42 2464 uint64_t zSig;
158142c2
FB
2465
2466 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2467 zSign = ( a < 0 );
2468 absA = zSign ? - a : a;
2469 shiftCount = countLeadingZeros32( absA ) + 32;
2470 zSig = absA;
2471 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
2472
2473}
2474
158142c2
FB
2475/*----------------------------------------------------------------------------
2476| Returns the result of converting the 32-bit two's complement integer `a' to
2477| the quadruple-precision floating-point format. The conversion is performed
2478| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2479*----------------------------------------------------------------------------*/
2480
e5a41ffa 2481float128 int32_to_float128(int32_t a, float_status *status)
158142c2
FB
2482{
2483 flag zSign;
3a87d009 2484 uint32_t absA;
8f506c70 2485 int8_t shiftCount;
bb98fe42 2486 uint64_t zSig0;
158142c2
FB
2487
2488 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2489 zSign = ( a < 0 );
2490 absA = zSign ? - a : a;
2491 shiftCount = countLeadingZeros32( absA ) + 17;
2492 zSig0 = absA;
2493 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
2494
2495}
2496
158142c2
FB
2497/*----------------------------------------------------------------------------
2498| Returns the result of converting the 64-bit two's complement integer `a'
2499| to the single-precision floating-point format. The conversion is performed
2500| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2501*----------------------------------------------------------------------------*/
2502
e5a41ffa 2503float32 int64_to_float32(int64_t a, float_status *status)
158142c2
FB
2504{
2505 flag zSign;
182f42fd 2506 uint64_t absA;
8f506c70 2507 int8_t shiftCount;
158142c2 2508
f090c9d4 2509 if ( a == 0 ) return float32_zero;
158142c2
FB
2510 zSign = ( a < 0 );
2511 absA = zSign ? - a : a;
2512 shiftCount = countLeadingZeros64( absA ) - 40;
2513 if ( 0 <= shiftCount ) {
2514 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
2515 }
2516 else {
2517 shiftCount += 7;
2518 if ( shiftCount < 0 ) {
2519 shift64RightJamming( absA, - shiftCount, &absA );
2520 }
2521 else {
2522 absA <<= shiftCount;
2523 }
ff32e16e 2524 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
158142c2
FB
2525 }
2526
2527}
2528
2529/*----------------------------------------------------------------------------
2530| Returns the result of converting the 64-bit two's complement integer `a'
2531| to the double-precision floating-point format. The conversion is performed
2532| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2533*----------------------------------------------------------------------------*/
2534
e5a41ffa 2535float64 int64_to_float64(int64_t a, float_status *status)
158142c2
FB
2536{
2537 flag zSign;
2538
f090c9d4 2539 if ( a == 0 ) return float64_zero;
bb98fe42 2540 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
2541 return packFloat64( 1, 0x43E, 0 );
2542 }
2543 zSign = ( a < 0 );
ff32e16e 2544 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
158142c2
FB
2545}
2546
158142c2
FB
2547/*----------------------------------------------------------------------------
2548| Returns the result of converting the 64-bit two's complement integer `a'
2549| to the extended double-precision floating-point format. The conversion
2550| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2551| Arithmetic.
2552*----------------------------------------------------------------------------*/
2553
e5a41ffa 2554floatx80 int64_to_floatx80(int64_t a, float_status *status)
158142c2
FB
2555{
2556 flag zSign;
182f42fd 2557 uint64_t absA;
8f506c70 2558 int8_t shiftCount;
158142c2
FB
2559
2560 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2561 zSign = ( a < 0 );
2562 absA = zSign ? - a : a;
2563 shiftCount = countLeadingZeros64( absA );
2564 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
2565
2566}
2567
158142c2
FB
2568/*----------------------------------------------------------------------------
2569| Returns the result of converting the 64-bit two's complement integer `a' to
2570| the quadruple-precision floating-point format. The conversion is performed
2571| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2572*----------------------------------------------------------------------------*/
2573
e5a41ffa 2574float128 int64_to_float128(int64_t a, float_status *status)
158142c2
FB
2575{
2576 flag zSign;
182f42fd 2577 uint64_t absA;
8f506c70 2578 int8_t shiftCount;
f4014512 2579 int32_t zExp;
bb98fe42 2580 uint64_t zSig0, zSig1;
158142c2
FB
2581
2582 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2583 zSign = ( a < 0 );
2584 absA = zSign ? - a : a;
2585 shiftCount = countLeadingZeros64( absA ) + 49;
2586 zExp = 0x406E - shiftCount;
2587 if ( 64 <= shiftCount ) {
2588 zSig1 = 0;
2589 zSig0 = absA;
2590 shiftCount -= 64;
2591 }
2592 else {
2593 zSig1 = absA;
2594 zSig0 = 0;
2595 }
2596 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2597 return packFloat128( zSign, zExp, zSig0, zSig1 );
2598
2599}
2600
6bb8e0f1
PM
2601/*----------------------------------------------------------------------------
2602| Returns the result of converting the 64-bit unsigned integer `a'
2603| to the single-precision floating-point format. The conversion is performed
2604| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2605*----------------------------------------------------------------------------*/
2606
e5a41ffa 2607float32 uint64_to_float32(uint64_t a, float_status *status)
6bb8e0f1
PM
2608{
2609 int shiftcount;
2610
2611 if (a == 0) {
2612 return float32_zero;
2613 }
2614
2615 /* Determine (left) shift needed to put first set bit into bit posn 23
2616 * (since packFloat32() expects the binary point between bits 23 and 22);
2617 * this is the fast case for smallish numbers.
2618 */
2619 shiftcount = countLeadingZeros64(a) - 40;
2620 if (shiftcount >= 0) {
2621 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
2622 }
2623 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
2624 * expects the binary point between bits 30 and 29, hence the + 7.
2625 */
2626 shiftcount += 7;
2627 if (shiftcount < 0) {
2628 shift64RightJamming(a, -shiftcount, &a);
2629 } else {
2630 a <<= shiftcount;
2631 }
2632
ff32e16e 2633 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
6bb8e0f1
PM
2634}
2635
2636/*----------------------------------------------------------------------------
2637| Returns the result of converting the 64-bit unsigned integer `a'
2638| to the double-precision floating-point format. The conversion is performed
2639| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2640*----------------------------------------------------------------------------*/
2641
e5a41ffa 2642float64 uint64_to_float64(uint64_t a, float_status *status)
6bb8e0f1
PM
2643{
2644 int exp = 0x43C;
2645 int shiftcount;
2646
2647 if (a == 0) {
2648 return float64_zero;
2649 }
2650
2651 shiftcount = countLeadingZeros64(a) - 1;
2652 if (shiftcount < 0) {
2653 shift64RightJamming(a, -shiftcount, &a);
2654 } else {
2655 a <<= shiftcount;
2656 }
ff32e16e 2657 return roundAndPackFloat64(0, exp - shiftcount, a, status);
6bb8e0f1
PM
2658}
2659
2660/*----------------------------------------------------------------------------
2661| Returns the result of converting the 64-bit unsigned integer `a'
2662| to the quadruple-precision floating-point format. The conversion is performed
2663| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2664*----------------------------------------------------------------------------*/
2665
e5a41ffa 2666float128 uint64_to_float128(uint64_t a, float_status *status)
1e397ead
RH
2667{
2668 if (a == 0) {
2669 return float128_zero;
2670 }
ff32e16e 2671 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1e397ead
RH
2672}
2673
158142c2
FB
2674/*----------------------------------------------------------------------------
2675| Returns the result of converting the single-precision floating-point value
2676| `a' to the 32-bit two's complement integer format. The conversion is
2677| performed according to the IEC/IEEE Standard for Binary Floating-Point
2678| Arithmetic---which means in particular that the conversion is rounded
2679| according to the current rounding mode. If `a' is a NaN, the largest
2680| positive integer is returned. Otherwise, if the conversion overflows, the
2681| largest integer with the same sign as `a' is returned.
2682*----------------------------------------------------------------------------*/
2683
f4014512 2684int32_t float32_to_int32(float32 a, float_status *status)
158142c2
FB
2685{
2686 flag aSign;
0c48262d 2687 int aExp;
07d792d2 2688 int shiftCount;
bb98fe42
AF
2689 uint32_t aSig;
2690 uint64_t aSig64;
158142c2 2691
ff32e16e 2692 a = float32_squash_input_denormal(a, status);
158142c2
FB
2693 aSig = extractFloat32Frac( a );
2694 aExp = extractFloat32Exp( a );
2695 aSign = extractFloat32Sign( a );
2696 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
2697 if ( aExp ) aSig |= 0x00800000;
2698 shiftCount = 0xAF - aExp;
2699 aSig64 = aSig;
2700 aSig64 <<= 32;
2701 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
ff32e16e 2702 return roundAndPackInt32(aSign, aSig64, status);
158142c2
FB
2703
2704}
2705
2706/*----------------------------------------------------------------------------
2707| Returns the result of converting the single-precision floating-point value
2708| `a' to the 32-bit two's complement integer format. The conversion is
2709| performed according to the IEC/IEEE Standard for Binary Floating-Point
2710| Arithmetic, except that the conversion is always rounded toward zero.
2711| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2712| the conversion overflows, the largest integer with the same sign as `a' is
2713| returned.
2714*----------------------------------------------------------------------------*/
2715
f4014512 2716int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
158142c2
FB
2717{
2718 flag aSign;
0c48262d 2719 int aExp;
07d792d2 2720 int shiftCount;
bb98fe42 2721 uint32_t aSig;
b3a6a2e0 2722 int32_t z;
ff32e16e 2723 a = float32_squash_input_denormal(a, status);
158142c2
FB
2724
2725 aSig = extractFloat32Frac( a );
2726 aExp = extractFloat32Exp( a );
2727 aSign = extractFloat32Sign( a );
2728 shiftCount = aExp - 0x9E;
2729 if ( 0 <= shiftCount ) {
f090c9d4 2730 if ( float32_val(a) != 0xCF000000 ) {
ff32e16e 2731 float_raise(float_flag_invalid, status);
158142c2
FB
2732 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
2733 }
bb98fe42 2734 return (int32_t) 0x80000000;
158142c2
FB
2735 }
2736 else if ( aExp <= 0x7E ) {
a2f2d288
PM
2737 if (aExp | aSig) {
2738 status->float_exception_flags |= float_flag_inexact;
2739 }
158142c2
FB
2740 return 0;
2741 }
2742 aSig = ( aSig | 0x00800000 )<<8;
2743 z = aSig>>( - shiftCount );
bb98fe42 2744 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
a2f2d288 2745 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
2746 }
2747 if ( aSign ) z = - z;
2748 return z;
2749
2750}
2751
cbcef455
PM
2752/*----------------------------------------------------------------------------
2753| Returns the result of converting the single-precision floating-point value
2754| `a' to the 16-bit two's complement integer format. The conversion is
2755| performed according to the IEC/IEEE Standard for Binary Floating-Point
2756| Arithmetic, except that the conversion is always rounded toward zero.
2757| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2758| the conversion overflows, the largest integer with the same sign as `a' is
2759| returned.
2760*----------------------------------------------------------------------------*/
2761
0bb721d7 2762int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
2763{
2764 flag aSign;
0c48262d 2765 int aExp;
07d792d2 2766 int shiftCount;
bb98fe42 2767 uint32_t aSig;
f4014512 2768 int32_t z;
cbcef455
PM
2769
2770 aSig = extractFloat32Frac( a );
2771 aExp = extractFloat32Exp( a );
2772 aSign = extractFloat32Sign( a );
2773 shiftCount = aExp - 0x8E;
2774 if ( 0 <= shiftCount ) {
2775 if ( float32_val(a) != 0xC7000000 ) {
ff32e16e 2776 float_raise(float_flag_invalid, status);
cbcef455
PM
2777 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2778 return 0x7FFF;
2779 }
2780 }
bb98fe42 2781 return (int32_t) 0xffff8000;
cbcef455
PM
2782 }
2783 else if ( aExp <= 0x7E ) {
2784 if ( aExp | aSig ) {
a2f2d288 2785 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
2786 }
2787 return 0;
2788 }
2789 shiftCount -= 0x10;
2790 aSig = ( aSig | 0x00800000 )<<8;
2791 z = aSig>>( - shiftCount );
bb98fe42 2792 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
a2f2d288 2793 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
2794 }
2795 if ( aSign ) {
2796 z = - z;
2797 }
2798 return z;
2799
2800}
2801
158142c2
FB
2802/*----------------------------------------------------------------------------
2803| Returns the result of converting the single-precision floating-point value
2804| `a' to the 64-bit two's complement integer format. The conversion is
2805| performed according to the IEC/IEEE Standard for Binary Floating-Point
2806| Arithmetic---which means in particular that the conversion is rounded
2807| according to the current rounding mode. If `a' is a NaN, the largest
2808| positive integer is returned. Otherwise, if the conversion overflows, the
2809| largest integer with the same sign as `a' is returned.
2810*----------------------------------------------------------------------------*/
2811
f42c2224 2812int64_t float32_to_int64(float32 a, float_status *status)
158142c2
FB
2813{
2814 flag aSign;
0c48262d 2815 int aExp;
07d792d2 2816 int shiftCount;
bb98fe42
AF
2817 uint32_t aSig;
2818 uint64_t aSig64, aSigExtra;
ff32e16e 2819 a = float32_squash_input_denormal(a, status);
158142c2
FB
2820
2821 aSig = extractFloat32Frac( a );
2822 aExp = extractFloat32Exp( a );
2823 aSign = extractFloat32Sign( a );
2824 shiftCount = 0xBE - aExp;
2825 if ( shiftCount < 0 ) {
ff32e16e 2826 float_raise(float_flag_invalid, status);
158142c2
FB
2827 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2828 return LIT64( 0x7FFFFFFFFFFFFFFF );
2829 }
bb98fe42 2830 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2831 }
2832 if ( aExp ) aSig |= 0x00800000;
2833 aSig64 = aSig;
2834 aSig64 <<= 40;
2835 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
ff32e16e 2836 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
158142c2
FB
2837
2838}
2839
2f18bbf9
TM
2840/*----------------------------------------------------------------------------
2841| Returns the result of converting the single-precision floating-point value
2842| `a' to the 64-bit unsigned integer format. The conversion is
2843| performed according to the IEC/IEEE Standard for Binary Floating-Point
2844| Arithmetic---which means in particular that the conversion is rounded
2845| according to the current rounding mode. If `a' is a NaN, the largest
2846| unsigned integer is returned. Otherwise, if the conversion overflows, the
2847| largest unsigned integer is returned. If `a' is negative, the result
2848| is rounded and zero is returned; values that do not round to zero will
2849| raise the inexact exception flag.
2850*----------------------------------------------------------------------------*/
2851
182f42fd 2852uint64_t float32_to_uint64(float32 a, float_status *status)
2f18bbf9
TM
2853{
2854 flag aSign;
0c48262d 2855 int aExp;
07d792d2 2856 int shiftCount;
2f18bbf9
TM
2857 uint32_t aSig;
2858 uint64_t aSig64, aSigExtra;
ff32e16e 2859 a = float32_squash_input_denormal(a, status);
2f18bbf9
TM
2860
2861 aSig = extractFloat32Frac(a);
2862 aExp = extractFloat32Exp(a);
2863 aSign = extractFloat32Sign(a);
2864 if ((aSign) && (aExp > 126)) {
ff32e16e 2865 float_raise(float_flag_invalid, status);
2f18bbf9
TM
2866 if (float32_is_any_nan(a)) {
2867 return LIT64(0xFFFFFFFFFFFFFFFF);
2868 } else {
2869 return 0;
2870 }
2871 }
2872 shiftCount = 0xBE - aExp;
2873 if (aExp) {
2874 aSig |= 0x00800000;
2875 }
2876 if (shiftCount < 0) {
ff32e16e 2877 float_raise(float_flag_invalid, status);
2f18bbf9
TM
2878 return LIT64(0xFFFFFFFFFFFFFFFF);
2879 }
2880
2881 aSig64 = aSig;
2882 aSig64 <<= 40;
2883 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
ff32e16e 2884 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
2f18bbf9
TM
2885}
2886
a13d4489
TM
2887/*----------------------------------------------------------------------------
2888| Returns the result of converting the single-precision floating-point value
2889| `a' to the 64-bit unsigned integer format. The conversion is
2890| performed according to the IEC/IEEE Standard for Binary Floating-Point
2891| Arithmetic, except that the conversion is always rounded toward zero. If
2892| `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
2893| conversion overflows, the largest unsigned integer is returned. If `a'
2894| is negative, the result is rounded and zero is returned; values that do
2895| not round to zero will raise the inexact flag.
2896*----------------------------------------------------------------------------*/
2897
182f42fd 2898uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
a13d4489 2899{
a2f2d288 2900 signed char current_rounding_mode = status->float_rounding_mode;
ff32e16e
PM
2901 set_float_rounding_mode(float_round_to_zero, status);
2902 int64_t v = float32_to_uint64(a, status);
2903 set_float_rounding_mode(current_rounding_mode, status);
a13d4489
TM
2904 return v;
2905}
2906
158142c2
FB
2907/*----------------------------------------------------------------------------
2908| Returns the result of converting the single-precision floating-point value
2909| `a' to the 64-bit two's complement integer format. The conversion is
2910| performed according to the IEC/IEEE Standard for Binary Floating-Point
2911| Arithmetic, except that the conversion is always rounded toward zero. If
2912| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
2913| conversion overflows, the largest integer with the same sign as `a' is
2914| returned.
2915*----------------------------------------------------------------------------*/
2916
f42c2224 2917int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
158142c2
FB
2918{
2919 flag aSign;
0c48262d 2920 int aExp;
07d792d2 2921 int shiftCount;
bb98fe42
AF
2922 uint32_t aSig;
2923 uint64_t aSig64;
f42c2224 2924 int64_t z;
ff32e16e 2925 a = float32_squash_input_denormal(a, status);
158142c2
FB
2926
2927 aSig = extractFloat32Frac( a );
2928 aExp = extractFloat32Exp( a );
2929 aSign = extractFloat32Sign( a );
2930 shiftCount = aExp - 0xBE;
2931 if ( 0 <= shiftCount ) {
f090c9d4 2932 if ( float32_val(a) != 0xDF000000 ) {
ff32e16e 2933 float_raise(float_flag_invalid, status);
158142c2
FB
2934 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2935 return LIT64( 0x7FFFFFFFFFFFFFFF );
2936 }
2937 }
bb98fe42 2938 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2939 }
2940 else if ( aExp <= 0x7E ) {
a2f2d288
PM
2941 if (aExp | aSig) {
2942 status->float_exception_flags |= float_flag_inexact;
2943 }
158142c2
FB
2944 return 0;
2945 }
2946 aSig64 = aSig | 0x00800000;
2947 aSig64 <<= 40;
2948 z = aSig64>>( - shiftCount );
bb98fe42 2949 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
a2f2d288 2950 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
2951 }
2952 if ( aSign ) z = - z;
2953 return z;
2954
2955}
2956
2957/*----------------------------------------------------------------------------
2958| Returns the result of converting the single-precision floating-point value
2959| `a' to the double-precision floating-point format. The conversion is
2960| performed according to the IEC/IEEE Standard for Binary Floating-Point
2961| Arithmetic.
2962*----------------------------------------------------------------------------*/
2963
e5a41ffa 2964float64 float32_to_float64(float32 a, float_status *status)
158142c2
FB
2965{
2966 flag aSign;
0c48262d 2967 int aExp;
bb98fe42 2968 uint32_t aSig;
ff32e16e 2969 a = float32_squash_input_denormal(a, status);
158142c2
FB
2970
2971 aSig = extractFloat32Frac( a );
2972 aExp = extractFloat32Exp( a );
2973 aSign = extractFloat32Sign( a );
2974 if ( aExp == 0xFF ) {
ff32e16e
PM
2975 if (aSig) {
2976 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
2977 }
158142c2
FB
2978 return packFloat64( aSign, 0x7FF, 0 );
2979 }
2980 if ( aExp == 0 ) {
2981 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
2982 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2983 --aExp;
2984 }
bb98fe42 2985 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
2986
2987}
2988
158142c2
FB
2989/*----------------------------------------------------------------------------
2990| Returns the result of converting the single-precision floating-point value
2991| `a' to the extended double-precision floating-point format. The conversion
2992| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2993| Arithmetic.
2994*----------------------------------------------------------------------------*/
2995
e5a41ffa 2996floatx80 float32_to_floatx80(float32 a, float_status *status)
158142c2
FB
2997{
2998 flag aSign;
0c48262d 2999 int aExp;
bb98fe42 3000 uint32_t aSig;
158142c2 3001
ff32e16e 3002 a = float32_squash_input_denormal(a, status);
158142c2
FB
3003 aSig = extractFloat32Frac( a );
3004 aExp = extractFloat32Exp( a );
3005 aSign = extractFloat32Sign( a );
3006 if ( aExp == 0xFF ) {
ff32e16e
PM
3007 if (aSig) {
3008 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3009 }
158142c2
FB
3010 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3011 }
3012 if ( aExp == 0 ) {
3013 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3014 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3015 }
3016 aSig |= 0x00800000;
bb98fe42 3017 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
3018
3019}
3020
158142c2
FB
3021/*----------------------------------------------------------------------------
3022| Returns the result of converting the single-precision floating-point value
3023| `a' to the quadruple-precision floating-point format. The conversion is
3024| performed according to the IEC/IEEE Standard for Binary Floating-Point
3025| Arithmetic.
3026*----------------------------------------------------------------------------*/
3027
e5a41ffa 3028float128 float32_to_float128(float32 a, float_status *status)
158142c2
FB
3029{
3030 flag aSign;
0c48262d 3031 int aExp;
bb98fe42 3032 uint32_t aSig;
158142c2 3033
ff32e16e 3034 a = float32_squash_input_denormal(a, status);
158142c2
FB
3035 aSig = extractFloat32Frac( a );
3036 aExp = extractFloat32Exp( a );
3037 aSign = extractFloat32Sign( a );
3038 if ( aExp == 0xFF ) {
ff32e16e
PM
3039 if (aSig) {
3040 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3041 }
158142c2
FB
3042 return packFloat128( aSign, 0x7FFF, 0, 0 );
3043 }
3044 if ( aExp == 0 ) {
3045 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3046 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3047 --aExp;
3048 }
bb98fe42 3049 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
3050
3051}
3052
158142c2
FB
3053/*----------------------------------------------------------------------------
3054| Returns the remainder of the single-precision floating-point value `a'
3055| with respect to the corresponding value `b'. The operation is performed
3056| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3057*----------------------------------------------------------------------------*/
3058
e5a41ffa 3059float32 float32_rem(float32 a, float32 b, float_status *status)
158142c2 3060{
ed086f3d 3061 flag aSign, zSign;
0c48262d 3062 int aExp, bExp, expDiff;
bb98fe42
AF
3063 uint32_t aSig, bSig;
3064 uint32_t q;
3065 uint64_t aSig64, bSig64, q64;
3066 uint32_t alternateASig;
3067 int32_t sigMean;
ff32e16e
PM
3068 a = float32_squash_input_denormal(a, status);
3069 b = float32_squash_input_denormal(b, status);
158142c2
FB
3070
3071 aSig = extractFloat32Frac( a );
3072 aExp = extractFloat32Exp( a );
3073 aSign = extractFloat32Sign( a );
3074 bSig = extractFloat32Frac( b );
3075 bExp = extractFloat32Exp( b );
158142c2
FB
3076 if ( aExp == 0xFF ) {
3077 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 3078 return propagateFloat32NaN(a, b, status);
158142c2 3079 }
ff32e16e 3080 float_raise(float_flag_invalid, status);
af39bc8c 3081 return float32_default_nan(status);
158142c2
FB
3082 }
3083 if ( bExp == 0xFF ) {
ff32e16e
PM
3084 if (bSig) {
3085 return propagateFloat32NaN(a, b, status);
3086 }
158142c2
FB
3087 return a;
3088 }
3089 if ( bExp == 0 ) {
3090 if ( bSig == 0 ) {
ff32e16e 3091 float_raise(float_flag_invalid, status);
af39bc8c 3092 return float32_default_nan(status);
158142c2
FB
3093 }
3094 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3095 }
3096 if ( aExp == 0 ) {
3097 if ( aSig == 0 ) return a;
3098 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3099 }
3100 expDiff = aExp - bExp;
3101 aSig |= 0x00800000;
3102 bSig |= 0x00800000;
3103 if ( expDiff < 32 ) {
3104 aSig <<= 8;
3105 bSig <<= 8;
3106 if ( expDiff < 0 ) {
3107 if ( expDiff < -1 ) return a;
3108 aSig >>= 1;
3109 }
3110 q = ( bSig <= aSig );
3111 if ( q ) aSig -= bSig;
3112 if ( 0 < expDiff ) {
bb98fe42 3113 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
3114 q >>= 32 - expDiff;
3115 bSig >>= 2;
3116 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3117 }
3118 else {
3119 aSig >>= 2;
3120 bSig >>= 2;
3121 }
3122 }
3123 else {
3124 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
3125 aSig64 = ( (uint64_t) aSig )<<40;
3126 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
3127 expDiff -= 64;
3128 while ( 0 < expDiff ) {
3129 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3130 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3131 aSig64 = - ( ( bSig * q64 )<<38 );
3132 expDiff -= 62;
3133 }
3134 expDiff += 64;
3135 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3136 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3137 q = q64>>( 64 - expDiff );
3138 bSig <<= 6;
3139 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3140 }
3141 do {
3142 alternateASig = aSig;
3143 ++q;
3144 aSig -= bSig;
bb98fe42 3145 } while ( 0 <= (int32_t) aSig );
158142c2
FB
3146 sigMean = aSig + alternateASig;
3147 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3148 aSig = alternateASig;
3149 }
bb98fe42 3150 zSign = ( (int32_t) aSig < 0 );
158142c2 3151 if ( zSign ) aSig = - aSig;
ff32e16e 3152 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
3153}
3154
369be8f6 3155
158142c2
FB
3156/*----------------------------------------------------------------------------
3157| Returns the square root of the single-precision floating-point value `a'.
3158| The operation is performed according to the IEC/IEEE Standard for Binary
3159| Floating-Point Arithmetic.
3160*----------------------------------------------------------------------------*/
3161
e5a41ffa 3162float32 float32_sqrt(float32 a, float_status *status)
158142c2
FB
3163{
3164 flag aSign;
0c48262d 3165 int aExp, zExp;
bb98fe42
AF
3166 uint32_t aSig, zSig;
3167 uint64_t rem, term;
ff32e16e 3168 a = float32_squash_input_denormal(a, status);
158142c2
FB
3169
3170 aSig = extractFloat32Frac( a );
3171 aExp = extractFloat32Exp( a );
3172 aSign = extractFloat32Sign( a );
3173 if ( aExp == 0xFF ) {
ff32e16e
PM
3174 if (aSig) {
3175 return propagateFloat32NaN(a, float32_zero, status);
3176 }
158142c2 3177 if ( ! aSign ) return a;
ff32e16e 3178 float_raise(float_flag_invalid, status);
af39bc8c 3179 return float32_default_nan(status);
158142c2
FB
3180 }
3181 if ( aSign ) {
3182 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 3183 float_raise(float_flag_invalid, status);
af39bc8c 3184 return float32_default_nan(status);
158142c2
FB
3185 }
3186 if ( aExp == 0 ) {
f090c9d4 3187 if ( aSig == 0 ) return float32_zero;
158142c2
FB
3188 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3189 }
3190 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3191 aSig = ( aSig | 0x00800000 )<<8;
3192 zSig = estimateSqrt32( aExp, aSig ) + 2;
3193 if ( ( zSig & 0x7F ) <= 5 ) {
3194 if ( zSig < 2 ) {
3195 zSig = 0x7FFFFFFF;
3196 goto roundAndPack;
3197 }
3198 aSig >>= aExp & 1;
bb98fe42
AF
3199 term = ( (uint64_t) zSig ) * zSig;
3200 rem = ( ( (uint64_t) aSig )<<32 ) - term;
3201 while ( (int64_t) rem < 0 ) {
158142c2 3202 --zSig;
bb98fe42 3203 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
3204 }
3205 zSig |= ( rem != 0 );
3206 }
3207 shift32RightJamming( zSig, 1, &zSig );
3208 roundAndPack:
ff32e16e 3209 return roundAndPackFloat32(0, zExp, zSig, status);
158142c2
FB
3210
3211}
3212
8229c991
AJ
3213/*----------------------------------------------------------------------------
3214| Returns the binary exponential of the single-precision floating-point value
3215| `a'. The operation is performed according to the IEC/IEEE Standard for
3216| Binary Floating-Point Arithmetic.
3217|
3218| Uses the following identities:
3219|
3220| 1. -------------------------------------------------------------------------
3221| x x*ln(2)
3222| 2 = e
3223|
3224| 2. -------------------------------------------------------------------------
3225| 2 3 4 5 n
3226| x x x x x x x
3227| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3228| 1! 2! 3! 4! 5! n!
3229*----------------------------------------------------------------------------*/
3230
3231static const float64 float32_exp2_coefficients[15] =
3232{
d5138cf4
PM
3233 const_float64( 0x3ff0000000000000ll ), /* 1 */
3234 const_float64( 0x3fe0000000000000ll ), /* 2 */
3235 const_float64( 0x3fc5555555555555ll ), /* 3 */
3236 const_float64( 0x3fa5555555555555ll ), /* 4 */
3237 const_float64( 0x3f81111111111111ll ), /* 5 */
3238 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3239 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3240 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3241 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3242 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3243 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3244 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3245 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3246 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3247 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
3248};
3249
e5a41ffa 3250float32 float32_exp2(float32 a, float_status *status)
8229c991
AJ
3251{
3252 flag aSign;
0c48262d 3253 int aExp;
bb98fe42 3254 uint32_t aSig;
8229c991
AJ
3255 float64 r, x, xn;
3256 int i;
ff32e16e 3257 a = float32_squash_input_denormal(a, status);
8229c991
AJ
3258
3259 aSig = extractFloat32Frac( a );
3260 aExp = extractFloat32Exp( a );
3261 aSign = extractFloat32Sign( a );
3262
3263 if ( aExp == 0xFF) {
ff32e16e
PM
3264 if (aSig) {
3265 return propagateFloat32NaN(a, float32_zero, status);
3266 }
8229c991
AJ
3267 return (aSign) ? float32_zero : a;
3268 }
3269 if (aExp == 0) {
3270 if (aSig == 0) return float32_one;
3271 }
3272
ff32e16e 3273 float_raise(float_flag_inexact, status);
8229c991
AJ
3274
3275 /* ******************************* */
3276 /* using float64 for approximation */
3277 /* ******************************* */
ff32e16e
PM
3278 x = float32_to_float64(a, status);
3279 x = float64_mul(x, float64_ln2, status);
8229c991
AJ
3280
3281 xn = x;
3282 r = float64_one;
3283 for (i = 0 ; i < 15 ; i++) {
3284 float64 f;
3285
ff32e16e
PM
3286 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3287 r = float64_add(r, f, status);
8229c991 3288
ff32e16e 3289 xn = float64_mul(xn, x, status);
8229c991
AJ
3290 }
3291
3292 return float64_to_float32(r, status);
3293}
3294
374dfc33
AJ
3295/*----------------------------------------------------------------------------
3296| Returns the binary log of the single-precision floating-point value `a'.
3297| The operation is performed according to the IEC/IEEE Standard for Binary
3298| Floating-Point Arithmetic.
3299*----------------------------------------------------------------------------*/
e5a41ffa 3300float32 float32_log2(float32 a, float_status *status)
374dfc33
AJ
3301{
3302 flag aSign, zSign;
0c48262d 3303 int aExp;
bb98fe42 3304 uint32_t aSig, zSig, i;
374dfc33 3305
ff32e16e 3306 a = float32_squash_input_denormal(a, status);
374dfc33
AJ
3307 aSig = extractFloat32Frac( a );
3308 aExp = extractFloat32Exp( a );
3309 aSign = extractFloat32Sign( a );
3310
3311 if ( aExp == 0 ) {
3312 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3313 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3314 }
3315 if ( aSign ) {
ff32e16e 3316 float_raise(float_flag_invalid, status);
af39bc8c 3317 return float32_default_nan(status);
374dfc33
AJ
3318 }
3319 if ( aExp == 0xFF ) {
ff32e16e
PM
3320 if (aSig) {
3321 return propagateFloat32NaN(a, float32_zero, status);
3322 }
374dfc33
AJ
3323 return a;
3324 }
3325
3326 aExp -= 0x7F;
3327 aSig |= 0x00800000;
3328 zSign = aExp < 0;
3329 zSig = aExp << 23;
3330
3331 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 3332 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
3333 if ( aSig & 0x01000000 ) {
3334 aSig >>= 1;
3335 zSig |= i;
3336 }
3337 }
3338
3339 if ( zSign )
3340 zSig = -zSig;
3341
ff32e16e 3342 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
374dfc33
AJ
3343}
3344
158142c2
FB
3345/*----------------------------------------------------------------------------
3346| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
3347| the corresponding value `b', and 0 otherwise. The invalid exception is
3348| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
3349| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3350*----------------------------------------------------------------------------*/
3351
e5a41ffa 3352int float32_eq(float32 a, float32 b, float_status *status)
158142c2 3353{
b689362d 3354 uint32_t av, bv;
ff32e16e
PM
3355 a = float32_squash_input_denormal(a, status);
3356 b = float32_squash_input_denormal(b, status);
158142c2
FB
3357
3358 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3359 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3360 ) {
ff32e16e 3361 float_raise(float_flag_invalid, status);
158142c2
FB
3362 return 0;
3363 }
b689362d
AJ
3364 av = float32_val(a);
3365 bv = float32_val(b);
3366 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
3367}
3368
3369/*----------------------------------------------------------------------------
3370| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
3371| or equal to the corresponding value `b', and 0 otherwise. The invalid
3372| exception is raised if either operand is a NaN. The comparison is performed
3373| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3374*----------------------------------------------------------------------------*/
3375
e5a41ffa 3376int float32_le(float32 a, float32 b, float_status *status)
158142c2
FB
3377{
3378 flag aSign, bSign;
bb98fe42 3379 uint32_t av, bv;
ff32e16e
PM
3380 a = float32_squash_input_denormal(a, status);
3381 b = float32_squash_input_denormal(b, status);
158142c2
FB
3382
3383 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3384 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3385 ) {
ff32e16e 3386 float_raise(float_flag_invalid, status);
158142c2
FB
3387 return 0;
3388 }
3389 aSign = extractFloat32Sign( a );
3390 bSign = extractFloat32Sign( b );
f090c9d4
PB
3391 av = float32_val(a);
3392 bv = float32_val(b);
bb98fe42 3393 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3394 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3395
3396}
3397
3398/*----------------------------------------------------------------------------
3399| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
3400| the corresponding value `b', and 0 otherwise. The invalid exception is
3401| raised if either operand is a NaN. The comparison is performed according
3402| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3403*----------------------------------------------------------------------------*/
3404
e5a41ffa 3405int float32_lt(float32 a, float32 b, float_status *status)
158142c2
FB
3406{
3407 flag aSign, bSign;
bb98fe42 3408 uint32_t av, bv;
ff32e16e
PM
3409 a = float32_squash_input_denormal(a, status);
3410 b = float32_squash_input_denormal(b, status);
158142c2
FB
3411
3412 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3413 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3414 ) {
ff32e16e 3415 float_raise(float_flag_invalid, status);
158142c2
FB
3416 return 0;
3417 }
3418 aSign = extractFloat32Sign( a );
3419 bSign = extractFloat32Sign( b );
f090c9d4
PB
3420 av = float32_val(a);
3421 bv = float32_val(b);
bb98fe42 3422 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3423 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3424
3425}
3426
67b7861d
AJ
3427/*----------------------------------------------------------------------------
3428| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
3429| be compared, and 0 otherwise. The invalid exception is raised if either
3430| operand is a NaN. The comparison is performed according to the IEC/IEEE
3431| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
3432*----------------------------------------------------------------------------*/
3433
e5a41ffa 3434int float32_unordered(float32 a, float32 b, float_status *status)
67b7861d 3435{
ff32e16e
PM
3436 a = float32_squash_input_denormal(a, status);
3437 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
3438
3439 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3440 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3441 ) {
ff32e16e 3442 float_raise(float_flag_invalid, status);
67b7861d
AJ
3443 return 1;
3444 }
3445 return 0;
3446}
b689362d 3447
158142c2
FB
3448/*----------------------------------------------------------------------------
3449| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
3450| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3451| exception. The comparison is performed according to the IEC/IEEE Standard
3452| for Binary Floating-Point Arithmetic.
158142c2
FB
3453*----------------------------------------------------------------------------*/
3454
e5a41ffa 3455int float32_eq_quiet(float32 a, float32 b, float_status *status)
158142c2 3456{
ff32e16e
PM
3457 a = float32_squash_input_denormal(a, status);
3458 b = float32_squash_input_denormal(b, status);
158142c2
FB
3459
3460 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3461 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3462 ) {
af39bc8c
AM
3463 if (float32_is_signaling_nan(a, status)
3464 || float32_is_signaling_nan(b, status)) {
ff32e16e 3465 float_raise(float_flag_invalid, status);
b689362d 3466 }
158142c2
FB
3467 return 0;
3468 }
b689362d
AJ
3469 return ( float32_val(a) == float32_val(b) ) ||
3470 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
3471}
3472
3473/*----------------------------------------------------------------------------
3474| Returns 1 if the single-precision floating-point value `a' is less than or
3475| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3476| cause an exception. Otherwise, the comparison is performed according to the
3477| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3478*----------------------------------------------------------------------------*/
3479
e5a41ffa 3480int float32_le_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
3481{
3482 flag aSign, bSign;
bb98fe42 3483 uint32_t av, bv;
ff32e16e
PM
3484 a = float32_squash_input_denormal(a, status);
3485 b = float32_squash_input_denormal(b, status);
158142c2
FB
3486
3487 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3488 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3489 ) {
af39bc8c
AM
3490 if (float32_is_signaling_nan(a, status)
3491 || float32_is_signaling_nan(b, status)) {
ff32e16e 3492 float_raise(float_flag_invalid, status);
158142c2
FB
3493 }
3494 return 0;
3495 }
3496 aSign = extractFloat32Sign( a );
3497 bSign = extractFloat32Sign( b );
f090c9d4
PB
3498 av = float32_val(a);
3499 bv = float32_val(b);
bb98fe42 3500 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3501 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3502
3503}
3504
3505/*----------------------------------------------------------------------------
3506| Returns 1 if the single-precision floating-point value `a' is less than
3507| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3508| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3509| Standard for Binary Floating-Point Arithmetic.
3510*----------------------------------------------------------------------------*/
3511
e5a41ffa 3512int float32_lt_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
3513{
3514 flag aSign, bSign;
bb98fe42 3515 uint32_t av, bv;
ff32e16e
PM
3516 a = float32_squash_input_denormal(a, status);
3517 b = float32_squash_input_denormal(b, status);
158142c2
FB
3518
3519 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3520 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3521 ) {
af39bc8c
AM
3522 if (float32_is_signaling_nan(a, status)
3523 || float32_is_signaling_nan(b, status)) {
ff32e16e 3524 float_raise(float_flag_invalid, status);
158142c2
FB
3525 }
3526 return 0;
3527 }
3528 aSign = extractFloat32Sign( a );
3529 bSign = extractFloat32Sign( b );
f090c9d4
PB
3530 av = float32_val(a);
3531 bv = float32_val(b);
bb98fe42 3532 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3533 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3534
3535}
3536
67b7861d
AJ
3537/*----------------------------------------------------------------------------
3538| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3539| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3540| comparison is performed according to the IEC/IEEE Standard for Binary
3541| Floating-Point Arithmetic.
3542*----------------------------------------------------------------------------*/
3543
e5a41ffa 3544int float32_unordered_quiet(float32 a, float32 b, float_status *status)
67b7861d 3545{
ff32e16e
PM
3546 a = float32_squash_input_denormal(a, status);
3547 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
3548
3549 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3550 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3551 ) {
af39bc8c
AM
3552 if (float32_is_signaling_nan(a, status)
3553 || float32_is_signaling_nan(b, status)) {
ff32e16e 3554 float_raise(float_flag_invalid, status);
67b7861d
AJ
3555 }
3556 return 1;
3557 }
3558 return 0;
3559}
3560
158142c2
FB
3561/*----------------------------------------------------------------------------
3562| Returns the result of converting the double-precision floating-point value
3563| `a' to the 32-bit two's complement integer format. The conversion is
3564| performed according to the IEC/IEEE Standard for Binary Floating-Point
3565| Arithmetic---which means in particular that the conversion is rounded
3566| according to the current rounding mode. If `a' is a NaN, the largest
3567| positive integer is returned. Otherwise, if the conversion overflows, the
3568| largest integer with the same sign as `a' is returned.
3569*----------------------------------------------------------------------------*/
3570
f4014512 3571int32_t float64_to_int32(float64 a, float_status *status)
158142c2
FB
3572{
3573 flag aSign;
0c48262d 3574 int aExp;
07d792d2 3575 int shiftCount;
bb98fe42 3576 uint64_t aSig;
ff32e16e 3577 a = float64_squash_input_denormal(a, status);
158142c2
FB
3578
3579 aSig = extractFloat64Frac( a );
3580 aExp = extractFloat64Exp( a );
3581 aSign = extractFloat64Sign( a );
3582 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3583 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3584 shiftCount = 0x42C - aExp;
3585 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 3586 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
3587
3588}
3589
3590/*----------------------------------------------------------------------------
3591| Returns the result of converting the double-precision floating-point value
3592| `a' to the 32-bit two's complement integer format. The conversion is
3593| performed according to the IEC/IEEE Standard for Binary Floating-Point
3594| Arithmetic, except that the conversion is always rounded toward zero.
3595| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3596| the conversion overflows, the largest integer with the same sign as `a' is
3597| returned.
3598*----------------------------------------------------------------------------*/
3599
f4014512 3600int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
158142c2
FB
3601{
3602 flag aSign;
0c48262d 3603 int aExp;
07d792d2 3604 int shiftCount;
bb98fe42 3605 uint64_t aSig, savedASig;
b3a6a2e0 3606 int32_t z;
ff32e16e 3607 a = float64_squash_input_denormal(a, status);
158142c2
FB
3608
3609 aSig = extractFloat64Frac( a );
3610 aExp = extractFloat64Exp( a );
3611 aSign = extractFloat64Sign( a );
3612 if ( 0x41E < aExp ) {
3613 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3614 goto invalid;
3615 }
3616 else if ( aExp < 0x3FF ) {
a2f2d288
PM
3617 if (aExp || aSig) {
3618 status->float_exception_flags |= float_flag_inexact;
3619 }
158142c2
FB
3620 return 0;
3621 }
3622 aSig |= LIT64( 0x0010000000000000 );
3623 shiftCount = 0x433 - aExp;
3624 savedASig = aSig;
3625 aSig >>= shiftCount;
3626 z = aSig;
3627 if ( aSign ) z = - z;
3628 if ( ( z < 0 ) ^ aSign ) {
3629 invalid:
ff32e16e 3630 float_raise(float_flag_invalid, status);
bb98fe42 3631 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
3632 }
3633 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 3634 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
3635 }
3636 return z;
3637
3638}
3639
cbcef455
PM
3640/*----------------------------------------------------------------------------
3641| Returns the result of converting the double-precision floating-point value
3642| `a' to the 16-bit two's complement integer format. The conversion is
3643| performed according to the IEC/IEEE Standard for Binary Floating-Point
3644| Arithmetic, except that the conversion is always rounded toward zero.
3645| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3646| the conversion overflows, the largest integer with the same sign as `a' is
3647| returned.
3648*----------------------------------------------------------------------------*/
3649
0bb721d7 3650int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
3651{
3652 flag aSign;
0c48262d 3653 int aExp;
07d792d2 3654 int shiftCount;
bb98fe42 3655 uint64_t aSig, savedASig;
f4014512 3656 int32_t z;
cbcef455
PM
3657
3658 aSig = extractFloat64Frac( a );
3659 aExp = extractFloat64Exp( a );
3660 aSign = extractFloat64Sign( a );
3661 if ( 0x40E < aExp ) {
3662 if ( ( aExp == 0x7FF ) && aSig ) {
3663 aSign = 0;
3664 }
3665 goto invalid;
3666 }
3667 else if ( aExp < 0x3FF ) {
3668 if ( aExp || aSig ) {
a2f2d288 3669 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
3670 }
3671 return 0;
3672 }
3673 aSig |= LIT64( 0x0010000000000000 );
3674 shiftCount = 0x433 - aExp;
3675 savedASig = aSig;
3676 aSig >>= shiftCount;
3677 z = aSig;
3678 if ( aSign ) {
3679 z = - z;
3680 }
3681 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3682 invalid:
ff32e16e 3683 float_raise(float_flag_invalid, status);
bb98fe42 3684 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
3685 }
3686 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 3687 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
3688 }
3689 return z;
3690}
3691
158142c2
FB
3692/*----------------------------------------------------------------------------
3693| Returns the result of converting the double-precision floating-point value
3694| `a' to the 64-bit two's complement integer format. The conversion is
3695| performed according to the IEC/IEEE Standard for Binary Floating-Point
3696| Arithmetic---which means in particular that the conversion is rounded
3697| according to the current rounding mode. If `a' is a NaN, the largest
3698| positive integer is returned. Otherwise, if the conversion overflows, the
3699| largest integer with the same sign as `a' is returned.
3700*----------------------------------------------------------------------------*/
3701
f42c2224 3702int64_t float64_to_int64(float64 a, float_status *status)
158142c2
FB
3703{
3704 flag aSign;
0c48262d 3705 int aExp;
07d792d2 3706 int shiftCount;
bb98fe42 3707 uint64_t aSig, aSigExtra;
ff32e16e 3708 a = float64_squash_input_denormal(a, status);
158142c2
FB
3709
3710 aSig = extractFloat64Frac( a );
3711 aExp = extractFloat64Exp( a );
3712 aSign = extractFloat64Sign( a );
3713 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3714 shiftCount = 0x433 - aExp;
3715 if ( shiftCount <= 0 ) {
3716 if ( 0x43E < aExp ) {
ff32e16e 3717 float_raise(float_flag_invalid, status);
158142c2
FB
3718 if ( ! aSign
3719 || ( ( aExp == 0x7FF )
3720 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3721 ) {
3722 return LIT64( 0x7FFFFFFFFFFFFFFF );
3723 }
bb98fe42 3724 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3725 }
3726 aSigExtra = 0;
3727 aSig <<= - shiftCount;
3728 }
3729 else {
3730 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3731 }
ff32e16e 3732 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
3733
3734}
3735
3736/*----------------------------------------------------------------------------
3737| Returns the result of converting the double-precision floating-point value
3738| `a' to the 64-bit two's complement integer format. The conversion is
3739| performed according to the IEC/IEEE Standard for Binary Floating-Point
3740| Arithmetic, except that the conversion is always rounded toward zero.
3741| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3742| the conversion overflows, the largest integer with the same sign as `a' is
3743| returned.
3744*----------------------------------------------------------------------------*/
3745
f42c2224 3746int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
158142c2
FB
3747{
3748 flag aSign;
0c48262d 3749 int aExp;
07d792d2 3750 int shiftCount;
bb98fe42 3751 uint64_t aSig;
f42c2224 3752 int64_t z;
ff32e16e 3753 a = float64_squash_input_denormal(a, status);
158142c2
FB
3754
3755 aSig = extractFloat64Frac( a );
3756 aExp = extractFloat64Exp( a );
3757 aSign = extractFloat64Sign( a );
3758 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3759 shiftCount = aExp - 0x433;
3760 if ( 0 <= shiftCount ) {
3761 if ( 0x43E <= aExp ) {
f090c9d4 3762 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
ff32e16e 3763 float_raise(float_flag_invalid, status);
158142c2
FB
3764 if ( ! aSign
3765 || ( ( aExp == 0x7FF )
3766 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3767 ) {
3768 return LIT64( 0x7FFFFFFFFFFFFFFF );
3769 }
3770 }
bb98fe42 3771 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3772 }
3773 z = aSig<<shiftCount;
3774 }
3775 else {
3776 if ( aExp < 0x3FE ) {
a2f2d288
PM
3777 if (aExp | aSig) {
3778 status->float_exception_flags |= float_flag_inexact;
3779 }
158142c2
FB
3780 return 0;
3781 }
3782 z = aSig>>( - shiftCount );
bb98fe42 3783 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 3784 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
3785 }
3786 }
3787 if ( aSign ) z = - z;
3788 return z;
3789
3790}
3791
3792/*----------------------------------------------------------------------------
3793| Returns the result of converting the double-precision floating-point value
3794| `a' to the single-precision floating-point format. The conversion is
3795| performed according to the IEC/IEEE Standard for Binary Floating-Point
3796| Arithmetic.
3797*----------------------------------------------------------------------------*/
3798
e5a41ffa 3799float32 float64_to_float32(float64 a, float_status *status)
158142c2
FB
3800{
3801 flag aSign;
0c48262d 3802 int aExp;
bb98fe42
AF
3803 uint64_t aSig;
3804 uint32_t zSig;
ff32e16e 3805 a = float64_squash_input_denormal(a, status);
158142c2
FB
3806
3807 aSig = extractFloat64Frac( a );
3808 aExp = extractFloat64Exp( a );
3809 aSign = extractFloat64Sign( a );
3810 if ( aExp == 0x7FF ) {
ff32e16e
PM
3811 if (aSig) {
3812 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3813 }
158142c2
FB
3814 return packFloat32( aSign, 0xFF, 0 );
3815 }
3816 shift64RightJamming( aSig, 22, &aSig );
3817 zSig = aSig;
3818 if ( aExp || zSig ) {
3819 zSig |= 0x40000000;
3820 aExp -= 0x381;
3821 }
ff32e16e 3822 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
3823
3824}
3825
60011498
PB
3826
3827/*----------------------------------------------------------------------------
3828| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3829| half-precision floating-point value, returning the result. After being
3830| shifted into the proper positions, the three fields are simply added
3831| together to form the result. This means that any integer portion of `zSig'
3832| will be added into the exponent. Since a properly normalized significand
3833| will have an integer portion equal to 1, the `zExp' input should be 1 less
3834| than the desired result exponent whenever `zSig' is a complete, normalized
3835| significand.
3836*----------------------------------------------------------------------------*/
0c48262d 3837static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
60011498 3838{
bb4d4bb3 3839 return make_float16(
bb98fe42 3840 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3841}
3842
c4a1c5e7
PM
3843/*----------------------------------------------------------------------------
3844| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3845| and significand `zSig', and returns the proper half-precision floating-
3846| point value corresponding to the abstract input. Ordinarily, the abstract
3847| value is simply rounded and packed into the half-precision format, with
3848| the inexact exception raised if the abstract input cannot be represented
3849| exactly. However, if the abstract value is too large, the overflow and
3850| inexact exceptions are raised and an infinity or maximal finite value is
3851| returned. If the abstract value is too small, the input value is rounded to
3852| a subnormal number, and the underflow and inexact exceptions are raised if
3853| the abstract input cannot be represented exactly as a subnormal half-
3854| precision floating-point number.
3855| The `ieee' flag indicates whether to use IEEE standard half precision, or
3856| ARM-style "alternative representation", which omits the NaN and Inf
3857| encodings in order to raise the maximum representable exponent by one.
3858| The input significand `zSig' has its binary point between bits 22
3859| and 23, which is 13 bits to the left of the usual location. This shifted
3860| significand must be normalized or smaller. If `zSig' is not normalized,
3861| `zExp' must be 0; in that case, the result returned is a subnormal number,
3862| and it must not require rounding. In the usual case that `zSig' is
3863| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3864| Note the slightly odd position of the binary point in zSig compared with the
3865| other roundAndPackFloat functions. This should probably be fixed if we
3866| need to implement more float16 routines than just conversion.
3867| The handling of underflow and overflow follows the IEC/IEEE Standard for
3868| Binary Floating-Point Arithmetic.
3869*----------------------------------------------------------------------------*/
3870
0c48262d 3871static float16 roundAndPackFloat16(flag zSign, int zExp,
e5a41ffa
PM
3872 uint32_t zSig, flag ieee,
3873 float_status *status)
c4a1c5e7
PM
3874{
3875 int maxexp = ieee ? 29 : 30;
3876 uint32_t mask;
3877 uint32_t increment;
c4a1c5e7
PM
3878 bool rounding_bumps_exp;
3879 bool is_tiny = false;
3880
3881 /* Calculate the mask of bits of the mantissa which are not
3882 * representable in half-precision and will be lost.
3883 */
3884 if (zExp < 1) {
3885 /* Will be denormal in halfprec */
3886 mask = 0x00ffffff;
3887 if (zExp >= -11) {
3888 mask >>= 11 + zExp;
3889 }
3890 } else {
3891 /* Normal number in halfprec */
3892 mask = 0x00001fff;
3893 }
3894
a2f2d288 3895 switch (status->float_rounding_mode) {
c4a1c5e7
PM
3896 case float_round_nearest_even:
3897 increment = (mask + 1) >> 1;
3898 if ((zSig & mask) == increment) {
3899 increment = zSig & (increment << 1);
3900 }
3901 break;
f9288a76
PM
3902 case float_round_ties_away:
3903 increment = (mask + 1) >> 1;
3904 break;
c4a1c5e7
PM
3905 case float_round_up:
3906 increment = zSign ? 0 : mask;
3907 break;
3908 case float_round_down:
3909 increment = zSign ? mask : 0;
3910 break;
3911 default: /* round_to_zero */
3912 increment = 0;
3913 break;
3914 }
3915
3916 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3917
3918 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3919 if (ieee) {
ff32e16e 3920 float_raise(float_flag_overflow | float_flag_inexact, status);
c4a1c5e7
PM
3921 return packFloat16(zSign, 0x1f, 0);
3922 } else {
ff32e16e 3923 float_raise(float_flag_invalid, status);
c4a1c5e7
PM
3924 return packFloat16(zSign, 0x1f, 0x3ff);
3925 }
3926 }
3927
3928 if (zExp < 0) {
3929 /* Note that flush-to-zero does not affect half-precision results */
3930 is_tiny =
a2f2d288 3931 (status->float_detect_tininess == float_tininess_before_rounding)
c4a1c5e7
PM
3932 || (zExp < -1)
3933 || (!rounding_bumps_exp);
3934 }
3935 if (zSig & mask) {
ff32e16e 3936 float_raise(float_flag_inexact, status);
c4a1c5e7 3937 if (is_tiny) {
ff32e16e 3938 float_raise(float_flag_underflow, status);
c4a1c5e7
PM
3939 }
3940 }
3941
3942 zSig += increment;
3943 if (rounding_bumps_exp) {
3944 zSig >>= 1;
3945 zExp++;
3946 }
3947
3948 if (zExp < -10) {
3949 return packFloat16(zSign, 0, 0);
3950 }
3951 if (zExp < 0) {
3952 zSig >>= -zExp;
3953 zExp = 0;
3954 }
3955 return packFloat16(zSign, zExp, zSig >> 13);
3956}
3957
210cbd49
AB
3958/*----------------------------------------------------------------------------
3959| If `a' is denormal and we are in flush-to-zero mode then set the
3960| input-denormal exception and return zero. Otherwise just return the value.
3961*----------------------------------------------------------------------------*/
3962float16 float16_squash_input_denormal(float16 a, float_status *status)
3963{
3964 if (status->flush_inputs_to_zero) {
3965 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3966 float_raise(float_flag_input_denormal, status);
3967 return make_float16(float16_val(a) & 0x8000);
3968 }
3969 }
3970 return a;
3971}
3972
/*
 * Normalizes the half-precision subnormal significand `aSig'.  The shifted
 * significand is stored through `zSigPtr' and the matching exponent through
 * `zExpPtr'.  The shift is countLeadingZeros32(aSig) - 21, which puts the
 * leading one at bit 10, assuming countLeadingZeros32() returns the number
 * of leading zero bits of its 32-bit argument.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t lshift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3980
60011498
PB
3981/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3982 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3 3983
e5a41ffa 3984float32 float16_to_float32(float16 a, flag ieee, float_status *status)
60011498
PB
3985{
3986 flag aSign;
0c48262d 3987 int aExp;
bb98fe42 3988 uint32_t aSig;
60011498 3989
bb4d4bb3
PM
3990 aSign = extractFloat16Sign(a);
3991 aExp = extractFloat16Exp(a);
3992 aSig = extractFloat16Frac(a);
60011498
PB
3993
3994 if (aExp == 0x1f && ieee) {
3995 if (aSig) {
ff32e16e 3996 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
60011498 3997 }
4be8eeac 3998 return packFloat32(aSign, 0xff, 0);
60011498
PB
3999 }
4000 if (aExp == 0) {
60011498
PB
4001 if (aSig == 0) {
4002 return packFloat32(aSign, 0, 0);
4003 }
4004
c4a1c5e7
PM
4005 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
4006 aExp--;
60011498
PB
4007 }
4008 return packFloat32( aSign, aExp + 0x70, aSig << 13);
4009}
4010
e5a41ffa 4011float16 float32_to_float16(float32 a, flag ieee, float_status *status)
60011498
PB
4012{
4013 flag aSign;
0c48262d 4014 int aExp;
bb98fe42 4015 uint32_t aSig;
38970efa 4016
ff32e16e 4017 a = float32_squash_input_denormal(a, status);
60011498
PB
4018
4019 aSig = extractFloat32Frac( a );
4020 aExp = extractFloat32Exp( a );
4021 aSign = extractFloat32Sign( a );
4022 if ( aExp == 0xFF ) {
4023 if (aSig) {
600e30d2 4024 /* Input is a NaN */
600e30d2 4025 if (!ieee) {
ff32e16e 4026 float_raise(float_flag_invalid, status);
600e30d2
PM
4027 return packFloat16(aSign, 0, 0);
4028 }
38970efa 4029 return commonNaNToFloat16(
ff32e16e 4030 float32ToCommonNaN(a, status), status);
60011498 4031 }
600e30d2
PM
4032 /* Infinity */
4033 if (!ieee) {
ff32e16e 4034 float_raise(float_flag_invalid, status);
600e30d2
PM
4035 return packFloat16(aSign, 0x1f, 0x3ff);
4036 }
4037 return packFloat16(aSign, 0x1f, 0);
60011498 4038 }
600e30d2 4039 if (aExp == 0 && aSig == 0) {
60011498
PB
4040 return packFloat16(aSign, 0, 0);
4041 }
38970efa
PM
4042 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4043 * even if the input is denormal; however this is harmless because
4044 * the largest possible single-precision denormal is still smaller
4045 * than the smallest representable half-precision denormal, and so we
4046 * will end up ignoring aSig and returning via the "always return zero"
4047 * codepath.
4048 */
60011498 4049 aSig |= 0x00800000;
c4a1c5e7 4050 aExp -= 0x71;
60011498 4051
ff32e16e 4052 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
60011498
PB
4053}
4054
e5a41ffa 4055float64 float16_to_float64(float16 a, flag ieee, float_status *status)
14c9a07e
PM
4056{
4057 flag aSign;
0c48262d 4058 int aExp;
14c9a07e
PM
4059 uint32_t aSig;
4060
4061 aSign = extractFloat16Sign(a);
4062 aExp = extractFloat16Exp(a);
4063 aSig = extractFloat16Frac(a);
4064
4065 if (aExp == 0x1f && ieee) {
4066 if (aSig) {
4067 return commonNaNToFloat64(
ff32e16e 4068 float16ToCommonNaN(a, status), status);
14c9a07e
PM
4069 }
4070 return packFloat64(aSign, 0x7ff, 0);
4071 }
4072 if (aExp == 0) {
4073 if (aSig == 0) {
4074 return packFloat64(aSign, 0, 0);
4075 }
4076
4077 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
4078 aExp--;
4079 }
4080 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
4081}
4082
e5a41ffa 4083float16 float64_to_float16(float64 a, flag ieee, float_status *status)
14c9a07e
PM
4084{
4085 flag aSign;
0c48262d 4086 int aExp;
14c9a07e
PM
4087 uint64_t aSig;
4088 uint32_t zSig;
4089
ff32e16e 4090 a = float64_squash_input_denormal(a, status);
14c9a07e
PM
4091
4092 aSig = extractFloat64Frac(a);
4093 aExp = extractFloat64Exp(a);
4094 aSign = extractFloat64Sign(a);
4095 if (aExp == 0x7FF) {
4096 if (aSig) {
4097 /* Input is a NaN */
4098 if (!ieee) {
ff32e16e 4099 float_raise(float_flag_invalid, status);
14c9a07e
PM
4100 return packFloat16(aSign, 0, 0);
4101 }
4102 return commonNaNToFloat16(
ff32e16e 4103 float64ToCommonNaN(a, status), status);
14c9a07e
PM
4104 }
4105 /* Infinity */
4106 if (!ieee) {
ff32e16e 4107 float_raise(float_flag_invalid, status);
14c9a07e
PM
4108 return packFloat16(aSign, 0x1f, 0x3ff);
4109 }
4110 return packFloat16(aSign, 0x1f, 0);
4111 }
4112 shift64RightJamming(aSig, 29, &aSig);
4113 zSig = aSig;
4114 if (aExp == 0 && zSig == 0) {
4115 return packFloat16(aSign, 0, 0);
4116 }
4117 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4118 * even if the input is denormal; however this is harmless because
4119 * the largest possible single-precision denormal is still smaller
4120 * than the smallest representable half-precision denormal, and so we
4121 * will end up ignoring aSig and returning via the "always return zero"
4122 * codepath.
4123 */
4124 zSig |= 0x00800000;
4125 aExp -= 0x3F1;
4126
ff32e16e 4127 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
14c9a07e
PM
4128}
4129
158142c2
FB
4130/*----------------------------------------------------------------------------
4131| Returns the result of converting the double-precision floating-point value
4132| `a' to the extended double-precision floating-point format. The conversion
4133| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4134| Arithmetic.
4135*----------------------------------------------------------------------------*/
4136
e5a41ffa 4137floatx80 float64_to_floatx80(float64 a, float_status *status)
158142c2
FB
4138{
4139 flag aSign;
0c48262d 4140 int aExp;
bb98fe42 4141 uint64_t aSig;
158142c2 4142
ff32e16e 4143 a = float64_squash_input_denormal(a, status);
158142c2
FB
4144 aSig = extractFloat64Frac( a );
4145 aExp = extractFloat64Exp( a );
4146 aSign = extractFloat64Sign( a );
4147 if ( aExp == 0x7FF ) {
ff32e16e
PM
4148 if (aSig) {
4149 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4150 }
158142c2
FB
4151 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4152 }
4153 if ( aExp == 0 ) {
4154 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4155 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4156 }
4157 return
4158 packFloatx80(
4159 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4160
4161}
4162
158142c2
FB
4163/*----------------------------------------------------------------------------
4164| Returns the result of converting the double-precision floating-point value
4165| `a' to the quadruple-precision floating-point format. The conversion is
4166| performed according to the IEC/IEEE Standard for Binary Floating-Point
4167| Arithmetic.
4168*----------------------------------------------------------------------------*/
4169
e5a41ffa 4170float128 float64_to_float128(float64 a, float_status *status)
158142c2
FB
4171{
4172 flag aSign;
0c48262d 4173 int aExp;
bb98fe42 4174 uint64_t aSig, zSig0, zSig1;
158142c2 4175
ff32e16e 4176 a = float64_squash_input_denormal(a, status);
158142c2
FB
4177 aSig = extractFloat64Frac( a );
4178 aExp = extractFloat64Exp( a );
4179 aSign = extractFloat64Sign( a );
4180 if ( aExp == 0x7FF ) {
ff32e16e
PM
4181 if (aSig) {
4182 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4183 }
158142c2
FB
4184 return packFloat128( aSign, 0x7FFF, 0, 0 );
4185 }
4186 if ( aExp == 0 ) {
4187 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4188 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4189 --aExp;
4190 }
4191 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4192 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4193
4194}
4195
158142c2
FB
4196
4197/*----------------------------------------------------------------------------
4198| Returns the remainder of the double-precision floating-point value `a'
4199| with respect to the corresponding value `b'. The operation is performed
4200| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4201*----------------------------------------------------------------------------*/
4202
e5a41ffa 4203float64 float64_rem(float64 a, float64 b, float_status *status)
158142c2 4204{
ed086f3d 4205 flag aSign, zSign;
0c48262d 4206 int aExp, bExp, expDiff;
bb98fe42
AF
4207 uint64_t aSig, bSig;
4208 uint64_t q, alternateASig;
4209 int64_t sigMean;
158142c2 4210
ff32e16e
PM
4211 a = float64_squash_input_denormal(a, status);
4212 b = float64_squash_input_denormal(b, status);
158142c2
FB
4213 aSig = extractFloat64Frac( a );
4214 aExp = extractFloat64Exp( a );
4215 aSign = extractFloat64Sign( a );
4216 bSig = extractFloat64Frac( b );
4217 bExp = extractFloat64Exp( b );
158142c2
FB
4218 if ( aExp == 0x7FF ) {
4219 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 4220 return propagateFloat64NaN(a, b, status);
158142c2 4221 }
ff32e16e 4222 float_raise(float_flag_invalid, status);
af39bc8c 4223 return float64_default_nan(status);
158142c2
FB
4224 }
4225 if ( bExp == 0x7FF ) {
ff32e16e
PM
4226 if (bSig) {
4227 return propagateFloat64NaN(a, b, status);
4228 }
158142c2
FB
4229 return a;
4230 }
4231 if ( bExp == 0 ) {
4232 if ( bSig == 0 ) {
ff32e16e 4233 float_raise(float_flag_invalid, status);
af39bc8c 4234 return float64_default_nan(status);
158142c2
FB
4235 }
4236 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4237 }
4238 if ( aExp == 0 ) {
4239 if ( aSig == 0 ) return a;
4240 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4241 }
4242 expDiff = aExp - bExp;
4243 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4244 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4245 if ( expDiff < 0 ) {
4246 if ( expDiff < -1 ) return a;
4247 aSig >>= 1;
4248 }
4249 q = ( bSig <= aSig );
4250 if ( q ) aSig -= bSig;
4251 expDiff -= 64;
4252 while ( 0 < expDiff ) {
4253 q = estimateDiv128To64( aSig, 0, bSig );
4254 q = ( 2 < q ) ? q - 2 : 0;
4255 aSig = - ( ( bSig>>2 ) * q );
4256 expDiff -= 62;
4257 }
4258 expDiff += 64;
4259 if ( 0 < expDiff ) {
4260 q = estimateDiv128To64( aSig, 0, bSig );
4261 q = ( 2 < q ) ? q - 2 : 0;
4262 q >>= 64 - expDiff;
4263 bSig >>= 2;
4264 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4265 }
4266 else {
4267 aSig >>= 2;
4268 bSig >>= 2;
4269 }
4270 do {
4271 alternateASig = aSig;
4272 ++q;
4273 aSig -= bSig;
bb98fe42 4274 } while ( 0 <= (int64_t) aSig );
158142c2
FB
4275 sigMean = aSig + alternateASig;
4276 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4277 aSig = alternateASig;
4278 }
bb98fe42 4279 zSign = ( (int64_t) aSig < 0 );
158142c2 4280 if ( zSign ) aSig = - aSig;
ff32e16e 4281 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
4282
4283}
4284
369be8f6 4285
158142c2
FB
4286/*----------------------------------------------------------------------------
4287| Returns the square root of the double-precision floating-point value `a'.
4288| The operation is performed according to the IEC/IEEE Standard for Binary
4289| Floating-Point Arithmetic.
4290*----------------------------------------------------------------------------*/
4291
e5a41ffa 4292float64 float64_sqrt(float64 a, float_status *status)
158142c2
FB
4293{
4294 flag aSign;
0c48262d 4295 int aExp, zExp;
bb98fe42
AF
4296 uint64_t aSig, zSig, doubleZSig;
4297 uint64_t rem0, rem1, term0, term1;
ff32e16e 4298 a = float64_squash_input_denormal(a, status);
158142c2
FB
4299
4300 aSig = extractFloat64Frac( a );
4301 aExp = extractFloat64Exp( a );
4302 aSign = extractFloat64Sign( a );
4303 if ( aExp == 0x7FF ) {
ff32e16e
PM
4304 if (aSig) {
4305 return propagateFloat64NaN(a, a, status);
4306 }
158142c2 4307 if ( ! aSign ) return a;
ff32e16e 4308 float_raise(float_flag_invalid, status);
af39bc8c 4309 return float64_default_nan(status);
158142c2
FB
4310 }
4311 if ( aSign ) {
4312 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 4313 float_raise(float_flag_invalid, status);
af39bc8c 4314 return float64_default_nan(status);
158142c2
FB
4315 }
4316 if ( aExp == 0 ) {
f090c9d4 4317 if ( aSig == 0 ) return float64_zero;
158142c2
FB
4318 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4319 }
4320 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4321 aSig |= LIT64( 0x0010000000000000 );
4322 zSig = estimateSqrt32( aExp, aSig>>21 );
4323 aSig <<= 9 - ( aExp & 1 );
4324 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4325 if ( ( zSig & 0x1FF ) <= 5 ) {
4326 doubleZSig = zSig<<1;
4327 mul64To128( zSig, zSig, &term0, &term1 );
4328 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4329 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4330 --zSig;
4331 doubleZSig -= 2;
4332 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4333 }
4334 zSig |= ( ( rem0 | rem1 ) != 0 );
4335 }
ff32e16e 4336 return roundAndPackFloat64(0, zExp, zSig, status);
158142c2
FB
4337
4338}
4339
374dfc33
AJ
4340/*----------------------------------------------------------------------------
4341| Returns the binary log of the double-precision floating-point value `a'.
4342| The operation is performed according to the IEC/IEEE Standard for Binary
4343| Floating-Point Arithmetic.
4344*----------------------------------------------------------------------------*/
e5a41ffa 4345float64 float64_log2(float64 a, float_status *status)
374dfc33
AJ
4346{
4347 flag aSign, zSign;
0c48262d 4348 int aExp;
bb98fe42 4349 uint64_t aSig, aSig0, aSig1, zSig, i;
ff32e16e 4350 a = float64_squash_input_denormal(a, status);
374dfc33
AJ
4351
4352 aSig = extractFloat64Frac( a );
4353 aExp = extractFloat64Exp( a );
4354 aSign = extractFloat64Sign( a );
4355
4356 if ( aExp == 0 ) {
4357 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4358 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4359 }
4360 if ( aSign ) {
ff32e16e 4361 float_raise(float_flag_invalid, status);
af39bc8c 4362 return float64_default_nan(status);
374dfc33
AJ
4363 }
4364 if ( aExp == 0x7FF ) {
ff32e16e
PM
4365 if (aSig) {
4366 return propagateFloat64NaN(a, float64_zero, status);
4367 }
374dfc33
AJ
4368 return a;
4369 }
4370
4371 aExp -= 0x3FF;
4372 aSig |= LIT64( 0x0010000000000000 );
4373 zSign = aExp < 0;
bb98fe42 4374 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4375 for (i = 1LL << 51; i > 0; i >>= 1) {
4376 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4377 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4378 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4379 aSig >>= 1;
4380 zSig |= i;
4381 }
4382 }
4383
4384 if ( zSign )
4385 zSig = -zSig;
ff32e16e 4386 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
374dfc33
AJ
4387}
4388
158142c2
FB
4389/*----------------------------------------------------------------------------
4390| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4391| corresponding value `b', and 0 otherwise. The invalid exception is raised
4392| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4393| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4394*----------------------------------------------------------------------------*/
4395
e5a41ffa 4396int float64_eq(float64 a, float64 b, float_status *status)
158142c2 4397{
bb98fe42 4398 uint64_t av, bv;
ff32e16e
PM
4399 a = float64_squash_input_denormal(a, status);
4400 b = float64_squash_input_denormal(b, status);
158142c2
FB
4401
4402 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4403 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4404 ) {
ff32e16e 4405 float_raise(float_flag_invalid, status);
158142c2
FB
4406 return 0;
4407 }
f090c9d4 4408 av = float64_val(a);
a1b91bb4 4409 bv = float64_val(b);
bb98fe42 4410 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4411
4412}
4413
4414/*----------------------------------------------------------------------------
4415| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4416| equal to the corresponding value `b', and 0 otherwise. The invalid
4417| exception is raised if either operand is a NaN. The comparison is performed
4418| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4419*----------------------------------------------------------------------------*/
4420
e5a41ffa 4421int float64_le(float64 a, float64 b, float_status *status)
158142c2
FB
4422{
4423 flag aSign, bSign;
bb98fe42 4424 uint64_t av, bv;
ff32e16e
PM
4425 a = float64_squash_input_denormal(a, status);
4426 b = float64_squash_input_denormal(b, status);
158142c2
FB
4427
4428 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4429 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4430 ) {
ff32e16e 4431 float_raise(float_flag_invalid, status);
158142c2
FB
4432 return 0;
4433 }
4434 aSign = extractFloat64Sign( a );
4435 bSign = extractFloat64Sign( b );
f090c9d4 4436 av = float64_val(a);
a1b91bb4 4437 bv = float64_val(b);
bb98fe42 4438 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4439 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4440
4441}
4442
4443/*----------------------------------------------------------------------------
4444| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4445| the corresponding value `b', and 0 otherwise. The invalid exception is
4446| raised if either operand is a NaN. The comparison is performed according
4447| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4448*----------------------------------------------------------------------------*/
4449
e5a41ffa 4450int float64_lt(float64 a, float64 b, float_status *status)
158142c2
FB
4451{
4452 flag aSign, bSign;
bb98fe42 4453 uint64_t av, bv;
158142c2 4454
ff32e16e
PM
4455 a = float64_squash_input_denormal(a, status);
4456 b = float64_squash_input_denormal(b, status);
158142c2
FB
4457 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4458 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4459 ) {
ff32e16e 4460 float_raise(float_flag_invalid, status);
158142c2
FB
4461 return 0;
4462 }
4463 aSign = extractFloat64Sign( a );
4464 bSign = extractFloat64Sign( b );
f090c9d4 4465 av = float64_val(a);
a1b91bb4 4466 bv = float64_val(b);
bb98fe42 4467 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4468 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4469
4470}
4471
67b7861d
AJ
4472/*----------------------------------------------------------------------------
4473| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4474| be compared, and 0 otherwise. The invalid exception is raised if either
4475| operand is a NaN. The comparison is performed according to the IEC/IEEE
4476| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4477*----------------------------------------------------------------------------*/
4478
e5a41ffa 4479int float64_unordered(float64 a, float64 b, float_status *status)
67b7861d 4480{
ff32e16e
PM
4481 a = float64_squash_input_denormal(a, status);
4482 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4483
4484 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4485 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4486 ) {
ff32e16e 4487 float_raise(float_flag_invalid, status);
67b7861d
AJ
4488 return 1;
4489 }
4490 return 0;
4491}
4492
158142c2
FB
4493/*----------------------------------------------------------------------------
4494| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4495| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4496| exception.The comparison is performed according to the IEC/IEEE Standard
4497| for Binary Floating-Point Arithmetic.
158142c2
FB
4498*----------------------------------------------------------------------------*/
4499
e5a41ffa 4500int float64_eq_quiet(float64 a, float64 b, float_status *status)
158142c2 4501{
bb98fe42 4502 uint64_t av, bv;
ff32e16e
PM
4503 a = float64_squash_input_denormal(a, status);
4504 b = float64_squash_input_denormal(b, status);
158142c2
FB
4505
4506 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4507 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4508 ) {
af39bc8c
AM
4509 if (float64_is_signaling_nan(a, status)
4510 || float64_is_signaling_nan(b, status)) {
ff32e16e 4511 float_raise(float_flag_invalid, status);
b689362d 4512 }
158142c2
FB
4513 return 0;
4514 }
f090c9d4 4515 av = float64_val(a);
a1b91bb4 4516 bv = float64_val(b);
bb98fe42 4517 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4518
4519}
4520
4521/*----------------------------------------------------------------------------
4522| Returns 1 if the double-precision floating-point value `a' is less than or
4523| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4524| cause an exception. Otherwise, the comparison is performed according to the
4525| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4526*----------------------------------------------------------------------------*/
4527
e5a41ffa 4528int float64_le_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4529{
4530 flag aSign, bSign;
bb98fe42 4531 uint64_t av, bv;
ff32e16e
PM
4532 a = float64_squash_input_denormal(a, status);
4533 b = float64_squash_input_denormal(b, status);
158142c2
FB
4534
4535 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4536 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4537 ) {
af39bc8c
AM
4538 if (float64_is_signaling_nan(a, status)
4539 || float64_is_signaling_nan(b, status)) {
ff32e16e 4540 float_raise(float_flag_invalid, status);
158142c2
FB
4541 }
4542 return 0;
4543 }
4544 aSign = extractFloat64Sign( a );
4545 bSign = extractFloat64Sign( b );
f090c9d4 4546 av = float64_val(a);
a1b91bb4 4547 bv = float64_val(b);
bb98fe42 4548 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4549 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4550
4551}
4552
4553/*----------------------------------------------------------------------------
4554| Returns 1 if the double-precision floating-point value `a' is less than
4555| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4556| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4557| Standard for Binary Floating-Point Arithmetic.
4558*----------------------------------------------------------------------------*/
4559
e5a41ffa 4560int float64_lt_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4561{
4562 flag aSign, bSign;
bb98fe42 4563 uint64_t av, bv;
ff32e16e
PM
4564 a = float64_squash_input_denormal(a, status);
4565 b = float64_squash_input_denormal(b, status);
158142c2
FB
4566
4567 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4568 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4569 ) {
af39bc8c
AM
4570 if (float64_is_signaling_nan(a, status)
4571 || float64_is_signaling_nan(b, status)) {
ff32e16e 4572 float_raise(float_flag_invalid, status);
158142c2
FB
4573 }
4574 return 0;
4575 }
4576 aSign = extractFloat64Sign( a );
4577 bSign = extractFloat64Sign( b );
f090c9d4 4578 av = float64_val(a);
a1b91bb4 4579 bv = float64_val(b);
bb98fe42 4580 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4581 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4582
4583}
4584
67b7861d
AJ
4585/*----------------------------------------------------------------------------
4586| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4587| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4588| comparison is performed according to the IEC/IEEE Standard for Binary
4589| Floating-Point Arithmetic.
4590*----------------------------------------------------------------------------*/
4591
e5a41ffa 4592int float64_unordered_quiet(float64 a, float64 b, float_status *status)
67b7861d 4593{
ff32e16e
PM
4594 a = float64_squash_input_denormal(a, status);
4595 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4596
4597 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4598 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4599 ) {
af39bc8c
AM
4600 if (float64_is_signaling_nan(a, status)
4601 || float64_is_signaling_nan(b, status)) {
ff32e16e 4602 float_raise(float_flag_invalid, status);
67b7861d
AJ
4603 }
4604 return 1;
4605 }
4606 return 0;
4607}
4608
158142c2
FB
4609/*----------------------------------------------------------------------------
4610| Returns the result of converting the extended double-precision floating-
4611| point value `a' to the 32-bit two's complement integer format. The
4612| conversion is performed according to the IEC/IEEE Standard for Binary
4613| Floating-Point Arithmetic---which means in particular that the conversion
4614| is rounded according to the current rounding mode. If `a' is a NaN, the
4615| largest positive integer is returned. Otherwise, if the conversion
4616| overflows, the largest integer with the same sign as `a' is returned.
4617*----------------------------------------------------------------------------*/
4618
f4014512 4619int32_t floatx80_to_int32(floatx80 a, float_status *status)
158142c2
FB
4620{
4621 flag aSign;
f4014512 4622 int32_t aExp, shiftCount;
bb98fe42 4623 uint64_t aSig;
158142c2 4624
d1eb8f2a
AD
4625 if (floatx80_invalid_encoding(a)) {
4626 float_raise(float_flag_invalid, status);
4627 return 1 << 31;
4628 }
158142c2
FB
4629 aSig = extractFloatx80Frac( a );
4630 aExp = extractFloatx80Exp( a );
4631 aSign = extractFloatx80Sign( a );
bb98fe42 4632 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4633 shiftCount = 0x4037 - aExp;
4634 if ( shiftCount <= 0 ) shiftCount = 1;
4635 shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 4636 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
4637
4638}
4639
4640/*----------------------------------------------------------------------------
4641| Returns the result of converting the extended double-precision floating-
4642| point value `a' to the 32-bit two's complement integer format. The
4643| conversion is performed according to the IEC/IEEE Standard for Binary
4644| Floating-Point Arithmetic, except that the conversion is always rounded
4645| toward zero. If `a' is a NaN, the largest positive integer is returned.
4646| Otherwise, if the conversion overflows, the largest integer with the same
4647| sign as `a' is returned.
4648*----------------------------------------------------------------------------*/
4649
f4014512 4650int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4651{
4652 flag aSign;
f4014512 4653 int32_t aExp, shiftCount;
bb98fe42 4654 uint64_t aSig, savedASig;
b3a6a2e0 4655 int32_t z;
158142c2 4656
d1eb8f2a
AD
4657 if (floatx80_invalid_encoding(a)) {
4658 float_raise(float_flag_invalid, status);
4659 return 1 << 31;
4660 }
158142c2
FB
4661 aSig = extractFloatx80Frac( a );
4662 aExp = extractFloatx80Exp( a );
4663 aSign = extractFloatx80Sign( a );
4664 if ( 0x401E < aExp ) {
bb98fe42 4665 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4666 goto invalid;
4667 }
4668 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4669 if (aExp || aSig) {
4670 status->float_exception_flags |= float_flag_inexact;
4671 }
158142c2
FB
4672 return 0;
4673 }
4674 shiftCount = 0x403E - aExp;
4675 savedASig = aSig;
4676 aSig >>= shiftCount;
4677 z = aSig;
4678 if ( aSign ) z = - z;
4679 if ( ( z < 0 ) ^ aSign ) {
4680 invalid:
ff32e16e 4681 float_raise(float_flag_invalid, status);
bb98fe42 4682 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4683 }
4684 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 4685 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4686 }
4687 return z;
4688
4689}
4690
4691/*----------------------------------------------------------------------------
4692| Returns the result of converting the extended double-precision floating-
4693| point value `a' to the 64-bit two's complement integer format. The
4694| conversion is performed according to the IEC/IEEE Standard for Binary
4695| Floating-Point Arithmetic---which means in particular that the conversion
4696| is rounded according to the current rounding mode. If `a' is a NaN,
4697| the largest positive integer is returned. Otherwise, if the conversion
4698| overflows, the largest integer with the same sign as `a' is returned.
4699*----------------------------------------------------------------------------*/
4700
f42c2224 4701int64_t floatx80_to_int64(floatx80 a, float_status *status)
158142c2
FB
4702{
4703 flag aSign;
f4014512 4704 int32_t aExp, shiftCount;
bb98fe42 4705 uint64_t aSig, aSigExtra;
158142c2 4706
d1eb8f2a
AD
4707 if (floatx80_invalid_encoding(a)) {
4708 float_raise(float_flag_invalid, status);
4709 return 1ULL << 63;
4710 }
158142c2
FB
4711 aSig = extractFloatx80Frac( a );
4712 aExp = extractFloatx80Exp( a );
4713 aSign = extractFloatx80Sign( a );
4714 shiftCount = 0x403E - aExp;
4715 if ( shiftCount <= 0 ) {
4716 if ( shiftCount ) {
ff32e16e 4717 float_raise(float_flag_invalid, status);
158142c2
FB
4718 if ( ! aSign
4719 || ( ( aExp == 0x7FFF )
4720 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4721 ) {
4722 return LIT64( 0x7FFFFFFFFFFFFFFF );
4723 }
bb98fe42 4724 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4725 }
4726 aSigExtra = 0;
4727 }
4728 else {
4729 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4730 }
ff32e16e 4731 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
4732
4733}
4734
4735/*----------------------------------------------------------------------------
4736| Returns the result of converting the extended double-precision floating-
4737| point value `a' to the 64-bit two's complement integer format. The
4738| conversion is performed according to the IEC/IEEE Standard for Binary
4739| Floating-Point Arithmetic, except that the conversion is always rounded
4740| toward zero. If `a' is a NaN, the largest positive integer is returned.
4741| Otherwise, if the conversion overflows, the largest integer with the same
4742| sign as `a' is returned.
4743*----------------------------------------------------------------------------*/
4744
f42c2224 4745int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4746{
4747 flag aSign;
f4014512 4748 int32_t aExp, shiftCount;
bb98fe42 4749 uint64_t aSig;
f42c2224 4750 int64_t z;
158142c2 4751
d1eb8f2a
AD
4752 if (floatx80_invalid_encoding(a)) {
4753 float_raise(float_flag_invalid, status);
4754 return 1ULL << 63;
4755 }
158142c2
FB
4756 aSig = extractFloatx80Frac( a );
4757 aExp = extractFloatx80Exp( a );
4758 aSign = extractFloatx80Sign( a );
4759 shiftCount = aExp - 0x403E;
4760 if ( 0 <= shiftCount ) {
4761 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4762 if ( ( a.high != 0xC03E ) || aSig ) {
ff32e16e 4763 float_raise(float_flag_invalid, status);
158142c2
FB
4764 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4765 return LIT64( 0x7FFFFFFFFFFFFFFF );
4766 }
4767 }
bb98fe42 4768 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4769 }
4770 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4771 if (aExp | aSig) {
4772 status->float_exception_flags |= float_flag_inexact;
4773 }
158142c2
FB
4774 return 0;
4775 }
4776 z = aSig>>( - shiftCount );
bb98fe42 4777 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 4778 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4779 }
4780 if ( aSign ) z = - z;
4781 return z;
4782
4783}
4784
4785/*----------------------------------------------------------------------------
4786| Returns the result of converting the extended double-precision floating-
4787| point value `a' to the single-precision floating-point format. The
4788| conversion is performed according to the IEC/IEEE Standard for Binary
4789| Floating-Point Arithmetic.
4790*----------------------------------------------------------------------------*/
4791
e5a41ffa 4792float32 floatx80_to_float32(floatx80 a, float_status *status)
158142c2
FB
4793{
4794 flag aSign;
f4014512 4795 int32_t aExp;
bb98fe42 4796 uint64_t aSig;
158142c2 4797
d1eb8f2a
AD
4798 if (floatx80_invalid_encoding(a)) {
4799 float_raise(float_flag_invalid, status);
4800 return float32_default_nan(status);
4801 }
158142c2
FB
4802 aSig = extractFloatx80Frac( a );
4803 aExp = extractFloatx80Exp( a );
4804 aSign = extractFloatx80Sign( a );
4805 if ( aExp == 0x7FFF ) {
bb98fe42 4806 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4807 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4808 }
4809 return packFloat32( aSign, 0xFF, 0 );
4810 }
4811 shift64RightJamming( aSig, 33, &aSig );
4812 if ( aExp || aSig ) aExp -= 0x3F81;
ff32e16e 4813 return roundAndPackFloat32(aSign, aExp, aSig, status);
158142c2
FB
4814
4815}
4816
4817/*----------------------------------------------------------------------------
4818| Returns the result of converting the extended double-precision floating-
4819| point value `a' to the double-precision floating-point format. The
4820| conversion is performed according to the IEC/IEEE Standard for Binary
4821| Floating-Point Arithmetic.
4822*----------------------------------------------------------------------------*/
4823
e5a41ffa 4824float64 floatx80_to_float64(floatx80 a, float_status *status)
158142c2
FB
4825{
4826 flag aSign;
f4014512 4827 int32_t aExp;
bb98fe42 4828 uint64_t aSig, zSig;
158142c2 4829
d1eb8f2a
AD
4830 if (floatx80_invalid_encoding(a)) {
4831 float_raise(float_flag_invalid, status);
4832 return float64_default_nan(status);
4833 }
158142c2
FB
4834 aSig = extractFloatx80Frac( a );
4835 aExp = extractFloatx80Exp( a );
4836 aSign = extractFloatx80Sign( a );
4837 if ( aExp == 0x7FFF ) {
bb98fe42 4838 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4839 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4840 }
4841 return packFloat64( aSign, 0x7FF, 0 );
4842 }
4843 shift64RightJamming( aSig, 1, &zSig );
4844 if ( aExp || aSig ) aExp -= 0x3C01;
ff32e16e 4845 return roundAndPackFloat64(aSign, aExp, zSig, status);
158142c2
FB
4846
4847}
4848
158142c2
FB
4849/*----------------------------------------------------------------------------
4850| Returns the result of converting the extended double-precision floating-
4851| point value `a' to the quadruple-precision floating-point format. The
4852| conversion is performed according to the IEC/IEEE Standard for Binary
4853| Floating-Point Arithmetic.
4854*----------------------------------------------------------------------------*/
4855
e5a41ffa 4856float128 floatx80_to_float128(floatx80 a, float_status *status)
158142c2
FB
4857{
4858 flag aSign;
0c48262d 4859 int aExp;
bb98fe42 4860 uint64_t aSig, zSig0, zSig1;
158142c2 4861
d1eb8f2a
AD
4862 if (floatx80_invalid_encoding(a)) {
4863 float_raise(float_flag_invalid, status);
4864 return float128_default_nan(status);
4865 }
158142c2
FB
4866 aSig = extractFloatx80Frac( a );
4867 aExp = extractFloatx80Exp( a );
4868 aSign = extractFloatx80Sign( a );
bb98fe42 4869 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4870 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4871 }
4872 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4873 return packFloat128( aSign, aExp, zSig0, zSig1 );
4874
4875}
4876
0f721292
LV
4877/*----------------------------------------------------------------------------
4878| Rounds the extended double-precision floating-point value `a'
4879| to the precision provided by floatx80_rounding_precision and returns the
4880| result as an extended double-precision floating-point value.
4881| The operation is performed according to the IEC/IEEE Standard for Binary
4882| Floating-Point Arithmetic.
4883*----------------------------------------------------------------------------*/
4884
4885floatx80 floatx80_round(floatx80 a, float_status *status)
4886{
4887 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4888 extractFloatx80Sign(a),
4889 extractFloatx80Exp(a),
4890 extractFloatx80Frac(a), 0, status);
4891}
4892
158142c2
FB
4893/*----------------------------------------------------------------------------
4894| Rounds the extended double-precision floating-point value `a' to an integer,
4895| and returns the result as an extended quadruple-precision floating-point
4896| value. The operation is performed according to the IEC/IEEE Standard for
4897| Binary Floating-Point Arithmetic.
4898*----------------------------------------------------------------------------*/
4899
e5a41ffa 4900floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
158142c2
FB
4901{
4902 flag aSign;
f4014512 4903 int32_t aExp;
bb98fe42 4904 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4905 floatx80 z;
4906
d1eb8f2a
AD
4907 if (floatx80_invalid_encoding(a)) {
4908 float_raise(float_flag_invalid, status);
4909 return floatx80_default_nan(status);
4910 }
158142c2
FB
4911 aExp = extractFloatx80Exp( a );
4912 if ( 0x403E <= aExp ) {
bb98fe42 4913 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
ff32e16e 4914 return propagateFloatx80NaN(a, a, status);
158142c2
FB
4915 }
4916 return a;
4917 }
4918 if ( aExp < 0x3FFF ) {
4919 if ( ( aExp == 0 )
bb98fe42 4920 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4921 return a;
4922 }
a2f2d288 4923 status->float_exception_flags |= float_flag_inexact;
158142c2 4924 aSign = extractFloatx80Sign( a );
a2f2d288 4925 switch (status->float_rounding_mode) {
158142c2 4926 case float_round_nearest_even:
bb98fe42 4927 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4928 ) {
4929 return
4930 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4931 }
4932 break;
f9288a76
PM
4933 case float_round_ties_away:
4934 if (aExp == 0x3FFE) {
4935 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4936 }
4937 break;
158142c2
FB
4938 case float_round_down:
4939 return
4940 aSign ?
4941 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4942 : packFloatx80( 0, 0, 0 );
4943 case float_round_up:
4944 return
4945 aSign ? packFloatx80( 1, 0, 0 )
4946 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4947 }
4948 return packFloatx80( aSign, 0, 0 );
4949 }
4950 lastBitMask = 1;
4951 lastBitMask <<= 0x403E - aExp;
4952 roundBitsMask = lastBitMask - 1;
4953 z = a;
a2f2d288 4954 switch (status->float_rounding_mode) {
dc355b76 4955 case float_round_nearest_even:
158142c2 4956 z.low += lastBitMask>>1;
dc355b76
PM
4957 if ((z.low & roundBitsMask) == 0) {
4958 z.low &= ~lastBitMask;
4959 }
4960 break;
f9288a76
PM
4961 case float_round_ties_away:
4962 z.low += lastBitMask >> 1;
4963 break;
dc355b76
PM
4964 case float_round_to_zero:
4965 break;
4966 case float_round_up:
4967 if (!extractFloatx80Sign(z)) {
4968 z.low += roundBitsMask;
4969 }
4970 break;
4971 case float_round_down:
4972 if (extractFloatx80Sign(z)) {
158142c2
FB
4973 z.low += roundBitsMask;
4974 }
dc355b76
PM
4975 break;
4976 default:
4977 abort();
158142c2
FB
4978 }
4979 z.low &= ~ roundBitsMask;
4980 if ( z.low == 0 ) {
4981 ++z.high;
4982 z.low = LIT64( 0x8000000000000000 );
4983 }
a2f2d288
PM
4984 if (z.low != a.low) {
4985 status->float_exception_flags |= float_flag_inexact;
4986 }
158142c2
FB
4987 return z;
4988
4989}
4990
4991/*----------------------------------------------------------------------------
4992| Returns the result of adding the absolute values of the extended double-
4993| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4994| negated before being returned. `zSign' is ignored if the result is a NaN.
4995| The addition is performed according to the IEC/IEEE Standard for Binary
4996| Floating-Point Arithmetic.
4997*----------------------------------------------------------------------------*/
4998
e5a41ffa
PM
4999static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5000 float_status *status)
158142c2 5001{
f4014512 5002 int32_t aExp, bExp, zExp;
bb98fe42 5003 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 5004 int32_t expDiff;
158142c2
FB
5005
5006 aSig = extractFloatx80Frac( a );
5007 aExp = extractFloatx80Exp( a );
5008 bSig = extractFloatx80Frac( b );
5009 bExp = extractFloatx80Exp( b );
5010 expDiff = aExp - bExp;
5011 if ( 0 < expDiff ) {
5012 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5013 if ((uint64_t)(aSig << 1)) {
5014 return propagateFloatx80NaN(a, b, status);
5015 }
158142c2
FB
5016 return a;
5017 }
5018 if ( bExp == 0 ) --expDiff;
5019 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5020 zExp = aExp;
5021 }
5022 else if ( expDiff < 0 ) {
5023 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5024 if ((uint64_t)(bSig << 1)) {
5025 return propagateFloatx80NaN(a, b, status);
5026 }
158142c2
FB
5027 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5028 }
5029 if ( aExp == 0 ) ++expDiff;
5030 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5031 zExp = bExp;
5032 }
5033 else {
5034 if ( aExp == 0x7FFF ) {
bb98fe42 5035 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5036 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5037 }
5038 return a;
5039 }
5040 zSig1 = 0;
5041 zSig0 = aSig + bSig;
5042 if ( aExp == 0 ) {
5043 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5044 goto roundAndPack;
5045 }
5046 zExp = aExp;
5047 goto shiftRight1;
5048 }
5049 zSig0 = aSig + bSig;
bb98fe42 5050 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
5051 shiftRight1:
5052 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5053 zSig0 |= LIT64( 0x8000000000000000 );
5054 ++zExp;
5055 roundAndPack:
a2f2d288 5056 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5057 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5058}
5059
5060/*----------------------------------------------------------------------------
5061| Returns the result of subtracting the absolute values of the extended
5062| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5063| difference is negated before being returned. `zSign' is ignored if the
5064| result is a NaN. The subtraction is performed according to the IEC/IEEE
5065| Standard for Binary Floating-Point Arithmetic.
5066*----------------------------------------------------------------------------*/
5067
e5a41ffa
PM
5068static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5069 float_status *status)
158142c2 5070{
f4014512 5071 int32_t aExp, bExp, zExp;
bb98fe42 5072 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 5073 int32_t expDiff;
158142c2
FB
5074
5075 aSig = extractFloatx80Frac( a );
5076 aExp = extractFloatx80Exp( a );
5077 bSig = extractFloatx80Frac( b );
5078 bExp = extractFloatx80Exp( b );
5079 expDiff = aExp - bExp;
5080 if ( 0 < expDiff ) goto aExpBigger;
5081 if ( expDiff < 0 ) goto bExpBigger;
5082 if ( aExp == 0x7FFF ) {
bb98fe42 5083 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5084 return propagateFloatx80NaN(a, b, status);
158142c2 5085 }
ff32e16e 5086 float_raise(float_flag_invalid, status);
af39bc8c 5087 return floatx80_default_nan(status);
158142c2
FB
5088 }
5089 if ( aExp == 0 ) {
5090 aExp = 1;
5091 bExp = 1;
5092 }
5093 zSig1 = 0;
5094 if ( bSig < aSig ) goto aBigger;
5095 if ( aSig < bSig ) goto bBigger;
a2f2d288 5096 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
5097 bExpBigger:
5098 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5099 if ((uint64_t)(bSig << 1)) {
5100 return propagateFloatx80NaN(a, b, status);
5101 }
158142c2
FB
5102 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5103 }
5104 if ( aExp == 0 ) ++expDiff;
5105 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5106 bBigger:
5107 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5108 zExp = bExp;
5109 zSign ^= 1;
5110 goto normalizeRoundAndPack;
5111 aExpBigger:
5112 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5113 if ((uint64_t)(aSig << 1)) {
5114 return propagateFloatx80NaN(a, b, status);
5115 }
158142c2
FB
5116 return a;
5117 }
5118 if ( bExp == 0 ) --expDiff;
5119 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5120 aBigger:
5121 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5122 zExp = aExp;
5123 normalizeRoundAndPack:
a2f2d288 5124 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5125 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5126}
5127
5128/*----------------------------------------------------------------------------
5129| Returns the result of adding the extended double-precision floating-point
5130| values `a' and `b'. The operation is performed according to the IEC/IEEE
5131| Standard for Binary Floating-Point Arithmetic.
5132*----------------------------------------------------------------------------*/
5133
e5a41ffa 5134floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5135{
5136 flag aSign, bSign;
5137
d1eb8f2a
AD
5138 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5139 float_raise(float_flag_invalid, status);
5140 return floatx80_default_nan(status);
5141 }
158142c2
FB
5142 aSign = extractFloatx80Sign( a );
5143 bSign = extractFloatx80Sign( b );
5144 if ( aSign == bSign ) {
ff32e16e 5145 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5146 }
5147 else {
ff32e16e 5148 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5149 }
5150
5151}
5152
5153/*----------------------------------------------------------------------------
5154| Returns the result of subtracting the extended double-precision floating-
5155| point values `a' and `b'. The operation is performed according to the
5156| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5157*----------------------------------------------------------------------------*/
5158
e5a41ffa 5159floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5160{
5161 flag aSign, bSign;
5162
d1eb8f2a
AD
5163 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5164 float_raise(float_flag_invalid, status);
5165 return floatx80_default_nan(status);
5166 }
158142c2
FB
5167 aSign = extractFloatx80Sign( a );
5168 bSign = extractFloatx80Sign( b );
5169 if ( aSign == bSign ) {
ff32e16e 5170 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5171 }
5172 else {
ff32e16e 5173 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5174 }
5175
5176}
5177
5178/*----------------------------------------------------------------------------
5179| Returns the result of multiplying the extended double-precision floating-
5180| point values `a' and `b'. The operation is performed according to the
5181| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5182*----------------------------------------------------------------------------*/
5183
e5a41ffa 5184floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5185{
5186 flag aSign, bSign, zSign;
f4014512 5187 int32_t aExp, bExp, zExp;
bb98fe42 5188 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 5189
d1eb8f2a
AD
5190 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5191 float_raise(float_flag_invalid, status);
5192 return floatx80_default_nan(status);
5193 }
158142c2
FB
5194 aSig = extractFloatx80Frac( a );
5195 aExp = extractFloatx80Exp( a );
5196 aSign = extractFloatx80Sign( a );
5197 bSig = extractFloatx80Frac( b );
5198 bExp = extractFloatx80Exp( b );
5199 bSign = extractFloatx80Sign( b );
5200 zSign = aSign ^ bSign;
5201 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5202 if ( (uint64_t) ( aSig<<1 )
5203 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5204 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5205 }
5206 if ( ( bExp | bSig ) == 0 ) goto invalid;
5207 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5208 }
5209 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5210 if ((uint64_t)(bSig << 1)) {
5211 return propagateFloatx80NaN(a, b, status);
5212 }
158142c2
FB
5213 if ( ( aExp | aSig ) == 0 ) {
5214 invalid:
ff32e16e 5215 float_raise(float_flag_invalid, status);
af39bc8c 5216 return floatx80_default_nan(status);
158142c2
FB
5217 }
5218 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5219 }
5220 if ( aExp == 0 ) {
5221 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5222 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5223 }
5224 if ( bExp == 0 ) {
5225 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5226 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5227 }
5228 zExp = aExp + bExp - 0x3FFE;
5229 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 5230 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
5231 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5232 --zExp;
5233 }
a2f2d288 5234 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5235 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5236}
5237
5238/*----------------------------------------------------------------------------
5239| Returns the result of dividing the extended double-precision floating-point
5240| value `a' by the corresponding value `b'. The operation is performed
5241| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5242*----------------------------------------------------------------------------*/
5243
e5a41ffa 5244floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5245{
5246 flag aSign, bSign, zSign;
f4014512 5247 int32_t aExp, bExp, zExp;
bb98fe42
AF
5248 uint64_t aSig, bSig, zSig0, zSig1;
5249 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2 5250
d1eb8f2a
AD
5251 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5252 float_raise(float_flag_invalid, status);
5253 return floatx80_default_nan(status);
5254 }
158142c2
FB
5255 aSig = extractFloatx80Frac( a );
5256 aExp = extractFloatx80Exp( a );
5257 aSign = extractFloatx80Sign( a );
5258 bSig = extractFloatx80Frac( b );
5259 bExp = extractFloatx80Exp( b );
5260 bSign = extractFloatx80Sign( b );
5261 zSign = aSign ^ bSign;
5262 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5263 if ((uint64_t)(aSig << 1)) {
5264 return propagateFloatx80NaN(a, b, status);
5265 }
158142c2 5266 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5267 if ((uint64_t)(bSig << 1)) {
5268 return propagateFloatx80NaN(a, b, status);
5269 }
158142c2
FB
5270 goto invalid;
5271 }
5272 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5273 }
5274 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5275 if ((uint64_t)(bSig << 1)) {
5276 return propagateFloatx80NaN(a, b, status);
5277 }
158142c2
FB
5278 return packFloatx80( zSign, 0, 0 );
5279 }
5280 if ( bExp == 0 ) {
5281 if ( bSig == 0 ) {
5282 if ( ( aExp | aSig ) == 0 ) {
5283 invalid:
ff32e16e 5284 float_raise(float_flag_invalid, status);
af39bc8c 5285 return floatx80_default_nan(status);
158142c2 5286 }
ff32e16e 5287 float_raise(float_flag_divbyzero, status);
158142c2
FB
5288 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5289 }
5290 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5291 }
5292 if ( aExp == 0 ) {
5293 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5294 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5295 }
5296 zExp = aExp - bExp + 0x3FFE;
5297 rem1 = 0;
5298 if ( bSig <= aSig ) {
5299 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5300 ++zExp;
5301 }
5302 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5303 mul64To128( bSig, zSig0, &term0, &term1 );
5304 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 5305 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5306 --zSig0;
5307 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5308 }
5309 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 5310 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
5311 mul64To128( bSig, zSig1, &term1, &term2 );
5312 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 5313 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5314 --zSig1;
5315 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5316 }
5317 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5318 }
a2f2d288 5319 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5320 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5321}
5322
5323/*----------------------------------------------------------------------------
5324| Returns the remainder of the extended double-precision floating-point value
5325| `a' with respect to the corresponding value `b'. The operation is performed
5326| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327*----------------------------------------------------------------------------*/
5328
e5a41ffa 5329floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
158142c2 5330{
ed086f3d 5331 flag aSign, zSign;
f4014512 5332 int32_t aExp, bExp, expDiff;
bb98fe42
AF
5333 uint64_t aSig0, aSig1, bSig;
5334 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2 5335
d1eb8f2a
AD
5336 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5337 float_raise(float_flag_invalid, status);
5338 return floatx80_default_nan(status);
5339 }
158142c2
FB
5340 aSig0 = extractFloatx80Frac( a );
5341 aExp = extractFloatx80Exp( a );
5342 aSign = extractFloatx80Sign( a );
5343 bSig = extractFloatx80Frac( b );
5344 bExp = extractFloatx80Exp( b );
158142c2 5345 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5346 if ( (uint64_t) ( aSig0<<1 )
5347 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5348 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5349 }
5350 goto invalid;
5351 }
5352 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5353 if ((uint64_t)(bSig << 1)) {
5354 return propagateFloatx80NaN(a, b, status);
5355 }
158142c2
FB
5356 return a;
5357 }
5358 if ( bExp == 0 ) {
5359 if ( bSig == 0 ) {
5360 invalid:
ff32e16e 5361 float_raise(float_flag_invalid, status);
af39bc8c 5362 return floatx80_default_nan(status);
158142c2
FB
5363 }
5364 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5365 }
5366 if ( aExp == 0 ) {
bb98fe42 5367 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5368 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5369 }
5370 bSig |= LIT64( 0x8000000000000000 );
5371 zSign = aSign;
5372 expDiff = aExp - bExp;
5373 aSig1 = 0;
5374 if ( expDiff < 0 ) {
5375 if ( expDiff < -1 ) return a;
5376 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5377 expDiff = 0;
5378 }
5379 q = ( bSig <= aSig0 );
5380 if ( q ) aSig0 -= bSig;
5381 expDiff -= 64;
5382 while ( 0 < expDiff ) {
5383 q = estimateDiv128To64( aSig0, aSig1, bSig );
5384 q = ( 2 < q ) ? q - 2 : 0;
5385 mul64To128( bSig, q, &term0, &term1 );
5386 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5387 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5388 expDiff -= 62;
5389 }
5390 expDiff += 64;
5391 if ( 0 < expDiff ) {
5392 q = estimateDiv128To64( aSig0, aSig1, bSig );
5393 q = ( 2 < q ) ? q - 2 : 0;
5394 q >>= 64 - expDiff;
5395 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5396 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5397 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5398 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5399 ++q;
5400 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5401 }
5402 }
5403 else {
5404 term1 = 0;
5405 term0 = bSig;
5406 }
5407 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5408 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5409 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5410 && ( q & 1 ) )
5411 ) {
5412 aSig0 = alternateASig0;
5413 aSig1 = alternateASig1;
5414 zSign = ! zSign;
5415 }
5416 return
5417 normalizeRoundAndPackFloatx80(
ff32e16e 5418 80, zSign, bExp + expDiff, aSig0, aSig1, status);
158142c2
FB
5419
5420}
5421
5422/*----------------------------------------------------------------------------
5423| Returns the square root of the extended double-precision floating-point
5424| value `a'. The operation is performed according to the IEC/IEEE Standard
5425| for Binary Floating-Point Arithmetic.
5426*----------------------------------------------------------------------------*/
5427
e5a41ffa 5428floatx80 floatx80_sqrt(floatx80 a, float_status *status)
158142c2
FB
5429{
5430 flag aSign;
f4014512 5431 int32_t aExp, zExp;
bb98fe42
AF
5432 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5433 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2 5434
d1eb8f2a
AD
5435 if (floatx80_invalid_encoding(a)) {
5436 float_raise(float_flag_invalid, status);
5437 return floatx80_default_nan(status);
5438 }
158142c2
FB
5439 aSig0 = extractFloatx80Frac( a );
5440 aExp = extractFloatx80Exp( a );
5441 aSign = extractFloatx80Sign( a );
5442 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5443 if ((uint64_t)(aSig0 << 1)) {
5444 return propagateFloatx80NaN(a, a, status);
5445 }
158142c2
FB
5446 if ( ! aSign ) return a;
5447 goto invalid;
5448 }
5449 if ( aSign ) {
5450 if ( ( aExp | aSig0 ) == 0 ) return a;
5451 invalid:
ff32e16e 5452 float_raise(float_flag_invalid, status);
af39bc8c 5453 return floatx80_default_nan(status);
158142c2
FB
5454 }
5455 if ( aExp == 0 ) {
5456 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5457 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5458 }
5459 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5460 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5461 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5462 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5463 doubleZSig0 = zSig0<<1;
5464 mul64To128( zSig0, zSig0, &term0, &term1 );
5465 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5466 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5467 --zSig0;
5468 doubleZSig0 -= 2;
5469 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5470 }
5471 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5472 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5473 if ( zSig1 == 0 ) zSig1 = 1;
5474 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5475 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5476 mul64To128( zSig1, zSig1, &term2, &term3 );
5477 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5478 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5479 --zSig1;
5480 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5481 term3 |= 1;
5482 term2 |= doubleZSig0;
5483 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5484 }
5485 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5486 }
5487 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5488 zSig0 |= doubleZSig0;
a2f2d288
PM
5489 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5490 0, zExp, zSig0, zSig1, status);
158142c2
FB
5491}
5492
5493/*----------------------------------------------------------------------------
b689362d
AJ
5494| Returns 1 if the extended double-precision floating-point value `a' is equal
5495| to the corresponding value `b', and 0 otherwise. The invalid exception is
5496| raised if either operand is a NaN. Otherwise, the comparison is performed
5497| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5498*----------------------------------------------------------------------------*/
5499
e5a41ffa 5500int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5501{
5502
d1eb8f2a
AD
5503 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5504 || (extractFloatx80Exp(a) == 0x7FFF
5505 && (uint64_t) (extractFloatx80Frac(a) << 1))
5506 || (extractFloatx80Exp(b) == 0x7FFF
5507 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5508 ) {
ff32e16e 5509 float_raise(float_flag_invalid, status);
158142c2
FB
5510 return 0;
5511 }
5512 return
5513 ( a.low == b.low )
5514 && ( ( a.high == b.high )
5515 || ( ( a.low == 0 )
bb98fe42 5516 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5517 );
5518
5519}
5520
5521/*----------------------------------------------------------------------------
5522| Returns 1 if the extended double-precision floating-point value `a' is
5523| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5524| invalid exception is raised if either operand is a NaN. The comparison is
5525| performed according to the IEC/IEEE Standard for Binary Floating-Point
5526| Arithmetic.
158142c2
FB
5527*----------------------------------------------------------------------------*/
5528
e5a41ffa 5529int floatx80_le(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5530{
5531 flag aSign, bSign;
5532
d1eb8f2a
AD
5533 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5534 || (extractFloatx80Exp(a) == 0x7FFF
5535 && (uint64_t) (extractFloatx80Frac(a) << 1))
5536 || (extractFloatx80Exp(b) == 0x7FFF
5537 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5538 ) {
ff32e16e 5539 float_raise(float_flag_invalid, status);
158142c2
FB
5540 return 0;
5541 }
5542 aSign = extractFloatx80Sign( a );
5543 bSign = extractFloatx80Sign( b );
5544 if ( aSign != bSign ) {
5545 return
5546 aSign
bb98fe42 5547 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5548 == 0 );
5549 }
5550 return
5551 aSign ? le128( b.high, b.low, a.high, a.low )
5552 : le128( a.high, a.low, b.high, b.low );
5553
5554}
5555
5556/*----------------------------------------------------------------------------
5557| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5558| less than the corresponding value `b', and 0 otherwise. The invalid
5559| exception is raised if either operand is a NaN. The comparison is performed
5560| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5561*----------------------------------------------------------------------------*/
5562
e5a41ffa 5563int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5564{
5565 flag aSign, bSign;
5566
d1eb8f2a
AD
5567 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5568 || (extractFloatx80Exp(a) == 0x7FFF
5569 && (uint64_t) (extractFloatx80Frac(a) << 1))
5570 || (extractFloatx80Exp(b) == 0x7FFF
5571 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5572 ) {
ff32e16e 5573 float_raise(float_flag_invalid, status);
158142c2
FB
5574 return 0;
5575 }
5576 aSign = extractFloatx80Sign( a );
5577 bSign = extractFloatx80Sign( b );
5578 if ( aSign != bSign ) {
5579 return
5580 aSign
bb98fe42 5581 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5582 != 0 );
5583 }
5584 return
5585 aSign ? lt128( b.high, b.low, a.high, a.low )
5586 : lt128( a.high, a.low, b.high, b.low );
5587
5588}
5589
67b7861d
AJ
5590/*----------------------------------------------------------------------------
5591| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5592| cannot be compared, and 0 otherwise. The invalid exception is raised if
5593| either operand is a NaN. The comparison is performed according to the
5594| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d 5595*----------------------------------------------------------------------------*/
e5a41ffa 5596int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
67b7861d 5597{
d1eb8f2a
AD
5598 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5599 || (extractFloatx80Exp(a) == 0x7FFF
5600 && (uint64_t) (extractFloatx80Frac(a) << 1))
5601 || (extractFloatx80Exp(b) == 0x7FFF
5602 && (uint64_t) (extractFloatx80Frac(b) << 1))
67b7861d 5603 ) {
ff32e16e 5604 float_raise(float_flag_invalid, status);
67b7861d
AJ
5605 return 1;
5606 }
5607 return 0;
5608}
5609
158142c2 5610/*----------------------------------------------------------------------------
b689362d 5611| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5612| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5613| cause an exception. The comparison is performed according to the IEC/IEEE
5614| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5615*----------------------------------------------------------------------------*/
5616
e5a41ffa 5617int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5618{
5619
d1eb8f2a
AD
5620 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5621 float_raise(float_flag_invalid, status);
5622 return 0;
5623 }
158142c2 5624 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5625 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5626 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5627 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5628 ) {
af39bc8c
AM
5629 if (floatx80_is_signaling_nan(a, status)
5630 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5631 float_raise(float_flag_invalid, status);
b689362d 5632 }
158142c2
FB
5633 return 0;
5634 }
5635 return
5636 ( a.low == b.low )
5637 && ( ( a.high == b.high )
5638 || ( ( a.low == 0 )
bb98fe42 5639 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5640 );
5641
5642}
5643
5644/*----------------------------------------------------------------------------
5645| Returns 1 if the extended double-precision floating-point value `a' is less
5646| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5647| do not cause an exception. Otherwise, the comparison is performed according
5648| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5649*----------------------------------------------------------------------------*/
5650
e5a41ffa 5651int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5652{
5653 flag aSign, bSign;
5654
d1eb8f2a
AD
5655 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5656 float_raise(float_flag_invalid, status);
5657 return 0;
5658 }
158142c2 5659 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5660 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5661 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5662 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5663 ) {
af39bc8c
AM
5664 if (floatx80_is_signaling_nan(a, status)
5665 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5666 float_raise(float_flag_invalid, status);
158142c2
FB
5667 }
5668 return 0;
5669 }
5670 aSign = extractFloatx80Sign( a );
5671 bSign = extractFloatx80Sign( b );
5672 if ( aSign != bSign ) {
5673 return
5674 aSign
bb98fe42 5675 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5676 == 0 );
5677 }
5678 return
5679 aSign ? le128( b.high, b.low, a.high, a.low )
5680 : le128( a.high, a.low, b.high, b.low );
5681
5682}
5683
5684/*----------------------------------------------------------------------------
5685| Returns 1 if the extended double-precision floating-point value `a' is less
5686| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5687| an exception. Otherwise, the comparison is performed according to the
5688| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5689*----------------------------------------------------------------------------*/
5690
e5a41ffa 5691int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5692{
5693 flag aSign, bSign;
5694
d1eb8f2a
AD
5695 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5696 float_raise(float_flag_invalid, status);
5697 return 0;
5698 }
158142c2 5699 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5700 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5701 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5702 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5703 ) {
af39bc8c
AM
5704 if (floatx80_is_signaling_nan(a, status)
5705 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5706 float_raise(float_flag_invalid, status);
158142c2
FB
5707 }
5708 return 0;
5709 }
5710 aSign = extractFloatx80Sign( a );
5711 bSign = extractFloatx80Sign( b );
5712 if ( aSign != bSign ) {
5713 return
5714 aSign
bb98fe42 5715 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5716 != 0 );
5717 }
5718 return
5719 aSign ? lt128( b.high, b.low, a.high, a.low )
5720 : lt128( a.high, a.low, b.high, b.low );
5721
5722}
5723
67b7861d
AJ
5724/*----------------------------------------------------------------------------
5725| Returns 1 if the extended double-precision floating-point values `a' and `b'
5726| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5727| The comparison is performed according to the IEC/IEEE Standard for Binary
5728| Floating-Point Arithmetic.
5729*----------------------------------------------------------------------------*/
e5a41ffa 5730int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
67b7861d 5731{
d1eb8f2a
AD
5732 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5733 float_raise(float_flag_invalid, status);
5734 return 1;
5735 }
67b7861d
AJ
5736 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5737 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5738 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5739 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5740 ) {
af39bc8c
AM
5741 if (floatx80_is_signaling_nan(a, status)
5742 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5743 float_raise(float_flag_invalid, status);
67b7861d
AJ
5744 }
5745 return 1;
5746 }
5747 return 0;
5748}
5749
158142c2
FB
5750/*----------------------------------------------------------------------------
5751| Returns the result of converting the quadruple-precision floating-point
5752| value `a' to the 32-bit two's complement integer format. The conversion
5753| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5754| Arithmetic---which means in particular that the conversion is rounded
5755| according to the current rounding mode. If `a' is a NaN, the largest
5756| positive integer is returned. Otherwise, if the conversion overflows, the
5757| largest integer with the same sign as `a' is returned.
5758*----------------------------------------------------------------------------*/
5759
f4014512 5760int32_t float128_to_int32(float128 a, float_status *status)
158142c2
FB
5761{
5762 flag aSign;
f4014512 5763 int32_t aExp, shiftCount;
bb98fe42 5764 uint64_t aSig0, aSig1;
158142c2
FB
5765
5766 aSig1 = extractFloat128Frac1( a );
5767 aSig0 = extractFloat128Frac0( a );
5768 aExp = extractFloat128Exp( a );
5769 aSign = extractFloat128Sign( a );
5770 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5771 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5772 aSig0 |= ( aSig1 != 0 );
5773 shiftCount = 0x4028 - aExp;
5774 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
ff32e16e 5775 return roundAndPackInt32(aSign, aSig0, status);
158142c2
FB
5776
5777}
5778
5779/*----------------------------------------------------------------------------
5780| Returns the result of converting the quadruple-precision floating-point
5781| value `a' to the 32-bit two's complement integer format. The conversion
5782| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5783| Arithmetic, except that the conversion is always rounded toward zero. If
5784| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5785| conversion overflows, the largest integer with the same sign as `a' is
5786| returned.
5787*----------------------------------------------------------------------------*/
5788
f4014512 5789int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
158142c2
FB
5790{
5791 flag aSign;
f4014512 5792 int32_t aExp, shiftCount;
bb98fe42 5793 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5794 int32_t z;
158142c2
FB
5795
5796 aSig1 = extractFloat128Frac1( a );
5797 aSig0 = extractFloat128Frac0( a );
5798 aExp = extractFloat128Exp( a );
5799 aSign = extractFloat128Sign( a );
5800 aSig0 |= ( aSig1 != 0 );
5801 if ( 0x401E < aExp ) {
5802 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5803 goto invalid;
5804 }
5805 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
5806 if (aExp || aSig0) {
5807 status->float_exception_flags |= float_flag_inexact;
5808 }
158142c2
FB
5809 return 0;
5810 }
5811 aSig0 |= LIT64( 0x0001000000000000 );
5812 shiftCount = 0x402F - aExp;
5813 savedASig = aSig0;
5814 aSig0 >>= shiftCount;
5815 z = aSig0;
5816 if ( aSign ) z = - z;
5817 if ( ( z < 0 ) ^ aSign ) {
5818 invalid:
ff32e16e 5819 float_raise(float_flag_invalid, status);
bb98fe42 5820 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5821 }
5822 if ( ( aSig0<<shiftCount ) != savedASig ) {
a2f2d288 5823 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5824 }
5825 return z;
5826
5827}
5828
5829/*----------------------------------------------------------------------------
5830| Returns the result of converting the quadruple-precision floating-point
5831| value `a' to the 64-bit two's complement integer format. The conversion
5832| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5833| Arithmetic---which means in particular that the conversion is rounded
5834| according to the current rounding mode. If `a' is a NaN, the largest
5835| positive integer is returned. Otherwise, if the conversion overflows, the
5836| largest integer with the same sign as `a' is returned.
5837*----------------------------------------------------------------------------*/
5838
f42c2224 5839int64_t float128_to_int64(float128 a, float_status *status)
158142c2
FB
5840{
5841 flag aSign;
f4014512 5842 int32_t aExp, shiftCount;
bb98fe42 5843 uint64_t aSig0, aSig1;
158142c2
FB
5844
5845 aSig1 = extractFloat128Frac1( a );
5846 aSig0 = extractFloat128Frac0( a );
5847 aExp = extractFloat128Exp( a );
5848 aSign = extractFloat128Sign( a );
5849 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5850 shiftCount = 0x402F - aExp;
5851 if ( shiftCount <= 0 ) {
5852 if ( 0x403E < aExp ) {
ff32e16e 5853 float_raise(float_flag_invalid, status);
158142c2
FB
5854 if ( ! aSign
5855 || ( ( aExp == 0x7FFF )
5856 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5857 )
5858 ) {
5859 return LIT64( 0x7FFFFFFFFFFFFFFF );
5860 }
bb98fe42 5861 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5862 }
5863 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5864 }
5865 else {
5866 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5867 }
ff32e16e 5868 return roundAndPackInt64(aSign, aSig0, aSig1, status);
158142c2
FB
5869
5870}
5871
5872/*----------------------------------------------------------------------------
5873| Returns the result of converting the quadruple-precision floating-point
5874| value `a' to the 64-bit two's complement integer format. The conversion
5875| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5876| Arithmetic, except that the conversion is always rounded toward zero.
5877| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5878| the conversion overflows, the largest integer with the same sign as `a' is
5879| returned.
5880*----------------------------------------------------------------------------*/
5881
f42c2224 5882int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
158142c2
FB
5883{
5884 flag aSign;
f4014512 5885 int32_t aExp, shiftCount;
bb98fe42 5886 uint64_t aSig0, aSig1;
f42c2224 5887 int64_t z;
158142c2
FB
5888
5889 aSig1 = extractFloat128Frac1( a );
5890 aSig0 = extractFloat128Frac0( a );
5891 aExp = extractFloat128Exp( a );
5892 aSign = extractFloat128Sign( a );
5893 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5894 shiftCount = aExp - 0x402F;
5895 if ( 0 < shiftCount ) {
5896 if ( 0x403E <= aExp ) {
5897 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5898 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5899 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
a2f2d288
PM
5900 if (aSig1) {
5901 status->float_exception_flags |= float_flag_inexact;
5902 }
158142c2
FB
5903 }
5904 else {
ff32e16e 5905 float_raise(float_flag_invalid, status);
158142c2
FB
5906 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5907 return LIT64( 0x7FFFFFFFFFFFFFFF );
5908 }
5909 }
bb98fe42 5910 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5911 }
5912 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 5913 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
a2f2d288 5914 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5915 }
5916 }
5917 else {
5918 if ( aExp < 0x3FFF ) {
5919 if ( aExp | aSig0 | aSig1 ) {
a2f2d288 5920 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5921 }
5922 return 0;
5923 }
5924 z = aSig0>>( - shiftCount );
5925 if ( aSig1
bb98fe42 5926 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
a2f2d288 5927 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5928 }
5929 }
5930 if ( aSign ) z = - z;
5931 return z;
5932
5933}
5934
2e6d8568
BR
5935/*----------------------------------------------------------------------------
5936| Returns the result of converting the quadruple-precision floating-point value
5937| `a' to the 64-bit unsigned integer format. The conversion is
5938| performed according to the IEC/IEEE Standard for Binary Floating-Point
5939| Arithmetic---which means in particular that the conversion is rounded
5940| according to the current rounding mode. If `a' is a NaN, the largest
5941| positive integer is returned. If the conversion overflows, the
5942| largest unsigned integer is returned. If 'a' is negative, the value is
5943| rounded and zero is returned; negative values that do not round to zero
5944| will raise the inexact exception.
5945*----------------------------------------------------------------------------*/
5946
5947uint64_t float128_to_uint64(float128 a, float_status *status)
5948{
5949 flag aSign;
5950 int aExp;
5951 int shiftCount;
5952 uint64_t aSig0, aSig1;
5953
5954 aSig0 = extractFloat128Frac0(a);
5955 aSig1 = extractFloat128Frac1(a);
5956 aExp = extractFloat128Exp(a);
5957 aSign = extractFloat128Sign(a);
5958 if (aSign && (aExp > 0x3FFE)) {
5959 float_raise(float_flag_invalid, status);
5960 if (float128_is_any_nan(a)) {
5961 return LIT64(0xFFFFFFFFFFFFFFFF);
5962 } else {
5963 return 0;
5964 }
5965 }
5966 if (aExp) {
5967 aSig0 |= LIT64(0x0001000000000000);
5968 }
5969 shiftCount = 0x402F - aExp;
5970 if (shiftCount <= 0) {
5971 if (0x403E < aExp) {
5972 float_raise(float_flag_invalid, status);
5973 return LIT64(0xFFFFFFFFFFFFFFFF);
5974 }
5975 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5976 } else {
5977 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5978 }
5979 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5980}
5981
5982uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5983{
5984 uint64_t v;
5985 signed char current_rounding_mode = status->float_rounding_mode;
5986
5987 set_float_rounding_mode(float_round_to_zero, status);
5988 v = float128_to_uint64(a, status);
5989 set_float_rounding_mode(current_rounding_mode, status);
5990
5991 return v;
5992}
5993
158142c2
FB
5994/*----------------------------------------------------------------------------
5995| Returns the result of converting the quadruple-precision floating-point
fd425037
BR
5996| value `a' to the 32-bit unsigned integer format. The conversion
5997| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5998| Arithmetic except that the conversion is always rounded toward zero.
5999| If `a' is a NaN, the largest positive integer is returned. Otherwise,
6000| if the conversion overflows, the largest unsigned integer is returned.
6001| If 'a' is negative, the value is rounded and zero is returned; negative
6002| values that do not round to zero will raise the inexact exception.
6003*----------------------------------------------------------------------------*/
6004
6005uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6006{
6007 uint64_t v;
6008 uint32_t res;
6009 int old_exc_flags = get_float_exception_flags(status);
6010
6011 v = float128_to_uint64_round_to_zero(a, status);
6012 if (v > 0xffffffff) {
6013 res = 0xffffffff;
6014 } else {
6015 return v;
6016 }
6017 set_float_exception_flags(old_exc_flags, status);
6018 float_raise(float_flag_invalid, status);
6019 return res;
6020}
6021
6022/*----------------------------------------------------------------------------
6023| Returns the result of converting the quadruple-precision floating-point
158142c2
FB
6024| value `a' to the single-precision floating-point format. The conversion
6025| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6026| Arithmetic.
6027*----------------------------------------------------------------------------*/
6028
e5a41ffa 6029float32 float128_to_float32(float128 a, float_status *status)
158142c2
FB
6030{
6031 flag aSign;
f4014512 6032 int32_t aExp;
bb98fe42
AF
6033 uint64_t aSig0, aSig1;
6034 uint32_t zSig;
158142c2
FB
6035
6036 aSig1 = extractFloat128Frac1( a );
6037 aSig0 = extractFloat128Frac0( a );
6038 aExp = extractFloat128Exp( a );
6039 aSign = extractFloat128Sign( a );
6040 if ( aExp == 0x7FFF ) {
6041 if ( aSig0 | aSig1 ) {
ff32e16e 6042 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
158142c2
FB
6043 }
6044 return packFloat32( aSign, 0xFF, 0 );
6045 }
6046 aSig0 |= ( aSig1 != 0 );
6047 shift64RightJamming( aSig0, 18, &aSig0 );
6048 zSig = aSig0;
6049 if ( aExp || zSig ) {
6050 zSig |= 0x40000000;
6051 aExp -= 0x3F81;
6052 }
ff32e16e 6053 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
6054
6055}
6056
6057/*----------------------------------------------------------------------------
6058| Returns the result of converting the quadruple-precision floating-point
6059| value `a' to the double-precision floating-point format. The conversion
6060| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6061| Arithmetic.
6062*----------------------------------------------------------------------------*/
6063
e5a41ffa 6064float64 float128_to_float64(float128 a, float_status *status)
158142c2
FB
6065{
6066 flag aSign;
f4014512 6067 int32_t aExp;
bb98fe42 6068 uint64_t aSig0, aSig1;
158142c2
FB
6069
6070 aSig1 = extractFloat128Frac1( a );
6071 aSig0 = extractFloat128Frac0( a );
6072 aExp = extractFloat128Exp( a );
6073 aSign = extractFloat128Sign( a );
6074 if ( aExp == 0x7FFF ) {
6075 if ( aSig0 | aSig1 ) {
ff32e16e 6076 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
158142c2
FB
6077 }
6078 return packFloat64( aSign, 0x7FF, 0 );
6079 }
6080 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6081 aSig0 |= ( aSig1 != 0 );
6082 if ( aExp || aSig0 ) {
6083 aSig0 |= LIT64( 0x4000000000000000 );
6084 aExp -= 0x3C01;
6085 }
ff32e16e 6086 return roundAndPackFloat64(aSign, aExp, aSig0, status);
158142c2
FB
6087
6088}
6089
158142c2
FB
6090/*----------------------------------------------------------------------------
6091| Returns the result of converting the quadruple-precision floating-point
6092| value `a' to the extended double-precision floating-point format. The
6093| conversion is performed according to the IEC/IEEE Standard for Binary
6094| Floating-Point Arithmetic.
6095*----------------------------------------------------------------------------*/
6096
e5a41ffa 6097floatx80 float128_to_floatx80(float128 a, float_status *status)
158142c2
FB
6098{
6099 flag aSign;
f4014512 6100 int32_t aExp;
bb98fe42 6101 uint64_t aSig0, aSig1;
158142c2
FB
6102
6103 aSig1 = extractFloat128Frac1( a );
6104 aSig0 = extractFloat128Frac0( a );
6105 aExp = extractFloat128Exp( a );
6106 aSign = extractFloat128Sign( a );
6107 if ( aExp == 0x7FFF ) {
6108 if ( aSig0 | aSig1 ) {
ff32e16e 6109 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
158142c2
FB
6110 }
6111 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6112 }
6113 if ( aExp == 0 ) {
6114 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6115 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6116 }
6117 else {
6118 aSig0 |= LIT64( 0x0001000000000000 );
6119 }
6120 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
ff32e16e 6121 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
158142c2
FB
6122
6123}
6124
158142c2
FB
6125/*----------------------------------------------------------------------------
6126| Rounds the quadruple-precision floating-point value `a' to an integer, and
6127| returns the result as a quadruple-precision floating-point value. The
6128| operation is performed according to the IEC/IEEE Standard for Binary
6129| Floating-Point Arithmetic.
6130*----------------------------------------------------------------------------*/
6131
e5a41ffa 6132float128 float128_round_to_int(float128 a, float_status *status)
158142c2
FB
6133{
/*
 * Rounds `a' to an integral value according to status->float_rounding_mode.
 * Three exponent ranges are handled separately:
 *   - aExp >= 0x402F: the bits being rounded away are in the low word;
 *   - aExp <  0x3FFF: the result is a signed zero or a value packed with
 *     exponent field 0x3FFF and a zero fraction;
 *   - otherwise: the bits being rounded away reach into the high word.
 * float_flag_inexact is set whenever the returned value differs from `a'.
 */
6134 flag aSign;
f4014512 6135 int32_t aExp;
bb98fe42 6136 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
6137 float128 z;
6138
6139 aExp = extractFloat128Exp( a );
6140 if ( 0x402F <= aExp ) {
/* From 0x406F upward `a' is returned unchanged, except that a NaN */
/* (exponent 0x7FFF, non-zero fraction) goes through NaN propagation. */
6141 if ( 0x406F <= aExp ) {
6142 if ( ( aExp == 0x7FFF )
6143 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6144 ) {
ff32e16e 6145 return propagateFloat128NaN(a, a, status);
158142c2
FB
6146 }
6147 return a;
6148 }
/* The shift is split in two so the total can reach 64: when aExp is */
/* 0x402F the first shift is 63 and the second leaves lastBitMask == 0. */
/* roundBitsMask is then all ones, i.e. the whole low word is fraction. */
6149 lastBitMask = 1;
6150 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6151 roundBitsMask = lastBitMask - 1;
6152 z = a;
a2f2d288 6153 switch (status->float_rounding_mode) {
dc355b76 6154 case float_round_nearest_even:
158142c2
FB
/* Add half of the last kept bit; on an exact tie (no rounding bits */
/* left after the add) clear the last kept bit. */
6155 if ( lastBitMask ) {
6156 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6157 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6158 }
6159 else {
/* lastBitMask == 0: the top bit of z.low acts as the half bit and */
/* the last kept bit is bit 0 of z.high. */
bb98fe42 6160 if ( (int64_t) z.low < 0 ) {
158142c2 6161 ++z.high;
bb98fe42 6162 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
6163 }
6164 }
dc355b76 6165 break;
f9288a76
PM
/* Same as nearest-even but without the tie adjustment. */
6166 case float_round_ties_away:
6167 if (lastBitMask) {
6168 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6169 } else {
6170 if ((int64_t) z.low < 0) {
6171 ++z.high;
6172 }
6173 }
6174 break;
dc355b76
PM
/* Toward zero: the masking after the switch is all that is needed. */
6175 case float_round_to_zero:
6176 break;
/* Round up bumps only non-negative inputs; round down only negative ones. */
6177 case float_round_up:
6178 if (!extractFloat128Sign(z)) {
6179 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6180 }
6181 break;
6182 case float_round_down:
6183 if (extractFloat128Sign(z)) {
6184 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 6185 }
dc355b76
PM
6186 break;
6187 default:
6188 abort();
158142c2
FB
6189 }
/* Drop the bits below the last kept bit. */
6190 z.low &= ~ roundBitsMask;
6191 }
6192 else {
6193 if ( aExp < 0x3FFF ) {
/* Everything except the sign bit is zero: return the zero unchanged. */
bb98fe42 6194 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
a2f2d288 6195 status->float_exception_flags |= float_flag_inexact;
158142c2 6196 aSign = extractFloat128Sign( a );
/* This switch has no default: any mode that does not return inside it */
/* (including float_round_to_zero) ends at the signed-zero return below. */
a2f2d288 6197 switch (status->float_rounding_mode) {
158142c2
FB
6198 case float_round_nearest_even:
6199 if ( ( aExp == 0x3FFE )
6200 && ( extractFloat128Frac0( a )
6201 | extractFloat128Frac1( a ) )
6202 ) {
6203 return packFloat128( aSign, 0x3FFF, 0, 0 );
6204 }
6205 break;
f9288a76
PM
6206 case float_round_ties_away:
6207 if (aExp == 0x3FFE) {
6208 return packFloat128(aSign, 0x3FFF, 0, 0);
6209 }
6210 break;
158142c2
FB
6211 case float_round_down:
6212 return
6213 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6214 : packFloat128( 0, 0, 0, 0 );
6215 case float_round_up:
6216 return
6217 aSign ? packFloat128( 1, 0, 0, 0 )
6218 : packFloat128( 0, 0x3FFF, 0, 0 );
6219 }
6220 return packFloat128( aSign, 0, 0, 0 );
6221 }
/* 0x3FFF <= aExp < 0x402F: the last kept bit is in the high word, so */
/* the result's low word is zero and a.low only matters as "non-zero". */
6222 lastBitMask = 1;
6223 lastBitMask <<= 0x402F - aExp;
6224 roundBitsMask = lastBitMask - 1;
6225 z.low = 0;
6226 z.high = a.high;
a2f2d288 6227 switch (status->float_rounding_mode) {
dc355b76 6228 case float_round_nearest_even:
158142c2
FB
/* A tie needs both the high-word rounding bits and a.low to be zero. */
6229 z.high += lastBitMask>>1;
6230 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6231 z.high &= ~ lastBitMask;
6232 }
dc355b76 6233 break;
f9288a76
PM
6234 case float_round_ties_away:
6235 z.high += lastBitMask>>1;
6236 break;
dc355b76
PM
6237 case float_round_to_zero:
6238 break;
/* For directed rounding a non-zero a.low is first folded into bit 0 of */
/* z.high, so that adding roundBitsMask carries into the last kept bit. */
6239 case float_round_up:
6240 if (!extractFloat128Sign(z)) {
158142c2
FB
6241 z.high |= ( a.low != 0 );
6242 z.high += roundBitsMask;
6243 }
dc355b76
PM
6244 break;
6245 case float_round_down:
6246 if (extractFloat128Sign(z)) {
6247 z.high |= (a.low != 0);
6248 z.high += roundBitsMask;
6249 }
6250 break;
6251 default:
6252 abort();
158142c2
FB
6253 }
6254 z.high &= ~ roundBitsMask;
6255 }
/* Any change to the encoding means the input was not already integral. */
6256 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
a2f2d288 6257 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6258 }
6259 return z;
6260
6261}
6262
6263/*----------------------------------------------------------------------------
6264| Returns the result of adding the absolute values of the quadruple-precision
6265| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6266| before being returned. `zSign' is ignored if the result is a NaN.
6267| The addition is performed according to the IEC/IEEE Standard for Binary
6268| Floating-Point Arithmetic.
6269*----------------------------------------------------------------------------*/
6270
e5a41ffa
PM
6271static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6272 float_status *status)
158142c2 6273{
f4014512 6274 int32_t aExp, bExp, zExp;
bb98fe42 6275 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
f4014512 6276 int32_t expDiff;
158142c2
FB
6277
6278 aSig1 = extractFloat128Frac1( a );
6279 aSig0 = extractFloat128Frac0( a );
6280 aExp = extractFloat128Exp( a );
6281 bSig1 = extractFloat128Frac1( b );
6282 bSig0 = extractFloat128Frac0( b );
6283 bExp = extractFloat128Exp( b );
6284 expDiff = aExp - bExp;
6285 if ( 0 < expDiff ) {
6286 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6287 if (aSig0 | aSig1) {
6288 return propagateFloat128NaN(a, b, status);
6289 }
158142c2
FB
6290 return a;
6291 }
6292 if ( bExp == 0 ) {
6293 --expDiff;
6294 }
6295 else {
6296 bSig0 |= LIT64( 0x0001000000000000 );
6297 }
6298 shift128ExtraRightJamming(
6299 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6300 zExp = aExp;
6301 }
6302 else if ( expDiff < 0 ) {
6303 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6304 if (bSig0 | bSig1) {
6305 return propagateFloat128NaN(a, b, status);
6306 }
158142c2
FB
6307 return packFloat128( zSign, 0x7FFF, 0, 0 );
6308 }
6309 if ( aExp == 0 ) {
6310 ++expDiff;
6311 }
6312 else {
6313 aSig0 |= LIT64( 0x0001000000000000 );
6314 }
6315 shift128ExtraRightJamming(
6316 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6317 zExp = bExp;
6318 }
6319 else {
6320 if ( aExp == 0x7FFF ) {
6321 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6322 return propagateFloat128NaN(a, b, status);
158142c2
FB
6323 }
6324 return a;
6325 }
6326 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 6327 if ( aExp == 0 ) {
a2f2d288 6328 if (status->flush_to_zero) {
e6afc87f 6329 if (zSig0 | zSig1) {
ff32e16e 6330 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
6331 }
6332 return packFloat128(zSign, 0, 0, 0);
6333 }
fe76d976
PB
6334 return packFloat128( zSign, 0, zSig0, zSig1 );
6335 }
158142c2
FB
6336 zSig2 = 0;
6337 zSig0 |= LIT64( 0x0002000000000000 );
6338 zExp = aExp;
6339 goto shiftRight1;
6340 }
6341 aSig0 |= LIT64( 0x0001000000000000 );
6342 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6343 --zExp;
6344 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6345 ++zExp;
6346 shiftRight1:
6347 shift128ExtraRightJamming(
6348 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6349 roundAndPack:
ff32e16e 6350 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6351
6352}
6353
6354/*----------------------------------------------------------------------------
6355| Returns the result of subtracting the absolute values of the quadruple-
6356| precision floating-point values `a' and `b'. If `zSign' is 1, the
6357| difference is negated before being returned. `zSign' is ignored if the
6358| result is a NaN. The subtraction is performed according to the IEC/IEEE
6359| Standard for Binary Floating-Point Arithmetic.
6360*----------------------------------------------------------------------------*/
6361
e5a41ffa
PM
6362static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6363 float_status *status)
158142c2 6364{
/*
 * Subtracts the magnitudes of `a' and `b'.  The flow is label-driven:
 * the larger-exponent operand selects aExpBigger or bExpBigger, equal
 * exponents fall through to a significand comparison that selects
 * aBigger or bBigger, and every non-special path ends at
 * normalizeRoundAndPack.
 */
f4014512 6365 int32_t aExp, bExp, zExp;
bb98fe42 6366 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
f4014512 6367 int32_t expDiff;
158142c2
FB
6368
6369 aSig1 = extractFloat128Frac1( a );
6370 aSig0 = extractFloat128Frac0( a );
6371 aExp = extractFloat128Exp( a );
6372 bSig1 = extractFloat128Frac1( b );
6373 bSig0 = extractFloat128Frac0( b );
6374 bExp = extractFloat128Exp( b );
6375 expDiff = aExp - bExp;
/* Both significands are pre-shifted left by 14 bits; the implicit bit */
/* is therefore OR-ed in below as 0x4000000000000000. */
6376 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6377 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6378 if ( 0 < expDiff ) goto aExpBigger;
6379 if ( expDiff < 0 ) goto bExpBigger;
/* Equal exponents from here on.  With exponent 0x7FFF any non-zero */
/* fraction means a NaN operand; otherwise both operands have a zero */
/* fraction and the result is the default NaN with invalid raised. */
6380 if ( aExp == 0x7FFF ) {
6381 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6382 return propagateFloat128NaN(a, b, status);
158142c2 6383 }
ff32e16e 6384 float_raise(float_flag_invalid, status);
af39bc8c 6385 return float128_default_nan(status);
158142c2
FB
6386 }
/* Both exponent fields are zero here; they are set to 1 together. */
6387 if ( aExp == 0 ) {
6388 aExp = 1;
6389 bExp = 1;
6390 }
/* Compare high words first, then low words, to find the larger side. */
6391 if ( bSig0 < aSig0 ) goto aBigger;
6392 if ( aSig0 < bSig0 ) goto bBigger;
6393 if ( bSig1 < aSig1 ) goto aBigger;
6394 if ( aSig1 < bSig1 ) goto bBigger;
a2f2d288
PM
/* Exactly equal magnitudes: the result is a zero whose sign bit is set */
/* only when the rounding mode is float_round_down. */
6395 return packFloat128(status->float_rounding_mode == float_round_down,
6396 0, 0, 0);
158142c2
FB
6397 bExpBigger:
6398 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6399 if (bSig0 | bSig1) {
6400 return propagateFloat128NaN(a, b, status);
6401 }
158142c2
FB
/* b has exponent 0x7FFF and a zero fraction: result takes the */
/* opposite of zSign. */
6402 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6403 }
/* A zero exponent field in `a' adjusts the shift distance by one */
/* instead of adding an implicit bit. */
6404 if ( aExp == 0 ) {
6405 ++expDiff;
6406 }
6407 else {
6408 aSig0 |= LIT64( 0x4000000000000000 );
6409 }
6410 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6411 bSig0 |= LIT64( 0x4000000000000000 );
6412 bBigger:
/* b - a, with the result sign flipped relative to zSign. */
6413 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6414 zExp = bExp;
6415 zSign ^= 1;
6416 goto normalizeRoundAndPack;
6417 aExpBigger:
6418 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6419 if (aSig0 | aSig1) {
6420 return propagateFloat128NaN(a, b, status);
6421 }
158142c2
FB
6422 return a;
6423 }
/* Mirror of the bExpBigger alignment, applied to `b'. */
6424 if ( bExp == 0 ) {
6425 --expDiff;
6426 }
6427 else {
6428 bSig0 |= LIT64( 0x4000000000000000 );
6429 }
6430 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6431 aSig0 |= LIT64( 0x4000000000000000 );
6432 aBigger:
6433 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6434 zExp = aExp;
6435 normalizeRoundAndPack:
/* NOTE(review): the "- 14" presumably undoes the 14-bit pre-shift of */
/* the significands; confirm against normalizeRoundAndPackFloat128. */
6436 --zExp;
ff32e16e
PM
6437 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6438 status);
158142c2
FB
6439
6440}
6441
6442/*----------------------------------------------------------------------------
6443| Returns the result of adding the quadruple-precision floating-point values
6444| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6445| for Binary Floating-Point Arithmetic.
6446*----------------------------------------------------------------------------*/
6447
e5a41ffa 6448float128 float128_add(float128 a, float128 b, float_status *status)
158142c2
FB
6449{
6450 flag aSign, bSign;
6451
6452 aSign = extractFloat128Sign( a );
6453 bSign = extractFloat128Sign( b );
6454 if ( aSign == bSign ) {
ff32e16e 6455 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6456 }
6457 else {
ff32e16e 6458 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6459 }
6460
6461}
6462
6463/*----------------------------------------------------------------------------
6464| Returns the result of subtracting the quadruple-precision floating-point
6465| values `a' and `b'. The operation is performed according to the IEC/IEEE
6466| Standard for Binary Floating-Point Arithmetic.
6467*----------------------------------------------------------------------------*/
6468
e5a41ffa 6469float128 float128_sub(float128 a, float128 b, float_status *status)
158142c2
FB
6470{
6471 flag aSign, bSign;
6472
6473 aSign = extractFloat128Sign( a );
6474 bSign = extractFloat128Sign( b );
6475 if ( aSign == bSign ) {
ff32e16e 6476 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6477 }
6478 else {
ff32e16e 6479 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6480 }
6481
6482}
6483
6484/*----------------------------------------------------------------------------
6485| Returns the result of multiplying the quadruple-precision floating-point
6486| values `a' and `b'. The operation is performed according to the IEC/IEEE
6487| Standard for Binary Floating-Point Arithmetic.
6488*----------------------------------------------------------------------------*/
6489
e5a41ffa 6490float128 float128_mul(float128 a, float128 b, float_status *status)
158142c2
FB
6491{
6492 flag aSign, bSign, zSign;
f4014512 6493 int32_t aExp, bExp, zExp;
bb98fe42 6494 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6495
6496 aSig1 = extractFloat128Frac1( a );
6497 aSig0 = extractFloat128Frac0( a );
6498 aExp = extractFloat128Exp( a );
6499 aSign = extractFloat128Sign( a );
6500 bSig1 = extractFloat128Frac1( b );
6501 bSig0 = extractFloat128Frac0( b );
6502 bExp = extractFloat128Exp( b );
6503 bSign = extractFloat128Sign( b );
6504 zSign = aSign ^ bSign;
6505 if ( aExp == 0x7FFF ) {
6506 if ( ( aSig0 | aSig1 )
6507 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6508 return propagateFloat128NaN(a, b, status);
158142c2
FB
6509 }
6510 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6511 return packFloat128( zSign, 0x7FFF, 0, 0 );
6512 }
6513 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6514 if (bSig0 | bSig1) {
6515 return propagateFloat128NaN(a, b, status);
6516 }
158142c2
FB
6517 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6518 invalid:
ff32e16e 6519 float_raise(float_flag_invalid, status);
af39bc8c 6520 return float128_default_nan(status);
158142c2
FB
6521 }
6522 return packFloat128( zSign, 0x7FFF, 0, 0 );
6523 }
6524 if ( aExp == 0 ) {
6525 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6526 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6527 }
6528 if ( bExp == 0 ) {
6529 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6530 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6531 }
6532 zExp = aExp + bExp - 0x4000;
6533 aSig0 |= LIT64( 0x0001000000000000 );
6534 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6535 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6536 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6537 zSig2 |= ( zSig3 != 0 );
6538 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6539 shift128ExtraRightJamming(
6540 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6541 ++zExp;
6542 }
ff32e16e 6543 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6544
6545}
6546
6547/*----------------------------------------------------------------------------
6548| Returns the result of dividing the quadruple-precision floating-point value
6549| `a' by the corresponding value `b'. The operation is performed according to
6550| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6551*----------------------------------------------------------------------------*/
6552
e5a41ffa 6553float128 float128_div(float128 a, float128 b, float_status *status)
158142c2
FB
6554{
/*
 * Divides `a' by `b'.  Special operands (exponent 0x7FFF, zeros) are
 * handled first; the quotient of the remaining values is then built as
 * two 64-bit estimates (zSig0, zSig1), each corrected downward while its
 * remainder is negative, and finally rounded by roundAndPackFloat128.
 */
6555 flag aSign, bSign, zSign;
f4014512 6556 int32_t aExp, bExp, zExp;
bb98fe42
AF
6557 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6558 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6559
6560 aSig1 = extractFloat128Frac1( a );
6561 aSig0 = extractFloat128Frac0( a );
6562 aExp = extractFloat128Exp( a );
6563 aSign = extractFloat128Sign( a );
6564 bSig1 = extractFloat128Frac1( b );
6565 bSig0 = extractFloat128Frac0( b );
6566 bExp = extractFloat128Exp( b );
6567 bSign = extractFloat128Sign( b );
6568 zSign = aSign ^ bSign;
/* a has exponent 0x7FFF: NaNs propagate; if b also has exponent 0x7FFF */
/* with a zero fraction the operation is invalid; otherwise the result */
/* is a signed infinity. */
6569 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6570 if (aSig0 | aSig1) {
6571 return propagateFloat128NaN(a, b, status);
6572 }
158142c2 6573 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6574 if (bSig0 | bSig1) {
6575 return propagateFloat128NaN(a, b, status);
6576 }
158142c2
FB
6577 goto invalid;
6578 }
6579 return packFloat128( zSign, 0x7FFF, 0, 0 );
6580 }
/* b has exponent 0x7FFF and a does not: a NaN b propagates, otherwise */
/* the result is a signed zero. */
6581 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6582 if (bSig0 | bSig1) {
6583 return propagateFloat128NaN(a, b, status);
6584 }
158142c2
FB
6585 return packFloat128( zSign, 0, 0, 0 );
6586 }
/* b is zero: zero/zero is invalid (default NaN); any other dividend */
/* raises divide-by-zero and returns a signed infinity.  A subnormal b */
/* is normalized instead. */
6587 if ( bExp == 0 ) {
6588 if ( ( bSig0 | bSig1 ) == 0 ) {
6589 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6590 invalid:
ff32e16e 6591 float_raise(float_flag_invalid, status);
af39bc8c 6592 return float128_default_nan(status);
158142c2 6593 }
ff32e16e 6594 float_raise(float_flag_divbyzero, status);
158142c2
FB
6595 return packFloat128( zSign, 0x7FFF, 0, 0 );
6596 }
6597 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6598 }
6599 if ( aExp == 0 ) {
6600 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6601 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6602 }
/* Both significands get their implicit bit and are shifted left by 15. */
6603 zExp = aExp - bExp + 0x3FFD;
6604 shortShift128Left(
6605 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6606 shortShift128Left(
6607 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* If b's significand is <= a's, halve a's significand and bump the */
/* exponent.  NOTE(review): presumably this keeps the dividend below the */
/* divisor as estimateDiv128To64 expects; confirm. */
6608 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6609 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6610 ++zExp;
6611 }
/* First 64 quotient bits: estimate, form the remainder a - b*zSig0, and */
/* step the estimate down while that remainder is negative. */
6612 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6613 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6614 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6615 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6616 --zSig0;
6617 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6618 }
/* Next 64 quotient bits.  The exact correction is only performed when */
/* the low 14 bits of the estimate are <= 4; in that case the lowest bit */
/* also becomes a sticky bit recording a non-zero remainder. */
6619 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6620 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6621 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6622 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6623 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6624 --zSig1;
6625 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6626 }
6627 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6628 }
/* Shift the 128-bit quotient right by 15, collecting the bits shifted */
/* out in zSig2 for rounding. */
6629 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
ff32e16e 6630 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6631
6632}
6633
6634/*----------------------------------------------------------------------------
6635| Returns the remainder of the quadruple-precision floating-point value `a'
6636| with respect to the corresponding value `b'. The operation is performed
6637| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6638*----------------------------------------------------------------------------*/
6639
e5a41ffa 6640float128 float128_rem(float128 a, float128 b, float_status *status)
158142c2 6641{
/*
 * Computes the remainder of `a' with respect to `b'.  After the special
 * cases, a's significand is reduced against b's in chunks driven by
 * expDiff, then a final subtract loop produces the last non-negative
 * and first negative candidates, and the one closer to zero is chosen
 * (ties decided by the parity of the quotient counter q).
 */
ed086f3d 6642 flag aSign, zSign;
f4014512 6643 int32_t aExp, bExp, expDiff;
bb98fe42
AF
6644 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6645 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6646 int64_t sigMean0;
158142c2
FB
6647
6648 aSig1 = extractFloat128Frac1( a );
6649 aSig0 = extractFloat128Frac0( a );
6650 aExp = extractFloat128Exp( a );
6651 aSign = extractFloat128Sign( a );
6652 bSig1 = extractFloat128Frac1( b );
6653 bSig0 = extractFloat128Frac0( b );
6654 bExp = extractFloat128Exp( b );
158142c2
FB
/* a has exponent 0x7FFF: a NaN in either operand propagates, otherwise */
/* the operation is invalid. */
6655 if ( aExp == 0x7FFF ) {
6656 if ( ( aSig0 | aSig1 )
6657 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6658 return propagateFloat128NaN(a, b, status);
158142c2
FB
6659 }
6660 goto invalid;
6661 }
/* b has exponent 0x7FFF: a NaN b propagates, otherwise `a' is returned */
/* unchanged. */
6662 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6663 if (bSig0 | bSig1) {
6664 return propagateFloat128NaN(a, b, status);
6665 }
158142c2
FB
6666 return a;
6667 }
/* A zero b is invalid (default NaN); a subnormal b is normalized. */
6668 if ( bExp == 0 ) {
6669 if ( ( bSig0 | bSig1 ) == 0 ) {
6670 invalid:
ff32e16e 6671 float_raise(float_flag_invalid, status);
af39bc8c 6672 return float128_default_nan(status);
158142c2
FB
6673 }
6674 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6675 }
/* A zero a is returned unchanged; a subnormal a is normalized. */
6676 if ( aExp == 0 ) {
6677 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6678 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6679 }
/* When a's exponent is more than one below b's, `a' is returned as is. */
6680 expDiff = aExp - bExp;
6681 if ( expDiff < -1 ) return a;
/* Implicit bits are added and the significands shifted left; a is */
/* shifted one bit less when expDiff is -1. */
6682 shortShift128Left(
6683 aSig0 | LIT64( 0x0001000000000000 ),
6684 aSig1,
6685 15 - ( expDiff < 0 ),
6686 &aSig0,
6687 &aSig1
6688 );
6689 shortShift128Left(
6690 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* q starts as 1 if b <= a (and b is subtracted once), else 0. */
6691 q = le128( bSig0, bSig1, aSig0, aSig1 );
6692 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
/* Reduce 61 exponent steps at a time, using a quotient estimate that */
/* is lowered by 4 (clamped at 0). */
6693 expDiff -= 64;
6694 while ( 0 < expDiff ) {
6695 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6696 q = ( 4 < q ) ? q - 4 : 0;
6697 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6698 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6699 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6700 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6701 expDiff -= 61;
6702 }
/* Final partial step: the estimate is scaled down by -expDiff bits and */
/* both significands are realigned before the multiply-subtract. */
6703 if ( -64 < expDiff ) {
6704 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6705 q = ( 4 < q ) ? q - 4 : 0;
6706 q >>= - expDiff;
6707 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6708 expDiff += 52;
6709 if ( expDiff < 0 ) {
6710 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6711 }
6712 else {
6713 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6714 }
6715 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6716 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6717 }
6718 else {
6719 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6720 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6721 }
/* Subtract b until the remainder turns negative; `alternate' keeps the */
/* value from just before the last subtraction. */
6722 do {
6723 alternateASig0 = aSig0;
6724 alternateASig1 = aSig1;
6725 ++q;
6726 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6727 } while ( 0 <= (int64_t) aSig0 );
/* sigMean is the sum of the two candidates.  The earlier (alternate) */
/* candidate is chosen when that sum is negative, or when it is exactly */
/* zero and q is odd. */
158142c2 6728 add128(
bb98fe42 6729 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6730 if ( ( sigMean0 < 0 )
6731 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6732 aSig0 = alternateASig0;
6733 aSig1 = alternateASig1;
6734 }
/* A negative remainder is negated and its sign folded into the result. */
bb98fe42 6735 zSign = ( (int64_t) aSig0 < 0 );
158142c2 6736 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
ff32e16e
PM
6737 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6738 status);
158142c2
FB
6739}
6740
6741/*----------------------------------------------------------------------------
6742| Returns the square root of the quadruple-precision floating-point value `a'.
6743| The operation is performed according to the IEC/IEEE Standard for Binary
6744| Floating-Point Arithmetic.
6745*----------------------------------------------------------------------------*/
6746
e5a41ffa 6747float128 float128_sqrt(float128 a, float_status *status)
158142c2
FB
6748{
/*
 * Computes the square root of `a'.  After the special cases, the root is
 * built as two 64-bit parts (zSig0, zSig1): each starts from an estimate
 * and is stepped down while its remainder is negative.  The result is
 * always packed with a zero sign argument.
 */
6749 flag aSign;
f4014512 6750 int32_t aExp, zExp;
bb98fe42
AF
6751 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6752 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6753
6754 aSig1 = extractFloat128Frac1( a );
6755 aSig0 = extractFloat128Frac0( a );
6756 aExp = extractFloat128Exp( a );
6757 aSign = extractFloat128Sign( a );
/* Exponent 0x7FFF: NaNs propagate, a positive input is returned as is, */
/* a negative one is invalid. */
6758 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6759 if (aSig0 | aSig1) {
6760 return propagateFloat128NaN(a, a, status);
6761 }
158142c2
FB
6762 if ( ! aSign ) return a;
6763 goto invalid;
6764 }
/* Negative inputs: a negative zero is returned unchanged, anything else */
/* raises invalid and returns the default NaN. */
6765 if ( aSign ) {
6766 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6767 invalid:
ff32e16e 6768 float_raise(float_flag_invalid, status);
af39bc8c 6769 return float128_default_nan(status);
158142c2
FB
6770 }
/* Positive zero returns zero; a subnormal input is normalized. */
6771 if ( aExp == 0 ) {
6772 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6773 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6774 }
/* The unbiased exponent is halved; the shift applied to the significand */
/* below is 13 or 12 depending on the low bit of aExp. */
6775 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6776 aSig0 |= LIT64( 0x0001000000000000 );
6777 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6778 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6779 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
/* Remainder a - zSig0^2; while it is negative, decrement zSig0 and add */
/* back the corresponding amount. */
6780 doubleZSig0 = zSig0<<1;
6781 mul64To128( zSig0, zSig0, &term0, &term1 );
6782 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6783 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6784 --zSig0;
6785 doubleZSig0 -= 2;
6786 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6787 }
/* Low 64 bits of the root.  The exact correction is only performed when */
/* the low 13 bits of the estimate are <= 5; in that case the lowest bit */
/* also becomes a sticky bit recording a non-zero remainder. */
6788 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6789 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6790 if ( zSig1 == 0 ) zSig1 = 1;
6791 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6792 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6793 mul64To128( zSig1, zSig1, &term2, &term3 );
6794 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6795 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6796 --zSig1;
6797 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6798 term3 |= 1;
6799 term2 |= doubleZSig0;
6800 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6801 }
6802 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6803 }
/* Shift the 128-bit root right by 14, collecting the bits shifted out */
/* in zSig2 for rounding. */
6804 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
ff32e16e 6805 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6806
6807}
6808
6809/*----------------------------------------------------------------------------
6810| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6811| the corresponding value `b', and 0 otherwise. The invalid exception is
6812| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6813| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6814*----------------------------------------------------------------------------*/
6815
e5a41ffa 6816int float128_eq(float128 a, float128 b, float_status *status)
158142c2
FB
6817{
6818
6819 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6820 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6821 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6822 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6823 ) {
ff32e16e 6824 float_raise(float_flag_invalid, status);
158142c2
FB
6825 return 0;
6826 }
6827 return
6828 ( a.low == b.low )
6829 && ( ( a.high == b.high )
6830 || ( ( a.low == 0 )
bb98fe42 6831 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6832 );
6833
6834}
6835
6836/*----------------------------------------------------------------------------
6837| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6838| or equal to the corresponding value `b', and 0 otherwise. The invalid
6839| exception is raised if either operand is a NaN. The comparison is performed
6840| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6841*----------------------------------------------------------------------------*/
6842
e5a41ffa 6843int float128_le(float128 a, float128 b, float_status *status)
158142c2
FB
6844{
6845 flag aSign, bSign;
6846
6847 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6848 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6849 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6850 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6851 ) {
ff32e16e 6852 float_raise(float_flag_invalid, status);
158142c2
FB
6853 return 0;
6854 }
6855 aSign = extractFloat128Sign( a );
6856 bSign = extractFloat128Sign( b );
6857 if ( aSign != bSign ) {
6858 return
6859 aSign
bb98fe42 6860 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6861 == 0 );
6862 }
6863 return
6864 aSign ? le128( b.high, b.low, a.high, a.low )
6865 : le128( a.high, a.low, b.high, b.low );
6866
6867}
6868
6869/*----------------------------------------------------------------------------
6870| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6871| the corresponding value `b', and 0 otherwise. The invalid exception is
6872| raised if either operand is a NaN. The comparison is performed according
6873| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6874*----------------------------------------------------------------------------*/
6875
e5a41ffa 6876int float128_lt(float128 a, float128 b, float_status *status)
158142c2
FB
6877{
6878 flag aSign, bSign;
6879
6880 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6881 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6882 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6883 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6884 ) {
ff32e16e 6885 float_raise(float_flag_invalid, status);
158142c2
FB
6886 return 0;
6887 }
6888 aSign = extractFloat128Sign( a );
6889 bSign = extractFloat128Sign( b );
6890 if ( aSign != bSign ) {
6891 return
6892 aSign
bb98fe42 6893 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6894 != 0 );
6895 }
6896 return
6897 aSign ? lt128( b.high, b.low, a.high, a.low )
6898 : lt128( a.high, a.low, b.high, b.low );
6899
6900}
6901
67b7861d
AJ
6902/*----------------------------------------------------------------------------
6903| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6904| be compared, and 0 otherwise. The invalid exception is raised if either
6905| operand is a NaN. The comparison is performed according to the IEC/IEEE
6906| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6907*----------------------------------------------------------------------------*/
6908
e5a41ffa 6909int float128_unordered(float128 a, float128 b, float_status *status)
67b7861d
AJ
6910{
6911 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6912 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6913 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6914 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6915 ) {
ff32e16e 6916 float_raise(float_flag_invalid, status);
67b7861d
AJ
6917 return 1;
6918 }
6919 return 0;
6920}
6921
158142c2
FB
6922/*----------------------------------------------------------------------------
6923| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6924| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6925| exception. The comparison is performed according to the IEC/IEEE Standard
6926| for Binary Floating-Point Arithmetic.
158142c2
FB
6927*----------------------------------------------------------------------------*/
6928
e5a41ffa 6929int float128_eq_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6930{
6931
6932 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6933 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6934 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6935 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6936 ) {
af39bc8c
AM
6937 if (float128_is_signaling_nan(a, status)
6938 || float128_is_signaling_nan(b, status)) {
ff32e16e 6939 float_raise(float_flag_invalid, status);
b689362d 6940 }
158142c2
FB
6941 return 0;
6942 }
6943 return
6944 ( a.low == b.low )
6945 && ( ( a.high == b.high )
6946 || ( ( a.low == 0 )
bb98fe42 6947 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6948 );
6949
6950}
6951
6952/*----------------------------------------------------------------------------
6953| Returns 1 if the quadruple-precision floating-point value `a' is less than
6954| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6955| cause an exception. Otherwise, the comparison is performed according to the
6956| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6957*----------------------------------------------------------------------------*/
6958
e5a41ffa 6959int float128_le_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6960{
6961 flag aSign, bSign;
6962
6963 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6964 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6965 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6966 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6967 ) {
af39bc8c
AM
6968 if (float128_is_signaling_nan(a, status)
6969 || float128_is_signaling_nan(b, status)) {
ff32e16e 6970 float_raise(float_flag_invalid, status);
158142c2
FB
6971 }
6972 return 0;
6973 }
6974 aSign = extractFloat128Sign( a );
6975 bSign = extractFloat128Sign( b );
6976 if ( aSign != bSign ) {
6977 return
6978 aSign
bb98fe42 6979 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6980 == 0 );
6981 }
6982 return
6983 aSign ? le128( b.high, b.low, a.high, a.low )
6984 : le128( a.high, a.low, b.high, b.low );
6985
6986}
6987
6988/*----------------------------------------------------------------------------
6989| Returns 1 if the quadruple-precision floating-point value `a' is less than
6990| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6991| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6992| Standard for Binary Floating-Point Arithmetic.
6993*----------------------------------------------------------------------------*/
6994
e5a41ffa 6995int float128_lt_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6996{
6997 flag aSign, bSign;
6998
6999 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7000 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7001 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7002 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7003 ) {
af39bc8c
AM
7004 if (float128_is_signaling_nan(a, status)
7005 || float128_is_signaling_nan(b, status)) {
ff32e16e 7006 float_raise(float_flag_invalid, status);
158142c2
FB
7007 }
7008 return 0;
7009 }
7010 aSign = extractFloat128Sign( a );
7011 bSign = extractFloat128Sign( b );
7012 if ( aSign != bSign ) {
7013 return
7014 aSign
bb98fe42 7015 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7016 != 0 );
7017 }
7018 return
7019 aSign ? lt128( b.high, b.low, a.high, a.low )
7020 : lt128( a.high, a.low, b.high, b.low );
7021
7022}
7023
67b7861d
AJ
7024/*----------------------------------------------------------------------------
7025| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7026| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7027| comparison is performed according to the IEC/IEEE Standard for Binary
7028| Floating-Point Arithmetic.
7029*----------------------------------------------------------------------------*/
7030
e5a41ffa 7031int float128_unordered_quiet(float128 a, float128 b, float_status *status)
67b7861d
AJ
7032{
7033 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7034 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7035 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7036 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7037 ) {
af39bc8c
AM
7038 if (float128_is_signaling_nan(a, status)
7039 || float128_is_signaling_nan(b, status)) {
ff32e16e 7040 float_raise(float_flag_invalid, status);
67b7861d
AJ
7041 }
7042 return 1;
7043 }
7044 return 0;
7045}
7046
1d6bda35 7047/* misc functions */
e5a41ffa 7048float32 uint32_to_float32(uint32_t a, float_status *status)
1d6bda35 7049{
ff32e16e 7050 return int64_to_float32(a, status);
1d6bda35
FB
7051}
7052
e5a41ffa 7053float64 uint32_to_float64(uint32_t a, float_status *status)
1d6bda35 7054{
ff32e16e 7055 return int64_to_float64(a, status);
1d6bda35
FB
7056}
7057
3a87d009 7058uint32_t float32_to_uint32(float32 a, float_status *status)
1d6bda35
FB
7059{
7060 int64_t v;
3a87d009 7061 uint32_t res;
34e1c27b 7062 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7063
ff32e16e 7064 v = float32_to_int64(a, status);
1d6bda35
FB
7065 if (v < 0) {
7066 res = 0;
1d6bda35
FB
7067 } else if (v > 0xffffffff) {
7068 res = 0xffffffff;
1d6bda35 7069 } else {
34e1c27b 7070 return v;
1d6bda35 7071 }
34e1c27b 7072 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7073 float_raise(float_flag_invalid, status);
1d6bda35
FB
7074 return res;
7075}
7076
3a87d009 7077uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
1d6bda35
FB
7078{
7079 int64_t v;
3a87d009 7080 uint32_t res;
34e1c27b 7081 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7082
ff32e16e 7083 v = float32_to_int64_round_to_zero(a, status);
1d6bda35
FB
7084 if (v < 0) {
7085 res = 0;
1d6bda35
FB
7086 } else if (v > 0xffffffff) {
7087 res = 0xffffffff;
1d6bda35 7088 } else {
34e1c27b 7089 return v;
1d6bda35 7090 }
34e1c27b 7091 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7092 float_raise(float_flag_invalid, status);
1d6bda35
FB
7093 return res;
7094}
7095
0bb721d7 7096int16_t float32_to_int16(float32 a, float_status *status)
f581bf54
WN
7097{
7098 int32_t v;
0bb721d7 7099 int16_t res;
f581bf54
WN
7100 int old_exc_flags = get_float_exception_flags(status);
7101
ff32e16e 7102 v = float32_to_int32(a, status);
f581bf54
WN
7103 if (v < -0x8000) {
7104 res = -0x8000;
7105 } else if (v > 0x7fff) {
7106 res = 0x7fff;
7107 } else {
7108 return v;
7109 }
7110
7111 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7112 float_raise(float_flag_invalid, status);
f581bf54
WN
7113 return res;
7114}
7115
0bb721d7 7116uint16_t float32_to_uint16(float32 a, float_status *status)
f581bf54
WN
7117{
7118 int32_t v;
0bb721d7 7119 uint16_t res;
f581bf54
WN
7120 int old_exc_flags = get_float_exception_flags(status);
7121
ff32e16e 7122 v = float32_to_int32(a, status);
f581bf54
WN
7123 if (v < 0) {
7124 res = 0;
7125 } else if (v > 0xffff) {
7126 res = 0xffff;
7127 } else {
7128 return v;
7129 }
7130
7131 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7132 float_raise(float_flag_invalid, status);
f581bf54
WN
7133 return res;
7134}
7135
0bb721d7 7136uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
7137{
7138 int64_t v;
0bb721d7 7139 uint16_t res;
34e1c27b 7140 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7141
ff32e16e 7142 v = float32_to_int64_round_to_zero(a, status);
cbcef455
PM
7143 if (v < 0) {
7144 res = 0;
cbcef455
PM
7145 } else if (v > 0xffff) {
7146 res = 0xffff;
cbcef455 7147 } else {
34e1c27b 7148 return v;
cbcef455 7149 }
34e1c27b 7150 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7151 float_raise(float_flag_invalid, status);
cbcef455
PM
7152 return res;
7153}
7154
3a87d009 7155uint32_t float64_to_uint32(float64 a, float_status *status)
1d6bda35 7156{
5e7f654f 7157 uint64_t v;
3a87d009 7158 uint32_t res;
5e7f654f 7159 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7160
ff32e16e 7161 v = float64_to_uint64(a, status);
5e7f654f 7162 if (v > 0xffffffff) {
1d6bda35 7163 res = 0xffffffff;
1d6bda35 7164 } else {
5e7f654f 7165 return v;
1d6bda35 7166 }
5e7f654f 7167 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7168 float_raise(float_flag_invalid, status);
1d6bda35
FB
7169 return res;
7170}
7171
3a87d009 7172uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
1d6bda35 7173{
fd728f2f 7174 uint64_t v;
3a87d009 7175 uint32_t res;
fd728f2f 7176 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7177
ff32e16e 7178 v = float64_to_uint64_round_to_zero(a, status);
fd728f2f 7179 if (v > 0xffffffff) {
1d6bda35 7180 res = 0xffffffff;
1d6bda35 7181 } else {
fd728f2f 7182 return v;
1d6bda35 7183 }
fd728f2f 7184 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7185 float_raise(float_flag_invalid, status);
1d6bda35
FB
7186 return res;
7187}
7188
0bb721d7 7189int16_t float64_to_int16(float64 a, float_status *status)
f581bf54
WN
7190{
7191 int64_t v;
0bb721d7 7192 int16_t res;
f581bf54
WN
7193 int old_exc_flags = get_float_exception_flags(status);
7194
ff32e16e 7195 v = float64_to_int32(a, status);
f581bf54
WN
7196 if (v < -0x8000) {
7197 res = -0x8000;
7198 } else if (v > 0x7fff) {
7199 res = 0x7fff;
7200 } else {
7201 return v;
7202 }
7203
7204 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7205 float_raise(float_flag_invalid, status);
f581bf54
WN
7206 return res;
7207}
7208
0bb721d7 7209uint16_t float64_to_uint16(float64 a, float_status *status)
f581bf54
WN
7210{
7211 int64_t v;
0bb721d7 7212 uint16_t res;
f581bf54
WN
7213 int old_exc_flags = get_float_exception_flags(status);
7214
ff32e16e 7215 v = float64_to_int32(a, status);
f581bf54
WN
7216 if (v < 0) {
7217 res = 0;
7218 } else if (v > 0xffff) {
7219 res = 0xffff;
7220 } else {
7221 return v;
7222 }
7223
7224 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7225 float_raise(float_flag_invalid, status);
f581bf54
WN
7226 return res;
7227}
7228
0bb721d7 7229uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
7230{
7231 int64_t v;
0bb721d7 7232 uint16_t res;
34e1c27b 7233 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7234
ff32e16e 7235 v = float64_to_int64_round_to_zero(a, status);
cbcef455
PM
7236 if (v < 0) {
7237 res = 0;
cbcef455
PM
7238 } else if (v > 0xffff) {
7239 res = 0xffff;
cbcef455 7240 } else {
34e1c27b 7241 return v;
cbcef455 7242 }
34e1c27b 7243 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7244 float_raise(float_flag_invalid, status);
cbcef455
PM
7245 return res;
7246}
7247
fb3ea83a
TM
7248/*----------------------------------------------------------------------------
7249| Returns the result of converting the double-precision floating-point value
7250| `a' to the 64-bit unsigned integer format. The conversion is
7251| performed according to the IEC/IEEE Standard for Binary Floating-Point
7252| Arithmetic---which means in particular that the conversion is rounded
7253| according to the current rounding mode. If `a' is a NaN, the largest
7254| positive integer is returned. If the conversion overflows, the
7255| largest unsigned integer is returned. If 'a' is negative, the value is
7256| rounded and zero is returned; negative values that do not round to zero
7257| will raise the inexact exception.
7258*----------------------------------------------------------------------------*/
75d62a58 7259
e5a41ffa 7260uint64_t float64_to_uint64(float64 a, float_status *status)
fb3ea83a
TM
7261{
7262 flag aSign;
0c48262d 7263 int aExp;
07d792d2 7264 int shiftCount;
fb3ea83a 7265 uint64_t aSig, aSigExtra;
ff32e16e 7266 a = float64_squash_input_denormal(a, status);
75d62a58 7267
fb3ea83a
TM
7268 aSig = extractFloat64Frac(a);
7269 aExp = extractFloat64Exp(a);
7270 aSign = extractFloat64Sign(a);
7271 if (aSign && (aExp > 1022)) {
ff32e16e 7272 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7273 if (float64_is_any_nan(a)) {
7274 return LIT64(0xFFFFFFFFFFFFFFFF);
7275 } else {
7276 return 0;
7277 }
7278 }
7279 if (aExp) {
7280 aSig |= LIT64(0x0010000000000000);
7281 }
7282 shiftCount = 0x433 - aExp;
7283 if (shiftCount <= 0) {
7284 if (0x43E < aExp) {
ff32e16e 7285 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7286 return LIT64(0xFFFFFFFFFFFFFFFF);
7287 }
7288 aSigExtra = 0;
7289 aSig <<= -shiftCount;
7290 } else {
7291 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7292 }
ff32e16e 7293 return roundAndPackUint64(aSign, aSig, aSigExtra, status);
75d62a58
JM
7294}
7295
e5a41ffa 7296uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
75d62a58 7297{
a2f2d288 7298 signed char current_rounding_mode = status->float_rounding_mode;
ff32e16e 7299 set_float_rounding_mode(float_round_to_zero, status);
d000b477 7300 uint64_t v = float64_to_uint64(a, status);
ff32e16e 7301 set_float_rounding_mode(current_rounding_mode, status);
0a87a310 7302 return v;
75d62a58
JM
7303}
7304
1d6bda35 7305#define COMPARE(s, nan_exp) \
e5a41ffa
PM
7306static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7307 int is_quiet, float_status *status) \
1d6bda35
FB
7308{ \
7309 flag aSign, bSign; \
bb98fe42 7310 uint ## s ## _t av, bv; \
ff32e16e
PM
7311 a = float ## s ## _squash_input_denormal(a, status); \
7312 b = float ## s ## _squash_input_denormal(b, status); \
1d6bda35
FB
7313 \
7314 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7315 extractFloat ## s ## Frac( a ) ) || \
7316 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7317 extractFloat ## s ## Frac( b ) )) { \
7318 if (!is_quiet || \
af39bc8c
AM
7319 float ## s ## _is_signaling_nan(a, status) || \
7320 float ## s ## _is_signaling_nan(b, status)) { \
ff32e16e 7321 float_raise(float_flag_invalid, status); \
1d6bda35
FB
7322 } \
7323 return float_relation_unordered; \
7324 } \
7325 aSign = extractFloat ## s ## Sign( a ); \
7326 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 7327 av = float ## s ## _val(a); \
cd8a2533 7328 bv = float ## s ## _val(b); \
1d6bda35 7329 if ( aSign != bSign ) { \
bb98fe42 7330 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
7331 /* zero case */ \
7332 return float_relation_equal; \
7333 } else { \
7334 return 1 - (2 * aSign); \
7335 } \
7336 } else { \
f090c9d4 7337 if (av == bv) { \
1d6bda35
FB
7338 return float_relation_equal; \
7339 } else { \
f090c9d4 7340 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
7341 } \
7342 } \
7343} \
7344 \
e5a41ffa 7345int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
1d6bda35 7346{ \
ff32e16e 7347 return float ## s ## _compare_internal(a, b, 0, status); \
1d6bda35
FB
7348} \
7349 \
e5a41ffa
PM
7350int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7351 float_status *status) \
1d6bda35 7352{ \
ff32e16e 7353 return float ## s ## _compare_internal(a, b, 1, status); \
1d6bda35
FB
7354}
7355
7356COMPARE(32, 0xff)
7357COMPARE(64, 0x7ff)
9ee6e8bb 7358
e5a41ffa
PM
7359static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7360 int is_quiet, float_status *status)
f6714d36
AJ
7361{
7362 flag aSign, bSign;
7363
d1eb8f2a
AD
7364 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7365 float_raise(float_flag_invalid, status);
7366 return float_relation_unordered;
7367 }
f6714d36
AJ
7368 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7369 ( extractFloatx80Frac( a )<<1 ) ) ||
7370 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7371 ( extractFloatx80Frac( b )<<1 ) )) {
7372 if (!is_quiet ||
af39bc8c
AM
7373 floatx80_is_signaling_nan(a, status) ||
7374 floatx80_is_signaling_nan(b, status)) {
ff32e16e 7375 float_raise(float_flag_invalid, status);
f6714d36
AJ
7376 }
7377 return float_relation_unordered;
7378 }
7379 aSign = extractFloatx80Sign( a );
7380 bSign = extractFloatx80Sign( b );
7381 if ( aSign != bSign ) {
7382
7383 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7384 ( ( a.low | b.low ) == 0 ) ) {
7385 /* zero case */
7386 return float_relation_equal;
7387 } else {
7388 return 1 - (2 * aSign);
7389 }
7390 } else {
7391 if (a.low == b.low && a.high == b.high) {
7392 return float_relation_equal;
7393 } else {
7394 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7395 }
7396 }
7397}
7398
e5a41ffa 7399int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
f6714d36 7400{
ff32e16e 7401 return floatx80_compare_internal(a, b, 0, status);
f6714d36
AJ
7402}
7403
e5a41ffa 7404int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
f6714d36 7405{
ff32e16e 7406 return floatx80_compare_internal(a, b, 1, status);
f6714d36
AJ
7407}
7408
e5a41ffa
PM
7409static inline int float128_compare_internal(float128 a, float128 b,
7410 int is_quiet, float_status *status)
1f587329
BS
7411{
7412 flag aSign, bSign;
7413
7414 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7415 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7416 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7417 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7418 if (!is_quiet ||
af39bc8c
AM
7419 float128_is_signaling_nan(a, status) ||
7420 float128_is_signaling_nan(b, status)) {
ff32e16e 7421 float_raise(float_flag_invalid, status);
1f587329
BS
7422 }
7423 return float_relation_unordered;
7424 }
7425 aSign = extractFloat128Sign( a );
7426 bSign = extractFloat128Sign( b );
7427 if ( aSign != bSign ) {
7428 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7429 /* zero case */
7430 return float_relation_equal;
7431 } else {
7432 return 1 - (2 * aSign);
7433 }
7434 } else {
7435 if (a.low == b.low && a.high == b.high) {
7436 return float_relation_equal;
7437 } else {
7438 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7439 }
7440 }
7441}
7442
e5a41ffa 7443int float128_compare(float128 a, float128 b, float_status *status)
1f587329 7444{
ff32e16e 7445 return float128_compare_internal(a, b, 0, status);
1f587329
BS
7446}
7447
e5a41ffa 7448int float128_compare_quiet(float128 a, float128 b, float_status *status)
1f587329 7449{
ff32e16e 7450 return float128_compare_internal(a, b, 1, status);
1f587329
BS
7451}
7452
274f1b04
PM
7453/* min() and max() functions. These can't be implemented as
7454 * 'compare and pick one input' because that would mishandle
7455 * NaNs and +0 vs -0.
e17ab310
WN
7456 *
7457 * minnum() and maxnum() functions. These are similar to the min()
7458 * and max() functions but if one of the arguments is a QNaN and
7459 * the other is numerical then the numerical argument is returned.
7460 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
7461 * and maxNum() operations. min() and max() are the typical min/max
7462 * semantics provided by many CPUs which predate that specification.
2d31e060
LA
7463 *
7464 * minnummag() and maxnummag() functions correspond to minNumMag()
7465 * and maxNumMag() from IEEE 754-2008.
274f1b04 7466 */
e70614ea 7467#define MINMAX(s) \
a49db98d 7468static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
2d31e060 7469 int ismin, int isieee, \
e5a41ffa
PM
7470 int ismag, \
7471 float_status *status) \
274f1b04
PM
7472{ \
7473 flag aSign, bSign; \
2d31e060 7474 uint ## s ## _t av, bv, aav, abv; \
ff32e16e
PM
7475 a = float ## s ## _squash_input_denormal(a, status); \
7476 b = float ## s ## _squash_input_denormal(b, status); \
274f1b04
PM
7477 if (float ## s ## _is_any_nan(a) || \
7478 float ## s ## _is_any_nan(b)) { \
e17ab310 7479 if (isieee) { \
af39bc8c 7480 if (float ## s ## _is_quiet_nan(a, status) && \
e17ab310
WN
7481 !float ## s ##_is_any_nan(b)) { \
7482 return b; \
af39bc8c
AM
7483 } else if (float ## s ## _is_quiet_nan(b, status) && \
7484 !float ## s ## _is_any_nan(a)) { \
e17ab310
WN
7485 return a; \
7486 } \
7487 } \
ff32e16e 7488 return propagateFloat ## s ## NaN(a, b, status); \
274f1b04
PM
7489 } \
7490 aSign = extractFloat ## s ## Sign(a); \
7491 bSign = extractFloat ## s ## Sign(b); \
7492 av = float ## s ## _val(a); \
7493 bv = float ## s ## _val(b); \
2d31e060
LA
7494 if (ismag) { \
7495 aav = float ## s ## _abs(av); \
7496 abv = float ## s ## _abs(bv); \
7497 if (aav != abv) { \
7498 if (ismin) { \
7499 return (aav < abv) ? a : b; \
7500 } else { \
7501 return (aav < abv) ? b : a; \
7502 } \
7503 } \
7504 } \
274f1b04
PM
7505 if (aSign != bSign) { \
7506 if (ismin) { \
7507 return aSign ? a : b; \
7508 } else { \
7509 return aSign ? b : a; \
7510 } \
7511 } else { \
7512 if (ismin) { \
7513 return (aSign ^ (av < bv)) ? a : b; \
7514 } else { \
7515 return (aSign ^ (av < bv)) ? b : a; \
7516 } \
7517 } \
7518} \
7519 \
e5a41ffa
PM
7520float ## s float ## s ## _min(float ## s a, float ## s b, \
7521 float_status *status) \
274f1b04 7522{ \
ff32e16e 7523 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
274f1b04
PM
7524} \
7525 \
e5a41ffa
PM
7526float ## s float ## s ## _max(float ## s a, float ## s b, \
7527 float_status *status) \
274f1b04 7528{ \
ff32e16e 7529 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
e17ab310
WN
7530} \
7531 \
e5a41ffa
PM
7532float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7533 float_status *status) \
e17ab310 7534{ \
ff32e16e 7535 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
e17ab310
WN
7536} \
7537 \
e5a41ffa
PM
7538float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7539 float_status *status) \
e17ab310 7540{ \
ff32e16e 7541 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
2d31e060
LA
7542} \
7543 \
e5a41ffa
PM
7544float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7545 float_status *status) \
2d31e060 7546{ \
ff32e16e 7547 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
2d31e060
LA
7548} \
7549 \
e5a41ffa
PM
7550float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7551 float_status *status) \
2d31e060 7552{ \
ff32e16e 7553 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
274f1b04
PM
7554}
7555
e70614ea
WN
7556MINMAX(32)
7557MINMAX(64)
274f1b04
PM
7558
7559
9ee6e8bb 7560/* Multiply A by 2 raised to the power N. */
e5a41ffa 7561float32 float32_scalbn(float32 a, int n, float_status *status)
9ee6e8bb
PB
7562{
7563 flag aSign;
326b9e98 7564 int16_t aExp;
bb98fe42 7565 uint32_t aSig;
9ee6e8bb 7566
ff32e16e 7567 a = float32_squash_input_denormal(a, status);
9ee6e8bb
PB
7568 aSig = extractFloat32Frac( a );
7569 aExp = extractFloat32Exp( a );
7570 aSign = extractFloat32Sign( a );
7571
7572 if ( aExp == 0xFF ) {
326b9e98 7573 if ( aSig ) {
ff32e16e 7574 return propagateFloat32NaN(a, a, status);
326b9e98 7575 }
9ee6e8bb
PB
7576 return a;
7577 }
3c85c37f 7578 if (aExp != 0) {
69397542 7579 aSig |= 0x00800000;
3c85c37f 7580 } else if (aSig == 0) {
69397542 7581 return a;
3c85c37f
PM
7582 } else {
7583 aExp++;
7584 }
69397542 7585
326b9e98
AJ
7586 if (n > 0x200) {
7587 n = 0x200;
7588 } else if (n < -0x200) {
7589 n = -0x200;
7590 }
7591
69397542
PB
7592 aExp += n - 1;
7593 aSig <<= 7;
ff32e16e 7594 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
9ee6e8bb
PB
7595}
7596
e5a41ffa 7597float64 float64_scalbn(float64 a, int n, float_status *status)
9ee6e8bb
PB
7598{
7599 flag aSign;
326b9e98 7600 int16_t aExp;
bb98fe42 7601 uint64_t aSig;
9ee6e8bb 7602
ff32e16e 7603 a = float64_squash_input_denormal(a, status);
9ee6e8bb
PB
7604 aSig = extractFloat64Frac( a );
7605 aExp = extractFloat64Exp( a );
7606 aSign = extractFloat64Sign( a );
7607
7608 if ( aExp == 0x7FF ) {
326b9e98 7609 if ( aSig ) {
ff32e16e 7610 return propagateFloat64NaN(a, a, status);
326b9e98 7611 }
9ee6e8bb
PB
7612 return a;
7613 }
3c85c37f 7614 if (aExp != 0) {
69397542 7615 aSig |= LIT64( 0x0010000000000000 );
3c85c37f 7616 } else if (aSig == 0) {
69397542 7617 return a;
3c85c37f
PM
7618 } else {
7619 aExp++;
7620 }
69397542 7621
326b9e98
AJ
7622 if (n > 0x1000) {
7623 n = 0x1000;
7624 } else if (n < -0x1000) {
7625 n = -0x1000;
7626 }
7627
69397542
PB
7628 aExp += n - 1;
7629 aSig <<= 10;
ff32e16e 7630 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
9ee6e8bb
PB
7631}
7632
e5a41ffa 7633floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
9ee6e8bb
PB
7634{
7635 flag aSign;
326b9e98 7636 int32_t aExp;
bb98fe42 7637 uint64_t aSig;
9ee6e8bb 7638
d1eb8f2a
AD
7639 if (floatx80_invalid_encoding(a)) {
7640 float_raise(float_flag_invalid, status);
7641 return floatx80_default_nan(status);
7642 }
9ee6e8bb
PB
7643 aSig = extractFloatx80Frac( a );
7644 aExp = extractFloatx80Exp( a );
7645 aSign = extractFloatx80Sign( a );
7646
326b9e98
AJ
7647 if ( aExp == 0x7FFF ) {
7648 if ( aSig<<1 ) {
ff32e16e 7649 return propagateFloatx80NaN(a, a, status);
326b9e98 7650 }
9ee6e8bb
PB
7651 return a;
7652 }
326b9e98 7653
3c85c37f
PM
7654 if (aExp == 0) {
7655 if (aSig == 0) {
7656 return a;
7657 }
7658 aExp++;
7659 }
69397542 7660
326b9e98
AJ
7661 if (n > 0x10000) {
7662 n = 0x10000;
7663 } else if (n < -0x10000) {
7664 n = -0x10000;
7665 }
7666
9ee6e8bb 7667 aExp += n;
a2f2d288
PM
7668 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7669 aSign, aExp, aSig, 0, status);
9ee6e8bb 7670}
9ee6e8bb 7671
e5a41ffa 7672float128 float128_scalbn(float128 a, int n, float_status *status)
9ee6e8bb
PB
7673{
7674 flag aSign;
326b9e98 7675 int32_t aExp;
bb98fe42 7676 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7677
7678 aSig1 = extractFloat128Frac1( a );
7679 aSig0 = extractFloat128Frac0( a );
7680 aExp = extractFloat128Exp( a );
7681 aSign = extractFloat128Sign( a );
7682 if ( aExp == 0x7FFF ) {
326b9e98 7683 if ( aSig0 | aSig1 ) {
ff32e16e 7684 return propagateFloat128NaN(a, a, status);
326b9e98 7685 }
9ee6e8bb
PB
7686 return a;
7687 }
3c85c37f 7688 if (aExp != 0) {
69397542 7689 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7690 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7691 return a;
3c85c37f
PM
7692 } else {
7693 aExp++;
7694 }
69397542 7695
326b9e98
AJ
7696 if (n > 0x10000) {
7697 n = 0x10000;
7698 } else if (n < -0x10000) {
7699 n = -0x10000;
7700 }
7701
69397542
PB
7702 aExp += n - 1;
7703 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
ff32e16e 7704 , status);
9ee6e8bb
PB
7705
7706}