]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
fpu/softfloat: re-factor float to int/uint
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
16017c48
PM
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
8d725fac 16 */
158142c2 17
a7d1ac78
PM
18/*
19===============================================================================
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
158142c2
FB
22
23Written by John R. Hauser. This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704. Funding was partially provided by the
26National Science Foundation under grant MIP-9311980. The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek. More information
a7d1ac78 30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
158142c2
FB
31arithmetic/SoftFloat.html'.
32
a7d1ac78
PM
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
158142c2
FB
38
39Derivative works are acceptable, even for commercial purposes, so long as
a7d1ac78
PM
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
158142c2 43
a7d1ac78
PM
44===============================================================================
45*/
158142c2 46
16017c48
PM
47/* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78/* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
2ac8bd03
PM
82/* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
d38ea87a 85#include "qemu/osdep.h"
6fff2167 86#include "qemu/bitops.h"
6b4c305c 87#include "fpu/softfloat.h"
158142c2 88
dc355b76 89/* We only need stdlib for abort() */
dc355b76 90
158142c2
FB
91/*----------------------------------------------------------------------------
92| Primitive arithmetic functions, including multi-word arithmetic, and
93| division and square root approximations. (Can be specialized to target if
94| desired.)
95*----------------------------------------------------------------------------*/
96#include "softfloat-macros.h"
97
98/*----------------------------------------------------------------------------
99| Functions and definitions to determine: (1) whether tininess for underflow
100| is detected before or after rounding by default, (2) what (if anything)
101| happens when exceptions are raised, (3) how signaling NaNs are distinguished
102| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103| are propagated from function inputs to output. These details are target-
104| specific.
105*----------------------------------------------------------------------------*/
106#include "softfloat-specialize.h"
107
bb4d4bb3
PM
108/*----------------------------------------------------------------------------
109| Returns the fraction bits of the half-precision floating-point value `a'.
110*----------------------------------------------------------------------------*/
111
a49db98d 112static inline uint32_t extractFloat16Frac(float16 a)
bb4d4bb3
PM
113{
114 return float16_val(a) & 0x3ff;
115}
116
117/*----------------------------------------------------------------------------
118| Returns the exponent bits of the half-precision floating-point value `a'.
119*----------------------------------------------------------------------------*/
120
0c48262d 121static inline int extractFloat16Exp(float16 a)
bb4d4bb3
PM
122{
123 return (float16_val(a) >> 10) & 0x1f;
124}
125
126/*----------------------------------------------------------------------------
127| Returns the sign bit of the half-precision floating-point value `a'.
128*----------------------------------------------------------------------------*/
129
a49db98d 130static inline flag extractFloat16Sign(float16 a)
bb4d4bb3
PM
131{
132 return float16_val(a)>>15;
133}
134
d97544c9
AB
135/*----------------------------------------------------------------------------
136| Returns the fraction bits of the single-precision floating-point value `a'.
137*----------------------------------------------------------------------------*/
138
139static inline uint32_t extractFloat32Frac(float32 a)
140{
141 return float32_val(a) & 0x007FFFFF;
142}
143
144/*----------------------------------------------------------------------------
145| Returns the exponent bits of the single-precision floating-point value `a'.
146*----------------------------------------------------------------------------*/
147
148static inline int extractFloat32Exp(float32 a)
149{
150 return (float32_val(a) >> 23) & 0xFF;
151}
152
153/*----------------------------------------------------------------------------
154| Returns the sign bit of the single-precision floating-point value `a'.
155*----------------------------------------------------------------------------*/
156
157static inline flag extractFloat32Sign(float32 a)
158{
159 return float32_val(a) >> 31;
160}
161
162/*----------------------------------------------------------------------------
163| Returns the fraction bits of the double-precision floating-point value `a'.
164*----------------------------------------------------------------------------*/
165
166static inline uint64_t extractFloat64Frac(float64 a)
167{
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169}
170
171/*----------------------------------------------------------------------------
172| Returns the exponent bits of the double-precision floating-point value `a'.
173*----------------------------------------------------------------------------*/
174
175static inline int extractFloat64Exp(float64 a)
176{
177 return (float64_val(a) >> 52) & 0x7FF;
178}
179
180/*----------------------------------------------------------------------------
181| Returns the sign bit of the double-precision floating-point value `a'.
182*----------------------------------------------------------------------------*/
183
184static inline flag extractFloat64Sign(float64 a)
185{
186 return float64_val(a) >> 63;
187}
188
a90119b5
AB
189/*
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
192 */
193
typedef enum __attribute__ ((__packed__)) {
    /* Set by unpack_raw(); replaced once the parts are canonicalized. */
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    /*
     * Every enumerator from here on is some kind of NaN; is_nan()
     * depends on this ordering.
     */
    float_class_qnan,
    float_class_snan,
    /* Packed as the default NaN of the destination format. */
    float_class_dnan,
    /* Packed through the format's *_maybe_silence_nan() helper. */
    float_class_msnan,
} FloatClass;
204
205/*
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
210 *
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
214 */
215
216typedef struct {
217 uint64_t frac;
218 int32_t exp;
219 FloatClass cls;
220 bool sign;
221} FloatParts;
222
/* Bit index of the binary point inside the 64-bit decomposed fraction. */
#define DECOMPOSED_BINARY_POINT  (64 - 2)
/* The implicit leading one of a normal number sits at the binary point. */
#define DECOMPOSED_IMPLICIT_BIT  (1ull << DECOMPOSED_BINARY_POINT)
/* One position higher: set when an operation carries out of the implicit bit. */
#define DECOMPOSED_OVERFLOW_BIT  (1ull << (DECOMPOSED_BINARY_POINT + 1))
226
227/* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based on the size of the fraction
234 * frac_lsb: least significant bit of fraction
235 * frac_lsbm1: the bit below the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
237 */
typedef struct {
    /* Geometry of the encoded format. */
    int exp_size;             /* width of the exponent field in bits */
    int exp_bias;             /* bias applied to the stored exponent */
    int exp_max;              /* all-ones exponent field value */
    int frac_size;            /* width of the fraction field in bits */
    int frac_shift;           /* left shift that aligns the fraction field
                                 with DECOMPOSED_BINARY_POINT */
    /* Rounding helpers, all expressed in decomposed-fraction bit positions. */
    uint64_t frac_lsb;        /* lowest bit that survives rounding */
    uint64_t frac_lsbm1;      /* the bit just below frac_lsb */
    uint64_t round_mask;      /* every bit below frac_lsb */
    uint64_t roundeven_mask;  /* round_mask plus frac_lsb itself */
} FloatFmt;
249
/*
 * Expand to the designated initializers of a FloatFmt, given the width
 * of the exponent field (E) and of the fraction field (F), both in bits.
 *
 * The arguments are parenthesised at every use so that expressions such
 * as FLOAT_PARAMS(8, 20 + 3) expand with the intended precedence.
 */
#define FLOAT_PARAMS(E, F)                                            \
    .exp_size       = (E),                                            \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                          \
    .exp_max        = (1 << (E)) - 1,                                 \
    .frac_size      = (F),                                            \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                  \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),        \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),  \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,  \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
261
262static const FloatFmt float16_params = {
263 FLOAT_PARAMS(5, 10)
264};
265
266static const FloatFmt float32_params = {
267 FLOAT_PARAMS(8, 23)
268};
269
270static const FloatFmt float64_params = {
271 FLOAT_PARAMS(11, 52)
272};
273
6fff2167
AB
274/* Unpack a float to parts, but do not canonicalize. */
275static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276{
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
278
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
284 };
285}
286
287static inline FloatParts float16_unpack_raw(float16 f)
288{
289 return unpack_raw(float16_params, f);
290}
291
292static inline FloatParts float32_unpack_raw(float32 f)
293{
294 return unpack_raw(float32_params, f);
295}
296
297static inline FloatParts float64_unpack_raw(float64 f)
298{
299 return unpack_raw(float64_params, f);
300}
301
302/* Pack a float from parts, but do not canonicalize. */
303static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304{
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
308}
309
310static inline float16 float16_pack_raw(FloatParts p)
311{
312 return make_float16(pack_raw(float16_params, p));
313}
314
315static inline float32 float32_pack_raw(FloatParts p)
316{
317 return make_float32(pack_raw(float32_params, p));
318}
319
320static inline float64 float64_pack_raw(FloatParts p)
321{
322 return make_float64(pack_raw(float64_params, p));
323}
324
325/* Canonicalize EXP and FRAC, setting CLS. */
326static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
328{
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
332 } else {
333#ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
335#else
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
339 } else {
340 part.cls = float_class_qnan;
341 }
342#endif
343 }
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
350 part.frac = 0;
351 } else {
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355 part.frac <<= shift;
356 }
357 } else {
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361 }
362 return part;
363}
364
365/* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369 */
370
371static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
373{
/*
 * p:    canonical parts to round; p.cls selects the handling below
 * s:    supplies float_rounding_mode, flush_to_zero and
 *       float_detect_tininess; receives the accumulated flags through
 *       a single float_raise() call at the end
 * parm: description of the destination format
 *
 * Only p.exp, p.frac and (on overflow, or when the result becomes
 * zero) p.cls are modified in the returned value.
 */
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
379 uint64_t frac, inc;
380 int exp, flags = 0;
381 bool overflow_norm;
382
383 frac = p.frac;
384 exp = p.exp;
385
386 switch (p.cls) {
387 case float_class_normal:
/*
 * Choose INC, the value added to the fraction before the low bits
 * are dropped, and OVERFLOW_NORM, which tells the overflow path
 * below whether to saturate to the largest finite value (true) or
 * to produce infinity (false).
 */
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
/* An exact tie whose kept LSB is already even gets no increment. */
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392 break;
393 case float_round_ties_away:
394 overflow_norm = false;
395 inc = frac_lsbm1;
396 break;
397 case float_round_to_zero:
398 overflow_norm = true;
399 inc = 0;
400 break;
401 case float_round_up:
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
404 break;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
408 break;
409 default:
410 g_assert_not_reached();
411 }
412
/* From here on EXP is the biased exponent. */
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
/* Positive biased exponent: a normal result, unless it overflows. */
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
417 frac += inc;
/* Rounding carried out of the implicit bit: renormalize. */
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419 frac >>= 1;
420 exp++;
421 }
422 }
/* Drop the bits below the destination's least significant bit. */
423 frac >>= frac_shift;
424
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
427 if (overflow_norm) {
/* Saturate: largest exponent below exp_max, all-ones fraction. */
428 exp = exp_max - 1;
429 frac = -1;
430 } else {
431 p.cls = float_class_inf;
432 goto do_inf;
433 }
434 }
435 } else if (s->flush_to_zero) {
/* Result would be denormal and the status asks for a flush to zero. */
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
438 goto do_zero;
439 } else {
/*
 * Denormal result.  It counts as tiny when tininess is detected
 * before rounding, when the biased exponent is negative, or when
 * adding INC would not carry into the overflow bit.
 */
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
442 || (exp < 0)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444
/*
 * Shift the fraction right by 1 - exp places.
 * NOTE(review): shift64RightJamming is defined outside this chunk;
 * going by its name it presumably folds the shifted-out bits into
 * the least significant bit -- confirm in softfloat-macros.h.
 */
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
450 ? frac_lsbm1 : 0);
451 }
452 flags |= float_flag_inexact;
453 frac += inc;
454 }
455
/*
 * If rounding carried into the implicit-bit position the exponent
 * field becomes 1, otherwise it stays 0.
 */
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457 frac >>= frac_shift;
458
/* Underflow is only flagged when the tiny result is also inexact. */
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
461 }
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
464 }
465 }
466 break;
467
468 case float_class_zero:
/* Also entered from the flush-to-zero path above. */
469 do_zero:
470 exp = 0;
471 frac = 0;
472 break;
473
474 case float_class_inf:
/* Also entered from the overflow path above. */
475 do_inf:
476 exp = exp_max;
477 frac = 0;
478 break;
479
480 case float_class_qnan:
481 case float_class_snan:
/* NaNs keep their fraction; only the exponent field is forced. */
482 exp = exp_max;
483 break;
484
485 default:
486 g_assert_not_reached();
487 }
488
489 float_raise(flags, s);
490 p.exp = exp;
491 p.frac = frac;
492 return p;
493}
494
495static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496{
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
498}
499
500static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501{
502 switch (p.cls) {
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
507 default:
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
510 }
511}
512
513static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514{
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
516}
517
518static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519{
520 switch (p.cls) {
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
525 default:
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
528 }
529}
530
531static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532{
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
534}
535
536static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537{
538 switch (p.cls) {
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
543 default:
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
546 }
547}
548
549/* Simple helpers for checking which kind of NaN, if any, we have */
550static bool is_nan(FloatClass c)
551{
552 return unlikely(c >= float_class_qnan);
553}
554static bool is_snan(FloatClass c)
555{
556 return c == float_class_snan;
557}
558static bool is_qnan(FloatClass c)
559{
560 return c == float_class_qnan;
561}
562
dbe4d53a
AB
563static FloatParts return_nan(FloatParts a, float_status *s)
564{
565 switch (a.cls) {
566 case float_class_snan:
567 s->float_exception_flags |= float_flag_invalid;
568 a.cls = float_class_msnan;
569 /* fall through */
570 case float_class_qnan:
571 if (s->default_nan_mode) {
572 a.cls = float_class_dnan;
573 }
574 break;
575
576 default:
577 g_assert_not_reached();
578 }
579 return a;
580}
581
6fff2167
AB
582static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
583{
584 if (is_snan(a.cls) || is_snan(b.cls)) {
585 s->float_exception_flags |= float_flag_invalid;
586 }
587
588 if (s->default_nan_mode) {
589 a.cls = float_class_dnan;
590 } else {
591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592 is_qnan(b.cls), is_snan(b.cls),
593 a.frac > b.frac ||
594 (a.frac == b.frac && a.sign < b.sign))) {
595 a = b;
596 }
597 a.cls = float_class_msnan;
598 }
599 return a;
600}
601
d446830a
AB
602static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603 bool inf_zero, float_status *s)
604{
605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606 s->float_exception_flags |= float_flag_invalid;
607 }
608
609 if (s->default_nan_mode) {
610 a.cls = float_class_dnan;
611 } else {
612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613 is_qnan(b.cls), is_snan(b.cls),
614 is_qnan(c.cls), is_snan(c.cls),
615 inf_zero, s)) {
616 case 0:
617 break;
618 case 1:
619 a = b;
620 break;
621 case 2:
622 a = c;
623 break;
624 case 3:
625 a.cls = float_class_dnan;
626 return a;
627 default:
628 g_assert_not_reached();
629 }
630
631 a.cls = float_class_msnan;
632 }
633 return a;
634}
635
6fff2167
AB
636/*
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
640 * Arithmetic.
641 */
642
643static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644 float_status *s)
645{
646 bool a_sign = a.sign;
647 bool b_sign = b.sign ^ subtract;
648
649 if (a_sign != b_sign) {
650 /* Subtraction */
651
652 if (a.cls == float_class_normal && b.cls == float_class_normal) {
653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655 a.frac = a.frac - b.frac;
656 } else {
657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658 a.frac = b.frac - a.frac;
659 a.exp = b.exp;
660 a_sign ^= 1;
661 }
662
663 if (a.frac == 0) {
664 a.cls = float_class_zero;
665 a.sign = s->float_rounding_mode == float_round_down;
666 } else {
667 int shift = clz64(a.frac) - 1;
668 a.frac = a.frac << shift;
669 a.exp = a.exp - shift;
670 a.sign = a_sign;
671 }
672 return a;
673 }
674 if (is_nan(a.cls) || is_nan(b.cls)) {
675 return pick_nan(a, b, s);
676 }
677 if (a.cls == float_class_inf) {
678 if (b.cls == float_class_inf) {
679 float_raise(float_flag_invalid, s);
680 a.cls = float_class_dnan;
681 }
682 return a;
683 }
684 if (a.cls == float_class_zero && b.cls == float_class_zero) {
685 a.sign = s->float_rounding_mode == float_round_down;
686 return a;
687 }
688 if (a.cls == float_class_zero || b.cls == float_class_inf) {
689 b.sign = a_sign ^ 1;
690 return b;
691 }
692 if (b.cls == float_class_zero) {
693 return a;
694 }
695 } else {
696 /* Addition */
697 if (a.cls == float_class_normal && b.cls == float_class_normal) {
698 if (a.exp > b.exp) {
699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700 } else if (a.exp < b.exp) {
701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702 a.exp = b.exp;
703 }
704 a.frac += b.frac;
705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706 a.frac >>= 1;
707 a.exp += 1;
708 }
709 return a;
710 }
711 if (is_nan(a.cls) || is_nan(b.cls)) {
712 return pick_nan(a, b, s);
713 }
714 if (a.cls == float_class_inf || b.cls == float_class_zero) {
715 return a;
716 }
717 if (b.cls == float_class_inf || a.cls == float_class_zero) {
718 b.sign = b_sign;
719 return b;
720 }
721 }
722 g_assert_not_reached();
723}
724
725/*
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729 */
730
731float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
732 float_status *status)
733{
734 FloatParts pa = float16_unpack_canonical(a, status);
735 FloatParts pb = float16_unpack_canonical(b, status);
736 FloatParts pr = addsub_floats(pa, pb, false, status);
737
738 return float16_round_pack_canonical(pr, status);
739}
740
741float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742 float_status *status)
743{
744 FloatParts pa = float32_unpack_canonical(a, status);
745 FloatParts pb = float32_unpack_canonical(b, status);
746 FloatParts pr = addsub_floats(pa, pb, false, status);
747
748 return float32_round_pack_canonical(pr, status);
749}
750
751float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752 float_status *status)
753{
754 FloatParts pa = float64_unpack_canonical(a, status);
755 FloatParts pb = float64_unpack_canonical(b, status);
756 FloatParts pr = addsub_floats(pa, pb, false, status);
757
758 return float64_round_pack_canonical(pr, status);
759}
760
761float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762 float_status *status)
763{
764 FloatParts pa = float16_unpack_canonical(a, status);
765 FloatParts pb = float16_unpack_canonical(b, status);
766 FloatParts pr = addsub_floats(pa, pb, true, status);
767
768 return float16_round_pack_canonical(pr, status);
769}
770
771float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772 float_status *status)
773{
774 FloatParts pa = float32_unpack_canonical(a, status);
775 FloatParts pb = float32_unpack_canonical(b, status);
776 FloatParts pr = addsub_floats(pa, pb, true, status);
777
778 return float32_round_pack_canonical(pr, status);
779}
780
781float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782 float_status *status)
783{
784 FloatParts pa = float64_unpack_canonical(a, status);
785 FloatParts pb = float64_unpack_canonical(b, status);
786 FloatParts pr = addsub_floats(pa, pb, true, status);
787
788 return float64_round_pack_canonical(pr, status);
789}
790
74d707e2
AB
791/*
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
795 */
796
797static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
798{
799 bool sign = a.sign ^ b.sign;
800
801 if (a.cls == float_class_normal && b.cls == float_class_normal) {
802 uint64_t hi, lo;
803 int exp = a.exp + b.exp;
804
805 mul64To128(a.frac, b.frac, &hi, &lo);
806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807 if (lo & DECOMPOSED_OVERFLOW_BIT) {
808 shift64RightJamming(lo, 1, &lo);
809 exp += 1;
810 }
811
812 /* Re-use a */
813 a.exp = exp;
814 a.sign = sign;
815 a.frac = lo;
816 return a;
817 }
818 /* handle all the NaN cases */
819 if (is_nan(a.cls) || is_nan(b.cls)) {
820 return pick_nan(a, b, s);
821 }
822 /* Inf * Zero == NaN */
823 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824 (a.cls == float_class_zero && b.cls == float_class_inf)) {
825 s->float_exception_flags |= float_flag_invalid;
826 a.cls = float_class_dnan;
827 a.sign = sign;
828 return a;
829 }
830 /* Multiply by 0 or Inf */
831 if (a.cls == float_class_inf || a.cls == float_class_zero) {
832 a.sign = sign;
833 return a;
834 }
835 if (b.cls == float_class_inf || b.cls == float_class_zero) {
836 b.sign = sign;
837 return b;
838 }
839 g_assert_not_reached();
840}
841
842float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843 float_status *status)
844{
845 FloatParts pa = float16_unpack_canonical(a, status);
846 FloatParts pb = float16_unpack_canonical(b, status);
847 FloatParts pr = mul_floats(pa, pb, status);
848
849 return float16_round_pack_canonical(pr, status);
850}
851
852float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853 float_status *status)
854{
855 FloatParts pa = float32_unpack_canonical(a, status);
856 FloatParts pb = float32_unpack_canonical(b, status);
857 FloatParts pr = mul_floats(pa, pb, status);
858
859 return float32_round_pack_canonical(pr, status);
860}
861
862float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863 float_status *status)
864{
865 FloatParts pa = float64_unpack_canonical(a, status);
866 FloatParts pb = float64_unpack_canonical(b, status);
867 FloatParts pr = mul_floats(pa, pb, status);
868
869 return float64_round_pack_canonical(pr, status);
870}
871
d446830a
AB
872/*
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
881 * NaNs.)
882 */
883
884static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885 int flags, float_status *s)
886{
/*
 * a, b:  canonical multiplicands
 * c:     canonical addend
 * flags: mask of float_muladd_negate_c, float_muladd_negate_product,
 *        float_muladd_negate_result and float_muladd_halve_result
 * s:     supplies the rounding mode (sign of an exact zero) and
 *        receives float_flag_invalid
 *
 * The result is returned unrounded, in a's storage or, when the
 * product is zero, in c's.
 */
/* True iff one of a, b is an infinity and the other one a zero. */
887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888 ((1 << float_class_inf) | (1 << float_class_zero));
889 bool p_sign;
890 bool sign_flip = flags & float_muladd_negate_result;
891 FloatClass p_class;
892 uint64_t hi, lo;
893 int p_exp;
894
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
899 */
900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901 return pick_nan_muladd(a, b, c, inf_zero, s);
902 }
903
/* inf * 0 with no NaN operand: invalid, result is the default NaN. */
904 if (inf_zero) {
905 s->float_exception_flags |= float_flag_invalid;
906 a.cls = float_class_dnan;
907 return a;
908 }
909
910 if (flags & float_muladd_negate_c) {
911 c.sign ^= 1;
912 }
913
914 p_sign = a.sign ^ b.sign;
915
916 if (flags & float_muladd_negate_product) {
917 p_sign ^= 1;
918 }
919
/* Classify the product before doing any arithmetic on it. */
920 if (a.cls == float_class_inf || b.cls == float_class_inf) {
921 p_class = float_class_inf;
922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923 p_class = float_class_zero;
924 } else {
925 p_class = float_class_normal;
926 }
927
/* Infinite addend: inf + (-inf) is invalid, otherwise the addend wins. */
928 if (c.cls == float_class_inf) {
929 if (p_class == float_class_inf && p_sign != c.sign) {
930 s->float_exception_flags |= float_flag_invalid;
931 a.cls = float_class_dnan;
932 } else {
933 a.cls = float_class_inf;
934 a.sign = c.sign ^ sign_flip;
935 }
936 return a;
937 }
938
939 if (p_class == float_class_inf) {
940 a.cls = float_class_inf;
941 a.sign = p_sign ^ sign_flip;
942 return a;
943 }
944
/*
 * Zero product: the result is c.  For 0 + 0 with differing signs the
 * sign comes from the rounding mode; halving only applies to a
 * non-zero c.
 */
945 if (p_class == float_class_zero) {
946 if (c.cls == float_class_zero) {
947 if (p_sign != c.sign) {
948 p_sign = s->float_rounding_mode == float_round_down;
949 }
950 c.sign = p_sign;
951 } else if (flags & float_muladd_halve_result) {
952 c.exp -= 1;
953 }
954 c.sign ^= sign_flip;
955 return c;
956 }
957
958 /* a & b should be normals now... */
959 assert(a.cls == float_class_normal &&
960 b.cls == float_class_normal);
961
962 p_exp = a.exp + b.exp;
963
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965 * result.
966 */
967 mul64To128(a.frac, b.frac, &hi, &lo);
968 /* binary point now at bit 124 */
969
970 /* check for overflow */
971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972 shift128RightJamming(hi, lo, 1, &hi, &lo);
973 p_exp += 1;
974 }
975
976 /* + add/sub */
977 if (c.cls == float_class_zero) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980 } else {
981 int exp_diff = p_exp - c.exp;
982 if (p_sign == c.sign) {
983 /* Addition */
984 if (exp_diff <= 0) {
/*
 * c has the larger (or equal) exponent: one shift both aligns the
 * product with c and moves its binary point back to 62.
 */
985 shift128RightJamming(hi, lo,
986 DECOMPOSED_BINARY_POINT - exp_diff,
987 &hi, &lo);
988 lo += c.frac;
989 p_exp = c.exp;
990 } else {
991 uint64_t c_hi, c_lo;
992 /* shift c to the same binary point as the product (124) */
993 c_hi = c.frac >> 2;
994 c_lo = 0;
995 shift128RightJamming(c_hi, c_lo,
996 exp_diff,
997 &c_hi, &c_lo);
998 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001 }
1002
/* The sum carried out of the implicit bit: renormalize. */
1003 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004 shift64RightJamming(lo, 1, &lo);
1005 p_exp += 1;
1006 }
1007
1008 } else {
1009 /* Subtraction */
1010 uint64_t c_hi, c_lo;
1011 /* make C binary point match product at bit 124 */
1012 c_hi = c.frac >> 2;
1013 c_lo = 0;
1014
1015 if (exp_diff <= 0) {
1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
/*
 * Subtract the smaller magnitude from the larger.  When c is the
 * larger one the result takes c's exponent and the opposite sign.
 */
1017 if (exp_diff == 0
1018 &&
1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021 } else {
1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023 p_sign ^= 1;
1024 p_exp = c.exp;
1025 }
1026 } else {
1027 shift128RightJamming(c_hi, c_lo,
1028 exp_diff,
1029 &c_hi, &c_lo);
1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031 }
1032
1033 if (hi == 0 && lo == 0) {
/* Exact cancellation: zero whose sign depends on the rounding mode. */
1034 a.cls = float_class_zero;
1035 a.sign = s->float_rounding_mode == float_round_down;
1036 a.sign ^= sign_flip;
1037 return a;
1038 } else {
1039 int shift;
/* Count the leading zeros of the 128-bit difference hi:lo. */
1040 if (hi != 0) {
1041 shift = clz64(hi);
1042 } else {
1043 shift = clz64(lo) + 64;
1044 }
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1051 shift -= 1;
1052 if (shift >= 64) {
1053 lo = lo << (shift - 64);
1054 } else {
1055 hi = (hi << shift) | (lo >> (64 - shift));
/* Keep a sticky bit for whatever remains below the kept 64 bits. */
1056 lo = hi | ((lo << shift) != 0);
1057 }
1058 p_exp -= shift - 2;
1059 }
1060 }
1061 }
1062
1063 if (flags & float_muladd_halve_result) {
1064 p_exp -= 1;
1065 }
1066
1067 /* finally prepare our result */
1068 a.cls = float_class_normal;
1069 a.sign = p_sign ^ sign_flip;
1070 a.exp = p_exp;
1071 a.frac = lo;
1072
1073 return a;
1074}
1075
1076float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077 int flags, float_status *status)
1078{
1079 FloatParts pa = float16_unpack_canonical(a, status);
1080 FloatParts pb = float16_unpack_canonical(b, status);
1081 FloatParts pc = float16_unpack_canonical(c, status);
1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083
1084 return float16_round_pack_canonical(pr, status);
1085}
1086
1087float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088 int flags, float_status *status)
1089{
1090 FloatParts pa = float32_unpack_canonical(a, status);
1091 FloatParts pb = float32_unpack_canonical(b, status);
1092 FloatParts pc = float32_unpack_canonical(c, status);
1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094
1095 return float32_round_pack_canonical(pr, status);
1096}
1097
1098float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099 int flags, float_status *status)
1100{
1101 FloatParts pa = float64_unpack_canonical(a, status);
1102 FloatParts pb = float64_unpack_canonical(b, status);
1103 FloatParts pc = float64_unpack_canonical(c, status);
1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105
1106 return float64_round_pack_canonical(pr, status);
1107}
1108
cf07323d
AB
1109/*
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 */
1114
1115static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116{
1117 bool sign = a.sign ^ b.sign;
1118
1119 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120 uint64_t temp_lo, temp_hi;
1121 int exp = a.exp - b.exp;
1122 if (a.frac < b.frac) {
1123 exp -= 1;
1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125 &temp_hi, &temp_lo);
1126 } else {
1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128 &temp_hi, &temp_lo);
1129 }
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133 a.sign = sign;
1134 a.exp = exp;
1135 return a;
1136 }
1137 /* handle all the NaN cases */
1138 if (is_nan(a.cls) || is_nan(b.cls)) {
1139 return pick_nan(a, b, s);
1140 }
1141 /* 0/0 or Inf/Inf */
1142 if (a.cls == b.cls
1143 &&
1144 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145 s->float_exception_flags |= float_flag_invalid;
1146 a.cls = float_class_dnan;
1147 return a;
1148 }
1149 /* Div 0 => Inf */
1150 if (b.cls == float_class_zero) {
1151 s->float_exception_flags |= float_flag_divbyzero;
1152 a.cls = float_class_inf;
1153 a.sign = sign;
1154 return a;
1155 }
1156 /* Inf / x or 0 / x */
1157 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158 a.sign = sign;
1159 return a;
1160 }
1161 /* Div by Inf */
1162 if (b.cls == float_class_inf) {
1163 a.cls = float_class_zero;
1164 a.sign = sign;
1165 return a;
1166 }
1167 g_assert_not_reached();
1168}
1169
1170float16 float16_div(float16 a, float16 b, float_status *status)
1171{
1172 FloatParts pa = float16_unpack_canonical(a, status);
1173 FloatParts pb = float16_unpack_canonical(b, status);
1174 FloatParts pr = div_floats(pa, pb, status);
1175
1176 return float16_round_pack_canonical(pr, status);
1177}
1178
1179float32 float32_div(float32 a, float32 b, float_status *status)
1180{
1181 FloatParts pa = float32_unpack_canonical(a, status);
1182 FloatParts pb = float32_unpack_canonical(b, status);
1183 FloatParts pr = div_floats(pa, pb, status);
1184
1185 return float32_round_pack_canonical(pr, status);
1186}
1187
1188float64 float64_div(float64 a, float64 b, float_status *status)
1189{
1190 FloatParts pa = float64_unpack_canonical(a, status);
1191 FloatParts pb = float64_unpack_canonical(b, status);
1192 FloatParts pr = div_floats(pa, pb, status);
1193
1194 return float64_round_pack_canonical(pr, status);
1195}
1196
dbe4d53a
AB
1197/*
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1201 * Arithmetic.
1202 */
1203
1204static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205{
1206 if (is_nan(a.cls)) {
1207 return return_nan(a, s);
1208 }
1209
1210 switch (a.cls) {
1211 case float_class_zero:
1212 case float_class_inf:
1213 case float_class_qnan:
1214 /* already "integral" */
1215 break;
1216 case float_class_normal:
1217 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218 /* already integral */
1219 break;
1220 }
1221 if (a.exp < 0) {
1222 bool one;
1223 /* all fractional */
1224 s->float_exception_flags |= float_flag_inexact;
1225 switch (rounding_mode) {
1226 case float_round_nearest_even:
1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228 break;
1229 case float_round_ties_away:
1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231 break;
1232 case float_round_to_zero:
1233 one = false;
1234 break;
1235 case float_round_up:
1236 one = !a.sign;
1237 break;
1238 case float_round_down:
1239 one = a.sign;
1240 break;
1241 default:
1242 g_assert_not_reached();
1243 }
1244
1245 if (one) {
1246 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247 a.exp = 0;
1248 } else {
1249 a.cls = float_class_zero;
1250 }
1251 } else {
1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253 uint64_t frac_lsbm1 = frac_lsb >> 1;
1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255 uint64_t rnd_mask = rnd_even_mask >> 1;
1256 uint64_t inc;
1257
1258 switch (rounding_mode) {
1259 case float_round_nearest_even:
1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261 break;
1262 case float_round_ties_away:
1263 inc = frac_lsbm1;
1264 break;
1265 case float_round_to_zero:
1266 inc = 0;
1267 break;
1268 case float_round_up:
1269 inc = a.sign ? 0 : rnd_mask;
1270 break;
1271 case float_round_down:
1272 inc = a.sign ? rnd_mask : 0;
1273 break;
1274 default:
1275 g_assert_not_reached();
1276 }
1277
1278 if (a.frac & rnd_mask) {
1279 s->float_exception_flags |= float_flag_inexact;
1280 a.frac += inc;
1281 a.frac &= ~rnd_mask;
1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283 a.frac >>= 1;
1284 a.exp++;
1285 }
1286 }
1287 }
1288 break;
1289 default:
1290 g_assert_not_reached();
1291 }
1292 return a;
1293}
1294
1295float16 float16_round_to_int(float16 a, float_status *s)
1296{
1297 FloatParts pa = float16_unpack_canonical(a, s);
1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299 return float16_round_pack_canonical(pr, s);
1300}
1301
1302float32 float32_round_to_int(float32 a, float_status *s)
1303{
1304 FloatParts pa = float32_unpack_canonical(a, s);
1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306 return float32_round_pack_canonical(pr, s);
1307}
1308
1309float64 float64_round_to_int(float64 a, float_status *s)
1310{
1311 FloatParts pa = float64_unpack_canonical(a, s);
1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313 return float64_round_pack_canonical(pr, s);
1314}
1315
1316float64 float64_trunc_to_int(float64 a, float_status *s)
1317{
1318 FloatParts pa = float64_unpack_canonical(a, s);
1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320 return float64_round_pack_canonical(pr, s);
1321}
1322
ab52f973
AB
1323/*
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1331 * is returned.
1332*/
1333
1334static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335 int64_t min, int64_t max,
1336 float_status *s)
1337{
1338 uint64_t r;
1339 int orig_flags = get_float_exception_flags(s);
1340 FloatParts p = round_to_int(in, rmode, s);
1341
1342 switch (p.cls) {
1343 case float_class_snan:
1344 case float_class_qnan:
1345 return max;
1346 case float_class_inf:
1347 return p.sign ? min : max;
1348 case float_class_zero:
1349 return 0;
1350 case float_class_normal:
1351 if (p.exp < DECOMPOSED_BINARY_POINT) {
1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1355 } else {
1356 r = UINT64_MAX;
1357 }
1358 if (p.sign) {
1359 if (r < -(uint64_t) min) {
1360 return -r;
1361 } else {
1362 s->float_exception_flags = orig_flags | float_flag_invalid;
1363 return min;
1364 }
1365 } else {
1366 if (r < max) {
1367 return r;
1368 } else {
1369 s->float_exception_flags = orig_flags | float_flag_invalid;
1370 return max;
1371 }
1372 }
1373 default:
1374 g_assert_not_reached();
1375 }
1376}
1377
1378#define FLOAT_TO_INT(fsz, isz) \
1379int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1380 float_status *s) \
1381{ \
1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1383 return round_to_int_and_pack(p, s->float_rounding_mode, \
1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1385 s); \
1386} \
1387 \
1388int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1389 (float ## fsz a, float_status *s) \
1390{ \
1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1392 return round_to_int_and_pack(p, float_round_to_zero, \
1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394 s); \
1395}
1396
1397FLOAT_TO_INT(16, 16)
1398FLOAT_TO_INT(16, 32)
1399FLOAT_TO_INT(16, 64)
1400
1401FLOAT_TO_INT(32, 16)
1402FLOAT_TO_INT(32, 32)
1403FLOAT_TO_INT(32, 64)
1404
1405FLOAT_TO_INT(64, 16)
1406FLOAT_TO_INT(64, 32)
1407FLOAT_TO_INT(64, 64)
1408
1409#undef FLOAT_TO_INT
1410
1411/*
1412 * Returns the result of converting the floating-point value `a' to
1413 * the unsigned integer format. The conversion is performed according
1414 * to the IEC/IEEE Standard for Binary Floating-Point
1415 * Arithmetic---which means in particular that the conversion is
1416 * rounded according to the current rounding mode. If `a' is a NaN,
1417 * the largest unsigned integer is returned. Otherwise, if the
1418 * conversion overflows, the largest unsigned integer is returned. If
1419 * the 'a' is negative, the result is rounded and zero is returned;
1420 * values that do not round to zero will raise the inexact exception
1421 * flag.
1422 */
1423
1424static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1425 float_status *s)
1426{
1427 int orig_flags = get_float_exception_flags(s);
1428 FloatParts p = round_to_int(in, rmode, s);
1429
1430 switch (p.cls) {
1431 case float_class_snan:
1432 case float_class_qnan:
1433 s->float_exception_flags = orig_flags | float_flag_invalid;
1434 return max;
1435 case float_class_inf:
1436 return p.sign ? 0 : max;
1437 case float_class_zero:
1438 return 0;
1439 case float_class_normal:
1440 {
1441 uint64_t r;
1442 if (p.sign) {
1443 s->float_exception_flags = orig_flags | float_flag_invalid;
1444 return 0;
1445 }
1446
1447 if (p.exp < DECOMPOSED_BINARY_POINT) {
1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451 } else {
1452 s->float_exception_flags = orig_flags | float_flag_invalid;
1453 return max;
1454 }
1455
1456 /* For uint64 this will never trip, but if p.exp is too large
1457 * to shift a decomposed fraction we shall have exited via the
1458 * 3rd leg above.
1459 */
1460 if (r > max) {
1461 s->float_exception_flags = orig_flags | float_flag_invalid;
1462 return max;
1463 } else {
1464 return r;
1465 }
1466 }
1467 default:
1468 g_assert_not_reached();
1469 }
1470}
1471
1472#define FLOAT_TO_UINT(fsz, isz) \
1473uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1474 float_status *s) \
1475{ \
1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1478 UINT ## isz ## _MAX, s); \
1479} \
1480 \
1481uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1482 (float ## fsz a, float_status *s) \
1483{ \
1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1486 UINT ## isz ## _MAX, s); \
1487}
1488
1489FLOAT_TO_UINT(16, 16)
1490FLOAT_TO_UINT(16, 32)
1491FLOAT_TO_UINT(16, 64)
1492
1493FLOAT_TO_UINT(32, 16)
1494FLOAT_TO_UINT(32, 32)
1495FLOAT_TO_UINT(32, 64)
1496
1497FLOAT_TO_UINT(64, 16)
1498FLOAT_TO_UINT(64, 32)
1499FLOAT_TO_UINT(64, 64)
1500
1501#undef FLOAT_TO_UINT
1502
158142c2
FB
1503/*----------------------------------------------------------------------------
1504| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1505| and 7, and returns the properly rounded 32-bit integer corresponding to the
1506| input. If `zSign' is 1, the input is negated before being converted to an
1507| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
1508| is simply rounded to an integer, with the inexact exception raised if the
1509| input cannot be represented exactly as an integer. However, if the fixed-
1510| point input is too large, the invalid exception is raised and the largest
1511| positive or negative integer is returned.
1512*----------------------------------------------------------------------------*/
1513
f4014512 1514static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
158142c2 1515{
8f506c70 1516 int8_t roundingMode;
158142c2 1517 flag roundNearestEven;
8f506c70 1518 int8_t roundIncrement, roundBits;
760e1416 1519 int32_t z;
158142c2 1520
a2f2d288 1521 roundingMode = status->float_rounding_mode;
158142c2 1522 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1523 switch (roundingMode) {
1524 case float_round_nearest_even:
f9288a76 1525 case float_round_ties_away:
dc355b76
PM
1526 roundIncrement = 0x40;
1527 break;
1528 case float_round_to_zero:
1529 roundIncrement = 0;
1530 break;
1531 case float_round_up:
1532 roundIncrement = zSign ? 0 : 0x7f;
1533 break;
1534 case float_round_down:
1535 roundIncrement = zSign ? 0x7f : 0;
1536 break;
1537 default:
1538 abort();
158142c2
FB
1539 }
1540 roundBits = absZ & 0x7F;
1541 absZ = ( absZ + roundIncrement )>>7;
1542 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1543 z = absZ;
1544 if ( zSign ) z = - z;
1545 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
ff32e16e 1546 float_raise(float_flag_invalid, status);
bb98fe42 1547 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2 1548 }
a2f2d288
PM
1549 if (roundBits) {
1550 status->float_exception_flags |= float_flag_inexact;
1551 }
158142c2
FB
1552 return z;
1553
1554}
1555
1556/*----------------------------------------------------------------------------
1557| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1558| `absZ1', with binary point between bits 63 and 64 (between the input words),
1559| and returns the properly rounded 64-bit integer corresponding to the input.
1560| If `zSign' is 1, the input is negated before being converted to an integer.
1561| Ordinarily, the fixed-point input is simply rounded to an integer, with
1562| the inexact exception raised if the input cannot be represented exactly as
1563| an integer. However, if the fixed-point input is too large, the invalid
1564| exception is raised and the largest positive or negative integer is
1565| returned.
1566*----------------------------------------------------------------------------*/
1567
f42c2224 1568static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
e5a41ffa 1569 float_status *status)
158142c2 1570{
8f506c70 1571 int8_t roundingMode;
158142c2 1572 flag roundNearestEven, increment;
760e1416 1573 int64_t z;
158142c2 1574
a2f2d288 1575 roundingMode = status->float_rounding_mode;
158142c2 1576 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1577 switch (roundingMode) {
1578 case float_round_nearest_even:
f9288a76 1579 case float_round_ties_away:
dc355b76
PM
1580 increment = ((int64_t) absZ1 < 0);
1581 break;
1582 case float_round_to_zero:
1583 increment = 0;
1584 break;
1585 case float_round_up:
1586 increment = !zSign && absZ1;
1587 break;
1588 case float_round_down:
1589 increment = zSign && absZ1;
1590 break;
1591 default:
1592 abort();
158142c2
FB
1593 }
1594 if ( increment ) {
1595 ++absZ0;
1596 if ( absZ0 == 0 ) goto overflow;
bb98fe42 1597 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
1598 }
1599 z = absZ0;
1600 if ( zSign ) z = - z;
1601 if ( z && ( ( z < 0 ) ^ zSign ) ) {
1602 overflow:
ff32e16e 1603 float_raise(float_flag_invalid, status);
158142c2 1604 return
bb98fe42 1605 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
1606 : LIT64( 0x7FFFFFFFFFFFFFFF );
1607 }
a2f2d288
PM
1608 if (absZ1) {
1609 status->float_exception_flags |= float_flag_inexact;
1610 }
158142c2
FB
1611 return z;
1612
1613}
1614
fb3ea83a
TM
1615/*----------------------------------------------------------------------------
1616| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1617| `absZ1', with binary point between bits 63 and 64 (between the input words),
1618| and returns the properly rounded 64-bit unsigned integer corresponding to the
1619| input. Ordinarily, the fixed-point input is simply rounded to an integer,
1620| with the inexact exception raised if the input cannot be represented exactly
1621| as an integer. However, if the fixed-point input is too large, the invalid
1622| exception is raised and the largest unsigned integer is returned.
1623*----------------------------------------------------------------------------*/
1624
f42c2224 1625static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
e5a41ffa 1626 uint64_t absZ1, float_status *status)
fb3ea83a 1627{
8f506c70 1628 int8_t roundingMode;
fb3ea83a
TM
1629 flag roundNearestEven, increment;
1630
a2f2d288 1631 roundingMode = status->float_rounding_mode;
fb3ea83a 1632 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
1633 switch (roundingMode) {
1634 case float_round_nearest_even:
f9288a76 1635 case float_round_ties_away:
dc355b76
PM
1636 increment = ((int64_t)absZ1 < 0);
1637 break;
1638 case float_round_to_zero:
1639 increment = 0;
1640 break;
1641 case float_round_up:
1642 increment = !zSign && absZ1;
1643 break;
1644 case float_round_down:
1645 increment = zSign && absZ1;
1646 break;
1647 default:
1648 abort();
fb3ea83a
TM
1649 }
1650 if (increment) {
1651 ++absZ0;
1652 if (absZ0 == 0) {
ff32e16e 1653 float_raise(float_flag_invalid, status);
fb3ea83a
TM
1654 return LIT64(0xFFFFFFFFFFFFFFFF);
1655 }
1656 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
1657 }
1658
1659 if (zSign && absZ0) {
ff32e16e 1660 float_raise(float_flag_invalid, status);
fb3ea83a
TM
1661 return 0;
1662 }
1663
1664 if (absZ1) {
a2f2d288 1665 status->float_exception_flags |= float_flag_inexact;
fb3ea83a
TM
1666 }
1667 return absZ0;
1668}
1669
37d18660
PM
1670/*----------------------------------------------------------------------------
1671| If `a' is denormal and we are in flush-to-zero mode then set the
1672| input-denormal exception and return zero. Otherwise just return the value.
1673*----------------------------------------------------------------------------*/
e5a41ffa 1674float32 float32_squash_input_denormal(float32 a, float_status *status)
37d18660 1675{
a2f2d288 1676 if (status->flush_inputs_to_zero) {
37d18660 1677 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
ff32e16e 1678 float_raise(float_flag_input_denormal, status);
37d18660
PM
1679 return make_float32(float32_val(a) & 0x80000000);
1680 }
1681 }
1682 return a;
1683}
1684
158142c2
FB
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision significand `aSig': it is shifted
| left until its leading one sits at bit 23, and the matching exponent
| (1 minus the shift distance) is computed.  The results are stored through
| `zSigPtr' and `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* 8 leading zeros is where a normalized 24-bit significand sits */
    const int8_t distance = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << distance;
    *zExpPtr = 1 - distance;
}
1702
1703/*----------------------------------------------------------------------------
1704| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1705| single-precision floating-point value, returning the result. After being
1706| shifted into the proper positions, the three fields are simply added
1707| together to form the result. This means that any integer portion of `zSig'
1708| will be added into the exponent. Since a properly normalized significand
1709| will have an integer portion equal to 1, the `zExp' input should be 1 less
1710| than the desired result exponent whenever `zSig' is a complete, normalized
1711| significand.
1712*----------------------------------------------------------------------------*/
1713
0c48262d 1714static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
158142c2
FB
1715{
1716
f090c9d4 1717 return make_float32(
bb98fe42 1718 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
1719
1720}
1721
1722/*----------------------------------------------------------------------------
1723| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1724| and significand `zSig', and returns the proper single-precision floating-
1725| point value corresponding to the abstract input. Ordinarily, the abstract
1726| value is simply rounded and packed into the single-precision format, with
1727| the inexact exception raised if the abstract input cannot be represented
1728| exactly. However, if the abstract value is too large, the overflow and
1729| inexact exceptions are raised and an infinity or maximal finite value is
1730| returned. If the abstract value is too small, the input value is rounded to
1731| a subnormal number, and the underflow and inexact exceptions are raised if
1732| the abstract input cannot be represented exactly as a subnormal single-
1733| precision floating-point number.
1734| The input significand `zSig' has its binary point between bits 30
1735| and 29, which is 7 bits to the left of the usual location. This shifted
1736| significand must be normalized or smaller. If `zSig' is not normalized,
1737| `zExp' must be 0; in that case, the result returned is a subnormal number,
1738| and it must not require rounding. In the usual case that `zSig' is
1739| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1740| The handling of underflow and overflow follows the IEC/IEEE Standard for
1741| Binary Floating-Point Arithmetic.
1742*----------------------------------------------------------------------------*/
1743
0c48262d 1744static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 1745 float_status *status)
158142c2 1746{
8f506c70 1747 int8_t roundingMode;
158142c2 1748 flag roundNearestEven;
8f506c70 1749 int8_t roundIncrement, roundBits;
158142c2
FB
1750 flag isTiny;
1751
a2f2d288 1752 roundingMode = status->float_rounding_mode;
158142c2 1753 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1754 switch (roundingMode) {
1755 case float_round_nearest_even:
f9288a76 1756 case float_round_ties_away:
dc355b76
PM
1757 roundIncrement = 0x40;
1758 break;
1759 case float_round_to_zero:
1760 roundIncrement = 0;
1761 break;
1762 case float_round_up:
1763 roundIncrement = zSign ? 0 : 0x7f;
1764 break;
1765 case float_round_down:
1766 roundIncrement = zSign ? 0x7f : 0;
1767 break;
1768 default:
1769 abort();
1770 break;
158142c2
FB
1771 }
1772 roundBits = zSig & 0x7F;
bb98fe42 1773 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
1774 if ( ( 0xFD < zExp )
1775 || ( ( zExp == 0xFD )
bb98fe42 1776 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 1777 ) {
ff32e16e 1778 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 1779 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
1780 }
1781 if ( zExp < 0 ) {
a2f2d288 1782 if (status->flush_to_zero) {
ff32e16e 1783 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1784 return packFloat32(zSign, 0, 0);
1785 }
158142c2 1786 isTiny =
a2f2d288
PM
1787 (status->float_detect_tininess
1788 == float_tininess_before_rounding)
158142c2
FB
1789 || ( zExp < -1 )
1790 || ( zSig + roundIncrement < 0x80000000 );
1791 shift32RightJamming( zSig, - zExp, &zSig );
1792 zExp = 0;
1793 roundBits = zSig & 0x7F;
ff32e16e
PM
1794 if (isTiny && roundBits) {
1795 float_raise(float_flag_underflow, status);
1796 }
158142c2
FB
1797 }
1798 }
a2f2d288
PM
1799 if (roundBits) {
1800 status->float_exception_flags |= float_flag_inexact;
1801 }
158142c2
FB
1802 zSig = ( zSig + roundIncrement )>>7;
1803 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1804 if ( zSig == 0 ) zExp = 0;
1805 return packFloat32( zSign, zExp, zSig );
1806
1807}
1808
1809/*----------------------------------------------------------------------------
1810| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1811| and significand `zSig', and returns the proper single-precision floating-
1812| point value corresponding to the abstract input. This routine is just like
1813| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
1814| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1815| floating-point exponent.
1816*----------------------------------------------------------------------------*/
1817
1818static float32
0c48262d 1819 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 1820 float_status *status)
158142c2 1821{
8f506c70 1822 int8_t shiftCount;
158142c2
FB
1823
1824 shiftCount = countLeadingZeros32( zSig ) - 1;
ff32e16e
PM
1825 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
1826 status);
158142c2
FB
1827
1828}
1829
37d18660
PM
1830/*----------------------------------------------------------------------------
1831| If `a' is denormal and we are in flush-to-zero mode then set the
1832| input-denormal exception and return zero. Otherwise just return the value.
1833*----------------------------------------------------------------------------*/
e5a41ffa 1834float64 float64_squash_input_denormal(float64 a, float_status *status)
37d18660 1835{
a2f2d288 1836 if (status->flush_inputs_to_zero) {
37d18660 1837 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
ff32e16e 1838 float_raise(float_flag_input_denormal, status);
37d18660
PM
1839 return make_float64(float64_val(a) & (1ULL << 63));
1840 }
1841 }
1842 return a;
1843}
1844
158142c2
FB
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision significand `aSig': it is shifted
| left until its leading one sits at bit 52, and the matching exponent
| (1 minus the shift distance) is computed.  The results are stored through
| `zSigPtr' and `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* 11 leading zeros is where a normalized 53-bit significand sits */
    const int8_t distance = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << distance;
    *zExpPtr = 1 - distance;
}
1862
1863/*----------------------------------------------------------------------------
1864| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1865| double-precision floating-point value, returning the result. After being
1866| shifted into the proper positions, the three fields are simply added
1867| together to form the result. This means that any integer portion of `zSig'
1868| will be added into the exponent. Since a properly normalized significand
1869| will have an integer portion equal to 1, the `zExp' input should be 1 less
1870| than the desired result exponent whenever `zSig' is a complete, normalized
1871| significand.
1872*----------------------------------------------------------------------------*/
1873
0c48262d 1874static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
158142c2
FB
1875{
1876
f090c9d4 1877 return make_float64(
bb98fe42 1878 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
1879
1880}
1881
1882/*----------------------------------------------------------------------------
1883| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1884| and significand `zSig', and returns the proper double-precision floating-
1885| point value corresponding to the abstract input. Ordinarily, the abstract
1886| value is simply rounded and packed into the double-precision format, with
1887| the inexact exception raised if the abstract input cannot be represented
1888| exactly. However, if the abstract value is too large, the overflow and
1889| inexact exceptions are raised and an infinity or maximal finite value is
a7d1ac78
PM
1890| returned. If the abstract value is too small, the input value is rounded to
1891| a subnormal number, and the underflow and inexact exceptions are raised if
1892| the abstract input cannot be represented exactly as a subnormal double-
158142c2
FB
1893| precision floating-point number.
1894| The input significand `zSig' has its binary point between bits 62
1895| and 61, which is 10 bits to the left of the usual location. This shifted
1896| significand must be normalized or smaller. If `zSig' is not normalized,
1897| `zExp' must be 0; in that case, the result returned is a subnormal number,
1898| and it must not require rounding. In the usual case that `zSig' is
1899| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1900| The handling of underflow and overflow follows the IEC/IEEE Standard for
1901| Binary Floating-Point Arithmetic.
1902*----------------------------------------------------------------------------*/
1903
/*
 * Rounds the abstract value (zSign, zExp, zSig) according to
 * status->float_rounding_mode and packs it via packFloat64().  The low 10
 * bits of zSig (mask 0x3FF) are the bits dropped by the final `>>10'; they
 * drive both the rounding decision and the inexact flag.  Exception flags
 * are reported through `status'.
 */
0c48262d 1904static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 1905 float_status *status)
158142c2 1906{
8f506c70 1907 int8_t roundingMode;
158142c2 1908 flag roundNearestEven;
0c48262d 1909 int roundIncrement, roundBits;
158142c2
FB
1910 flag isTiny;
1911
a2f2d288 1912 roundingMode = status->float_rounding_mode;
158142c2 1913 roundNearestEven = ( roundingMode == float_round_nearest_even );
/*
 * Choose the value added to zSig before the low 10 bits are dropped:
 * 0x200 is half of the lowest kept bit (0x400), 0x3ff rounds any nonzero
 * discarded bits away from zero, 0 truncates.
 */
dc355b76
PM
1914 switch (roundingMode) {
1915 case float_round_nearest_even:
f9288a76 1916 case float_round_ties_away:
dc355b76
PM
1917 roundIncrement = 0x200;
1918 break;
1919 case float_round_to_zero:
1920 roundIncrement = 0;
1921 break;
1922 case float_round_up:
1923 roundIncrement = zSign ? 0 : 0x3ff;
1924 break;
1925 case float_round_down:
1926 roundIncrement = zSign ? 0x3ff : 0;
1927 break;
9ee6f678
BR
1928 case float_round_to_odd:
/* Bit 0x400 becomes the result LSB: increment only when it is still even. */
1929 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1930 break;
dc355b76
PM
1931 default:
1932 abort();
158142c2
FB
1933 }
1934 roundBits = zSig & 0x3FF;
/*
 * The uint16_t cast makes a negative zExp (of modest magnitude) wrap to a
 * large value, so this one comparison selects both the possible-overflow
 * and the possible-underflow cases.
 */
bb98fe42 1935 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
1936 if ( ( 0x7FD < zExp )
1937 || ( ( zExp == 0x7FD )
bb98fe42 1938 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 1939 ) {
/*
 * Overflow.  With overflow_to_inf the significand argument is 0, giving
 * exponent 0x7FF and a zero fraction.  Otherwise the argument is -1 (all
 * ones); packFloat64() adds the fields, so the result is one below that
 * encoding: exponent 0x7FE with an all-ones fraction.
 */
9ee6f678
BR
1940 bool overflow_to_inf = roundingMode != float_round_to_odd &&
1941 roundIncrement != 0;
ff32e16e 1942 float_raise(float_flag_overflow | float_flag_inexact, status);
9ee6f678 1943 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
158142c2
FB
1944 }
1945 if ( zExp < 0 ) {
/* Flush-to-zero mode: return a signed zero and flag output_denormal. */
a2f2d288 1946 if (status->flush_to_zero) {
ff32e16e 1947 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1948 return packFloat64(zSign, 0, 0);
1949 }
/*
 * Tininess: always when detected before rounding; otherwise when zExp is
 * below -1 or when adding the increment does not carry into bit 63.
 */
158142c2 1950 isTiny =
a2f2d288
PM
1951 (status->float_detect_tininess
1952 == float_tininess_before_rounding)
158142c2
FB
1953 || ( zExp < -1 )
1954 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
/*
 * Denormalize by -zExp bits.  NOTE(review): shift64RightJamming is defined
 * outside this chunk; presumably it folds shifted-out bits into the LSB.
 */
1955 shift64RightJamming( zSig, - zExp, &zSig );
1956 zExp = 0;
1957 roundBits = zSig & 0x3FF;
ff32e16e
PM
1958 if (isTiny && roundBits) {
1959 float_raise(float_flag_underflow, status);
1960 }
9ee6f678
BR
1961 if (roundingMode == float_round_to_odd) {
1962 /*
1963 * For round-to-odd case, the roundIncrement depends on
1964 * zSig which just changed.
1965 */
1966 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1967 }
158142c2
FB
1968 }
1969 }
/* Any nonzero discarded bit makes the result inexact. */
a2f2d288
PM
1970 if (roundBits) {
1971 status->float_exception_flags |= float_flag_inexact;
1972 }
158142c2
FB
1973 zSig = ( zSig + roundIncrement )>>10;
/* Exact tie (discarded bits == 0x200) in nearest-even mode: clear the LSB. */
1974 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
/* A zero significand is packed with a zero exponent field. */
1975 if ( zSig == 0 ) zExp = 0;
1976 return packFloat64( zSign, zExp, zSig );
1977
1978}
1979
1980/*----------------------------------------------------------------------------
1981| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1982| and significand `zSig', and returns the proper double-precision floating-
1983| point value corresponding to the abstract input. This routine is just like
1984| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
1985| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1986| floating-point exponent.
1987*----------------------------------------------------------------------------*/
1988
1989static float64
0c48262d 1990 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 1991 float_status *status)
158142c2 1992{
8f506c70 1993 int8_t shiftCount;
158142c2
FB
1994
1995 shiftCount = countLeadingZeros64( zSig ) - 1;
ff32e16e
PM
1996 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
1997 status);
158142c2
FB
1998
1999}
2000
158142c2
FB
2001/*----------------------------------------------------------------------------
2002| Returns the fraction bits of the extended double-precision floating-point
2003| value `a'.
2004*----------------------------------------------------------------------------*/
2005
a49db98d 2006static inline uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
2007{
2008
2009 return a.low;
2010
2011}
2012
2013/*----------------------------------------------------------------------------
2014| Returns the exponent bits of the extended double-precision floating-point
2015| value `a'.
2016*----------------------------------------------------------------------------*/
2017
f4014512 2018static inline int32_t extractFloatx80Exp( floatx80 a )
158142c2
FB
2019{
2020
2021 return a.high & 0x7FFF;
2022
2023}
2024
2025/*----------------------------------------------------------------------------
2026| Returns the sign bit of the extended double-precision floating-point value
2027| `a'.
2028*----------------------------------------------------------------------------*/
2029
a49db98d 2030static inline flag extractFloatx80Sign( floatx80 a )
158142c2
FB
2031{
2032
2033 return a.high>>15;
2034
2035}
2036
2037/*----------------------------------------------------------------------------
2038| Normalizes the subnormal extended double-precision floating-point value
2039| represented by the denormalized significand `aSig'. The normalized exponent
2040| and significand are stored at the locations pointed to by `zExpPtr' and
2041| `zSigPtr', respectively.
2042*----------------------------------------------------------------------------*/
2043
static void
normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr)
{
    /* Number of leading zero bits to shift out of the significand. */
    const int8_t leadingZeros = countLeadingZeros64(aSig);

    *zSigPtr = aSig << leadingZeros;
    /* The exponent is 1 minus the number of positions shifted. */
    *zExpPtr = 1 - leadingZeros;
}
2054
2055/*----------------------------------------------------------------------------
2056| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
2057| extended double-precision floating-point value, returning the result.
2058*----------------------------------------------------------------------------*/
2059
f4014512 2060static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
158142c2
FB
2061{
2062 floatx80 z;
2063
2064 z.low = zSig;
bb98fe42 2065 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
2066 return z;
2067
2068}
2069
2070/*----------------------------------------------------------------------------
2071| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2072| and extended significand formed by the concatenation of `zSig0' and `zSig1',
2073| and returns the proper extended double-precision floating-point value
2074| corresponding to the abstract input. Ordinarily, the abstract value is
2075| rounded and packed into the extended double-precision format, with the
2076| inexact exception raised if the abstract input cannot be represented
2077| exactly. However, if the abstract value is too large, the overflow and
2078| inexact exceptions are raised and an infinity or maximal finite value is
2079| returned. If the abstract value is too small, the input value is rounded to
2080| a subnormal number, and the underflow and inexact exceptions are raised if
2081| the abstract input cannot be represented exactly as a subnormal extended
2082| double-precision floating-point number.
2083| If `roundingPrecision' is 32 or 64, the result is rounded to the same
2084| number of bits as single or double precision, respectively. Otherwise, the
2085| result is rounded to the full precision of the extended double-precision
2086| format.
2087| The input significand must be normalized or smaller. If the input
2088| significand is not normalized, `zExp' must be 0; in that case, the result
2089| returned is a subnormal number, and it must not require rounding. The
2090| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2091| Floating-Point Arithmetic.
2092*----------------------------------------------------------------------------*/
2093
/*
 * Rounds the abstract value (zSign, zExp, zSig0:zSig1) and packs it with
 * packFloatx80().  roundingPrecision 64 or 32 selects a reduced-precision
 * path that rounds inside zSig0; 80 and every other value take the
 * `precision80' path, which rounds on zSig1.  Exception flags are reported
 * through `status'.
 * NOTE(review): neither switch below has a float_round_to_odd case, so
 * that mode reaches `default: abort()'.
 */
8f506c70 2094static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
f4014512 2095 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
e5a41ffa 2096 float_status *status)
158142c2 2097{
8f506c70 2098 int8_t roundingMode;
158142c2 2099 flag roundNearestEven, increment, isTiny;
f42c2224 2100 int64_t roundIncrement, roundMask, roundBits;
158142c2 2101
a2f2d288 2102 roundingMode = status->float_rounding_mode;
158142c2
FB
2103 roundNearestEven = ( roundingMode == float_round_nearest_even );
2104 if ( roundingPrecision == 80 ) goto precision80;
/*
 * roundMask covers the bits of zSig0 that are discarded (low 11 bits for
 * precision 64, low 40 bits for precision 32); roundIncrement starts as
 * half of the lowest kept bit.
 */
2105 if ( roundingPrecision == 64 ) {
2106 roundIncrement = LIT64( 0x0000000000000400 );
2107 roundMask = LIT64( 0x00000000000007FF );
2108 }
2109 else if ( roundingPrecision == 32 ) {
2110 roundIncrement = LIT64( 0x0000008000000000 );
2111 roundMask = LIT64( 0x000000FFFFFFFFFF );
2112 }
2113 else {
2114 goto precision80;
2115 }
/* Fold any nonzero zSig1 into the lowest bit of zSig0 as a sticky bit. */
2116 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
2117 switch (roundingMode) {
2118 case float_round_nearest_even:
f9288a76 2119 case float_round_ties_away:
dc355b76
PM
2120 break;
2121 case float_round_to_zero:
2122 roundIncrement = 0;
2123 break;
2124 case float_round_up:
2125 roundIncrement = zSign ? 0 : roundMask;
2126 break;
2127 case float_round_down:
2128 roundIncrement = zSign ? roundMask : 0;
2129 break;
2130 default:
2131 abort();
158142c2
FB
2132 }
2133 roundBits = zSig0 & roundMask;
/*
 * Unsigned compare of zExp - 1: true when zExp >= 0x7FFE and also when
 * zExp <= 0 (the subtraction wraps to a large unsigned value).
 */
bb98fe42 2134 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
/* Overflow if the exponent is too large or rounding carries out of 64 bits. */
2135 if ( ( 0x7FFE < zExp )
2136 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2137 ) {
2138 goto overflow;
2139 }
2140 if ( zExp <= 0 ) {
/* Flush-to-zero mode: return a signed zero and flag output_denormal. */
a2f2d288 2141 if (status->flush_to_zero) {
ff32e16e 2142 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2143 return packFloatx80(zSign, 0, 0);
2144 }
/*
 * Tininess: always when detected before rounding; otherwise when zExp is
 * negative or when adding the increment does not carry out of zSig0.
 */
158142c2 2145 isTiny =
a2f2d288
PM
2146 (status->float_detect_tininess
2147 == float_tininess_before_rounding)
158142c2
FB
2148 || ( zExp < 0 )
2149 || ( zSig0 <= zSig0 + roundIncrement );
/*
 * Denormalize by 1 - zExp bits.  NOTE(review): shift64RightJamming is
 * defined outside this chunk; presumably shifted-out bits stay sticky.
 */
2150 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2151 zExp = 0;
2152 roundBits = zSig0 & roundMask;
ff32e16e
PM
2153 if (isTiny && roundBits) {
2154 float_raise(float_flag_underflow, status);
2155 }
a2f2d288
PM
2156 if (roundBits) {
2157 status->float_exception_flags |= float_flag_inexact;
2158 }
158142c2 2159 zSig0 += roundIncrement;
/* Rounding set bit 63: the result is returned with exponent 1. */
bb98fe42 2160 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
/*
 * roundMask + 1 is the lowest kept bit.  On an exact tie in nearest-even
 * mode that bit is added to the mask so it is cleared as well.
 */
2161 roundIncrement = roundMask + 1;
2162 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2163 roundMask |= roundIncrement;
2164 }
2165 zSig0 &= ~ roundMask;
2166 return packFloatx80( zSign, zExp, zSig0 );
2167 }
2168 }
/* Reduced-precision path, exponent in range: flag inexact, then round. */
a2f2d288
PM
2169 if (roundBits) {
2170 status->float_exception_flags |= float_flag_inexact;
2171 }
158142c2
FB
2172 zSig0 += roundIncrement;
/* The sum wrapped past 64 bits: bump the exponent and set only bit 63. */
2173 if ( zSig0 < roundIncrement ) {
2174 ++zExp;
2175 zSig0 = LIT64( 0x8000000000000000 );
2176 }
2177 roundIncrement = roundMask + 1;
2178 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2179 roundMask |= roundIncrement;
2180 }
2181 zSig0 &= ~ roundMask;
2182 if ( zSig0 == 0 ) zExp = 0;
2183 return packFloatx80( zSign, zExp, zSig0 );
/*
 * Full-precision path: zSig1 holds the bits below the result, and
 * `increment' says whether zSig0 must be bumped by one.
 */
2184 precision80:
dc355b76
PM
2185 switch (roundingMode) {
2186 case float_round_nearest_even:
f9288a76 2187 case float_round_ties_away:
dc355b76
PM
/* Increment when the top bit of zSig1 is set (discarded part >= half). */
2188 increment = ((int64_t)zSig1 < 0);
2189 break;
2190 case float_round_to_zero:
2191 increment = 0;
2192 break;
2193 case float_round_up:
2194 increment = !zSign && zSig1;
2195 break;
2196 case float_round_down:
2197 increment = zSign && zSig1;
2198 break;
2199 default:
2200 abort();
158142c2 2201 }
bb98fe42 2202 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
2203 if ( ( 0x7FFE < zExp )
2204 || ( ( zExp == 0x7FFE )
2205 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2206 && increment
2207 )
2208 ) {
/*
 * roundMask = 0 makes ~roundMask all ones for the full-precision case.
 * The reduced-precision path jumps straight to `overflow' with its own
 * roundMask, so its largest finite value has the discarded bits cleared.
 */
2209 roundMask = 0;
2210 overflow:
ff32e16e 2211 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
/*
 * Modes rounding toward zero for this sign return exponent 0x7FFE with
 * significand ~roundMask; all other modes return exponent 0x7FFF with
 * significand 0x8000000000000000.
 */
2212 if ( ( roundingMode == float_round_to_zero )
2213 || ( zSign && ( roundingMode == float_round_up ) )
2214 || ( ! zSign && ( roundingMode == float_round_down ) )
2215 ) {
2216 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2217 }
2218 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2219 }
/*
 * NOTE(review): unlike the reduced-precision branch above, this branch
 * does not test status->flush_to_zero.
 */
2220 if ( zExp <= 0 ) {
2221 isTiny =
a2f2d288
PM
2222 (status->float_detect_tininess
2223 == float_tininess_before_rounding)
158142c2
FB
2224 || ( zExp < 0 )
2225 || ! increment
2226 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2227 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2228 zExp = 0;
ff32e16e
PM
2229 if (isTiny && zSig1) {
2230 float_raise(float_flag_underflow, status);
2231 }
a2f2d288
PM
2232 if (zSig1) {
2233 status->float_exception_flags |= float_flag_inexact;
2234 }
/* zSig1 changed in the shift above, so the increment is recomputed. */
dc355b76
PM
2235 switch (roundingMode) {
2236 case float_round_nearest_even:
f9288a76 2237 case float_round_ties_away:
dc355b76
PM
2238 increment = ((int64_t)zSig1 < 0);
2239 break;
2240 case float_round_to_zero:
2241 increment = 0;
2242 break;
2243 case float_round_up:
2244 increment = !zSign && zSig1;
2245 break;
2246 case float_round_down:
2247 increment = zSign && zSig1;
2248 break;
2249 default:
2250 abort();
158142c2
FB
2251 }
2252 if ( increment ) {
2253 ++zSig0;
/* zSig1<<1 == 0 means no bits below the top one: in nearest-even mode clear the LSB. */
2254 zSig0 &=
bb98fe42
AF
2255 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2256 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
2257 }
2258 return packFloatx80( zSign, zExp, zSig0 );
2259 }
2260 }
/* Full-precision path, exponent in range. */
a2f2d288
PM
2261 if (zSig1) {
2262 status->float_exception_flags |= float_flag_inexact;
2263 }
158142c2
FB
2264 if ( increment ) {
2265 ++zSig0;
/* zSig0 wrapped to zero: bump the exponent and set only bit 63. */
2266 if ( zSig0 == 0 ) {
2267 ++zExp;
2268 zSig0 = LIT64( 0x8000000000000000 );
2269 }
2270 else {
bb98fe42 2271 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
2272 }
2273 }
2274 else {
2275 if ( zSig0 == 0 ) zExp = 0;
2276 }
2277 return packFloatx80( zSign, zExp, zSig0 );
2278
2279}
2280
2281/*----------------------------------------------------------------------------
2282| Takes an abstract floating-point value having sign `zSign', exponent
2283| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2284| and returns the proper extended double-precision floating-point value
2285| corresponding to the abstract input. This routine is just like
2286| `roundAndPackFloatx80' except that the input significand does not have to be
2287| normalized.
2288*----------------------------------------------------------------------------*/
2289
8f506c70 2290static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
f4014512 2291 flag zSign, int32_t zExp,
e5a41ffa
PM
2292 uint64_t zSig0, uint64_t zSig1,
2293 float_status *status)
158142c2 2294{
8f506c70 2295 int8_t shiftCount;
158142c2
FB
2296
2297 if ( zSig0 == 0 ) {
2298 zSig0 = zSig1;
2299 zSig1 = 0;
2300 zExp -= 64;
2301 }
2302 shiftCount = countLeadingZeros64( zSig0 );
2303 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2304 zExp -= shiftCount;
ff32e16e
PM
2305 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2306 zSig0, zSig1, status);
158142c2
FB
2307
2308}
2309
158142c2
FB
2310/*----------------------------------------------------------------------------
2311| Returns the least-significant 64 fraction bits of the quadruple-precision
2312| floating-point value `a'.
2313*----------------------------------------------------------------------------*/
2314
a49db98d 2315static inline uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
2316{
2317
2318 return a.low;
2319
2320}
2321
2322/*----------------------------------------------------------------------------
2323| Returns the most-significant 48 fraction bits of the quadruple-precision
2324| floating-point value `a'.
2325*----------------------------------------------------------------------------*/
2326
a49db98d 2327static inline uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
2328{
2329
2330 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2331
2332}
2333
2334/*----------------------------------------------------------------------------
2335| Returns the exponent bits of the quadruple-precision floating-point value
2336| `a'.
2337*----------------------------------------------------------------------------*/
2338
f4014512 2339static inline int32_t extractFloat128Exp( float128 a )
158142c2
FB
2340{
2341
2342 return ( a.high>>48 ) & 0x7FFF;
2343
2344}
2345
2346/*----------------------------------------------------------------------------
2347| Returns the sign bit of the quadruple-precision floating-point value `a'.
2348*----------------------------------------------------------------------------*/
2349
a49db98d 2350static inline flag extractFloat128Sign( float128 a )
158142c2
FB
2351{
2352
2353 return a.high>>63;
2354
2355}
2356
2357/*----------------------------------------------------------------------------
2358| Normalizes the subnormal quadruple-precision floating-point value
2359| represented by the denormalized significand formed by the concatenation of
2360| `aSig0' and `aSig1'. The normalized exponent is stored at the location
2361| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2362| significand are stored at the location pointed to by `zSig0Ptr', and the
2363| least significant 64 bits of the normalized significand are stored at the
2364| location pointed to by `zSig1Ptr'.
2365*----------------------------------------------------------------------------*/
2366
static void
normalizeFloat128Subnormal(uint64_t aSig0,
                           uint64_t aSig1,
                           int32_t *zExpPtr,
                           uint64_t *zSig0Ptr,
                           uint64_t *zSig1Ptr)
{
    int8_t shift;

    if (aSig0 != 0) {
        /* High word is populated: one 128-bit left shift suffices. */
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* All significant bits live in the low word. */
    shift = countLeadingZeros64(aSig1) - 15;
    if (shift >= 0) {
        /* Everything fits in the high output word. */
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Fewer than 15 leading zeros: split aSig1 across both words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
2397
2398/*----------------------------------------------------------------------------
2399| Packs the sign `zSign', the exponent `zExp', and the significand formed
2400| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2401| floating-point value, returning the result. After being shifted into the
2402| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2403| added together to form the most significant 64 bits of the result. This
2404| means that any integer portion of `zSig0' will be added into the exponent.
2405| Since a properly normalized significand will have an integer portion equal
2406| to 1, the `zExp' input should be 1 less than the desired result exponent
2407| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2408| significand.
2409*----------------------------------------------------------------------------*/
2410
a49db98d 2411static inline float128
f4014512 2412 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
2413{
2414 float128 z;
2415
2416 z.low = zSig1;
bb98fe42 2417 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
2418 return z;
2419
2420}
2421
2422/*----------------------------------------------------------------------------
2423| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2424| and extended significand formed by the concatenation of `zSig0', `zSig1',
2425| and `zSig2', and returns the proper quadruple-precision floating-point value
2426| corresponding to the abstract input. Ordinarily, the abstract value is
2427| simply rounded and packed into the quadruple-precision format, with the
2428| inexact exception raised if the abstract input cannot be represented
2429| exactly. However, if the abstract value is too large, the overflow and
2430| inexact exceptions are raised and an infinity or maximal finite value is
2431| returned. If the abstract value is too small, the input value is rounded to
2432| a subnormal number, and the underflow and inexact exceptions are raised if
2433| the abstract input cannot be represented exactly as a subnormal quadruple-
2434| precision floating-point number.
2435| The input significand must be normalized or smaller. If the input
2436| significand is not normalized, `zExp' must be 0; in that case, the result
2437| returned is a subnormal number, and it must not require rounding. In the
2438| usual case that the input significand is normalized, `zExp' must be 1 less
2439| than the ``true'' floating-point exponent. The handling of underflow and
2440| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2441*----------------------------------------------------------------------------*/
2442
/*
 * Rounds the abstract value (zSign, zExp, zSig0:zSig1:zSig2) according to
 * status->float_rounding_mode and packs it with packFloat128().  zSig2
 * holds the bits below the result; `increment' records whether zSig0:zSig1
 * must be bumped by one.  Exception flags are reported through `status'.
 */
f4014512 2443static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
2444 uint64_t zSig0, uint64_t zSig1,
2445 uint64_t zSig2, float_status *status)
158142c2 2446{
8f506c70 2447 int8_t roundingMode;
158142c2
FB
2448 flag roundNearestEven, increment, isTiny;
2449
a2f2d288 2450 roundingMode = status->float_rounding_mode;
158142c2 2451 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
2452 switch (roundingMode) {
2453 case float_round_nearest_even:
f9288a76 2454 case float_round_ties_away:
dc355b76
PM
/* Increment when the top bit of zSig2 is set (discarded part >= half). */
2455 increment = ((int64_t)zSig2 < 0);
2456 break;
2457 case float_round_to_zero:
2458 increment = 0;
2459 break;
2460 case float_round_up:
2461 increment = !zSign && zSig2;
2462 break;
2463 case float_round_down:
2464 increment = zSign && zSig2;
2465 break;
9ee6f678
BR
2466 case float_round_to_odd:
/* Inexact and the LSB is still even: increment to make it odd. */
2467 increment = !(zSig1 & 0x1) && zSig2;
2468 break;
dc355b76
PM
2469 default:
2470 abort();
158142c2 2471 }
/*
 * Unsigned compare: true when zExp >= 0x7FFD and also when zExp is
 * negative (it converts to a large unsigned value).
 */
bb98fe42 2472 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
/*
 * Overflow if the exponent is too large, or it is at the limit and the
 * significand is all ones with an increment pending.
 */
2473 if ( ( 0x7FFD < zExp )
2474 || ( ( zExp == 0x7FFD )
2475 && eq128(
2476 LIT64( 0x0001FFFFFFFFFFFF ),
2477 LIT64( 0xFFFFFFFFFFFFFFFF ),
2478 zSig0,
2479 zSig1
2480 )
2481 && increment
2482 )
2483 ) {
ff32e16e 2484 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
/*
 * Modes rounding toward zero for this sign, and round-to-odd, return
 * exponent 0x7FFE with an all-ones fraction; every other mode returns
 * exponent 0x7FFF with a zero fraction.
 */
2485 if ( ( roundingMode == float_round_to_zero )
2486 || ( zSign && ( roundingMode == float_round_up ) )
2487 || ( ! zSign && ( roundingMode == float_round_down ) )
9ee6f678 2488 || (roundingMode == float_round_to_odd)
158142c2
FB
2489 ) {
2490 return
2491 packFloat128(
2492 zSign,
2493 0x7FFE,
2494 LIT64( 0x0000FFFFFFFFFFFF ),
2495 LIT64( 0xFFFFFFFFFFFFFFFF )
2496 );
2497 }
2498 return packFloat128( zSign, 0x7FFF, 0, 0 );
2499 }
2500 if ( zExp < 0 ) {
/* Flush-to-zero mode: return a signed zero and flag output_denormal. */
a2f2d288 2501 if (status->flush_to_zero) {
ff32e16e 2502 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2503 return packFloat128(zSign, 0, 0, 0);
2504 }
/*
 * Tininess: always when detected before rounding; otherwise when zExp is
 * below -1, when no increment is pending, or when the significand is
 * below all ones.
 */
158142c2 2505 isTiny =
a2f2d288
PM
2506 (status->float_detect_tininess
2507 == float_tininess_before_rounding)
158142c2
FB
2508 || ( zExp < -1 )
2509 || ! increment
2510 || lt128(
2511 zSig0,
2512 zSig1,
2513 LIT64( 0x0001FFFFFFFFFFFF ),
2514 LIT64( 0xFFFFFFFFFFFFFFFF )
2515 );
/*
 * Denormalize by -zExp bits.  NOTE(review): shift128ExtraRightJamming is
 * defined outside this chunk; presumably shifted-out bits stay sticky in
 * zSig2.
 */
2516 shift128ExtraRightJamming(
2517 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2518 zExp = 0;
ff32e16e
PM
2519 if (isTiny && zSig2) {
2520 float_raise(float_flag_underflow, status);
2521 }
/* The significand changed in the shift above, so the increment is recomputed. */
dc355b76
PM
2522 switch (roundingMode) {
2523 case float_round_nearest_even:
f9288a76 2524 case float_round_ties_away:
dc355b76
PM
2525 increment = ((int64_t)zSig2 < 0);
2526 break;
2527 case float_round_to_zero:
2528 increment = 0;
2529 break;
2530 case float_round_up:
2531 increment = !zSign && zSig2;
2532 break;
2533 case float_round_down:
2534 increment = zSign && zSig2;
2535 break;
9ee6f678
BR
2536 case float_round_to_odd:
2537 increment = !(zSig1 & 0x1) && zSig2;
2538 break;
dc355b76
PM
2539 default:
2540 abort();
158142c2
FB
2541 }
2542 }
2543 }
/* Any nonzero bit in zSig2 makes the result inexact. */
a2f2d288
PM
2544 if (zSig2) {
2545 status->float_exception_flags |= float_flag_inexact;
2546 }
158142c2
FB
2547 if ( increment ) {
/* NOTE(review): add128 is defined outside this chunk; here it is passed 0:1 as the addend. */
2548 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
/* zSig2 + zSig2 == 0 means no bits below its top bit: in nearest-even mode clear the LSB. */
2549 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2550 }
2551 else {
/* A zero significand is packed with a zero exponent field. */
2552 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2553 }
2554 return packFloat128( zSign, zExp, zSig0, zSig1 );
2555
2556}
2557
2558/*----------------------------------------------------------------------------
2559| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2560| and significand formed by the concatenation of `zSig0' and `zSig1', and
2561| returns the proper quadruple-precision floating-point value corresponding
2562| to the abstract input. This routine is just like `roundAndPackFloat128'
2563| except that the input significand has fewer bits and does not have to be
2564| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
2565| point exponent.
2566*----------------------------------------------------------------------------*/
2567
f4014512 2568static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
2569 uint64_t zSig0, uint64_t zSig1,
2570 float_status *status)
158142c2 2571{
8f506c70 2572 int8_t shiftCount;
bb98fe42 2573 uint64_t zSig2;
158142c2
FB
2574
2575 if ( zSig0 == 0 ) {
2576 zSig0 = zSig1;
2577 zSig1 = 0;
2578 zExp -= 64;
2579 }
2580 shiftCount = countLeadingZeros64( zSig0 ) - 15;
2581 if ( 0 <= shiftCount ) {
2582 zSig2 = 0;
2583 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2584 }
2585 else {
2586 shift128ExtraRightJamming(
2587 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
2588 }
2589 zExp -= shiftCount;
ff32e16e 2590 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
2591
2592}
2593
158142c2
FB
2594/*----------------------------------------------------------------------------
2595| Returns the result of converting the 32-bit two's complement integer `a'
2596| to the single-precision floating-point format. The conversion is performed
2597| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2598*----------------------------------------------------------------------------*/
2599
e5a41ffa 2600float32 int32_to_float32(int32_t a, float_status *status)
158142c2
FB
2601{
2602 flag zSign;
2603
f090c9d4 2604 if ( a == 0 ) return float32_zero;
bb98fe42 2605 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2 2606 zSign = ( a < 0 );
ff32e16e 2607 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
158142c2
FB
2608}
2609
2610/*----------------------------------------------------------------------------
2611| Returns the result of converting the 32-bit two's complement integer `a'
2612| to the double-precision floating-point format. The conversion is performed
2613| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2614*----------------------------------------------------------------------------*/
2615
e5a41ffa 2616float64 int32_to_float64(int32_t a, float_status *status)
158142c2
FB
2617{
2618 flag zSign;
3a87d009 2619 uint32_t absA;
8f506c70 2620 int8_t shiftCount;
bb98fe42 2621 uint64_t zSig;
158142c2 2622
f090c9d4 2623 if ( a == 0 ) return float64_zero;
158142c2
FB
2624 zSign = ( a < 0 );
2625 absA = zSign ? - a : a;
2626 shiftCount = countLeadingZeros32( absA ) + 21;
2627 zSig = absA;
2628 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
2629
2630}
2631
158142c2
FB
2632/*----------------------------------------------------------------------------
2633| Returns the result of converting the 32-bit two's complement integer `a'
2634| to the extended double-precision floating-point format. The conversion
2635| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2636| Arithmetic.
2637*----------------------------------------------------------------------------*/
2638
e5a41ffa 2639floatx80 int32_to_floatx80(int32_t a, float_status *status)
158142c2
FB
2640{
2641 flag zSign;
3a87d009 2642 uint32_t absA;
8f506c70 2643 int8_t shiftCount;
bb98fe42 2644 uint64_t zSig;
158142c2
FB
2645
2646 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2647 zSign = ( a < 0 );
2648 absA = zSign ? - a : a;
2649 shiftCount = countLeadingZeros32( absA ) + 32;
2650 zSig = absA;
2651 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
2652
2653}
2654
158142c2
FB
2655/*----------------------------------------------------------------------------
2656| Returns the result of converting the 32-bit two's complement integer `a' to
2657| the quadruple-precision floating-point format. The conversion is performed
2658| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2659*----------------------------------------------------------------------------*/
2660
e5a41ffa 2661float128 int32_to_float128(int32_t a, float_status *status)
158142c2
FB
2662{
2663 flag zSign;
3a87d009 2664 uint32_t absA;
8f506c70 2665 int8_t shiftCount;
bb98fe42 2666 uint64_t zSig0;
158142c2
FB
2667
2668 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2669 zSign = ( a < 0 );
2670 absA = zSign ? - a : a;
2671 shiftCount = countLeadingZeros32( absA ) + 17;
2672 zSig0 = absA;
2673 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
2674
2675}
2676
158142c2
FB
2677/*----------------------------------------------------------------------------
2678| Returns the result of converting the 64-bit two's complement integer `a'
2679| to the single-precision floating-point format. The conversion is performed
2680| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2681*----------------------------------------------------------------------------*/
2682
e5a41ffa 2683float32 int64_to_float32(int64_t a, float_status *status)
158142c2
FB
2684{
2685 flag zSign;
182f42fd 2686 uint64_t absA;
8f506c70 2687 int8_t shiftCount;
158142c2 2688
f090c9d4 2689 if ( a == 0 ) return float32_zero;
158142c2
FB
2690 zSign = ( a < 0 );
2691 absA = zSign ? - a : a;
2692 shiftCount = countLeadingZeros64( absA ) - 40;
2693 if ( 0 <= shiftCount ) {
2694 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
2695 }
2696 else {
2697 shiftCount += 7;
2698 if ( shiftCount < 0 ) {
2699 shift64RightJamming( absA, - shiftCount, &absA );
2700 }
2701 else {
2702 absA <<= shiftCount;
2703 }
ff32e16e 2704 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
158142c2
FB
2705 }
2706
2707}
2708
2709/*----------------------------------------------------------------------------
2710| Returns the result of converting the 64-bit two's complement integer `a'
2711| to the double-precision floating-point format. The conversion is performed
2712| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2713*----------------------------------------------------------------------------*/
2714
e5a41ffa 2715float64 int64_to_float64(int64_t a, float_status *status)
158142c2
FB
2716{
2717 flag zSign;
2718
f090c9d4 2719 if ( a == 0 ) return float64_zero;
bb98fe42 2720 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
2721 return packFloat64( 1, 0x43E, 0 );
2722 }
2723 zSign = ( a < 0 );
ff32e16e 2724 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
158142c2
FB
2725}
2726
158142c2
FB
2727/*----------------------------------------------------------------------------
2728| Returns the result of converting the 64-bit two's complement integer `a'
2729| to the extended double-precision floating-point format. The conversion
2730| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2731| Arithmetic.
2732*----------------------------------------------------------------------------*/
2733
e5a41ffa 2734floatx80 int64_to_floatx80(int64_t a, float_status *status)
158142c2
FB
2735{
2736 flag zSign;
182f42fd 2737 uint64_t absA;
8f506c70 2738 int8_t shiftCount;
158142c2
FB
2739
2740 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2741 zSign = ( a < 0 );
2742 absA = zSign ? - a : a;
2743 shiftCount = countLeadingZeros64( absA );
2744 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
2745
2746}
2747
158142c2
FB
2748/*----------------------------------------------------------------------------
2749| Returns the result of converting the 64-bit two's complement integer `a' to
2750| the quadruple-precision floating-point format. The conversion is performed
2751| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2752*----------------------------------------------------------------------------*/
2753
e5a41ffa 2754float128 int64_to_float128(int64_t a, float_status *status)
158142c2
FB
2755{
2756 flag zSign;
182f42fd 2757 uint64_t absA;
8f506c70 2758 int8_t shiftCount;
f4014512 2759 int32_t zExp;
bb98fe42 2760 uint64_t zSig0, zSig1;
158142c2
FB
2761
2762 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2763 zSign = ( a < 0 );
2764 absA = zSign ? - a : a;
2765 shiftCount = countLeadingZeros64( absA ) + 49;
2766 zExp = 0x406E - shiftCount;
2767 if ( 64 <= shiftCount ) {
2768 zSig1 = 0;
2769 zSig0 = absA;
2770 shiftCount -= 64;
2771 }
2772 else {
2773 zSig1 = absA;
2774 zSig0 = 0;
2775 }
2776 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2777 return packFloat128( zSign, zExp, zSig0, zSig1 );
2778
2779}
2780
6bb8e0f1
PM
2781/*----------------------------------------------------------------------------
2782| Returns the result of converting the 64-bit unsigned integer `a'
2783| to the single-precision floating-point format. The conversion is performed
2784| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2785*----------------------------------------------------------------------------*/
2786
e5a41ffa 2787float32 uint64_to_float32(uint64_t a, float_status *status)
6bb8e0f1
PM
2788{
2789 int shiftcount;
2790
2791 if (a == 0) {
2792 return float32_zero;
2793 }
2794
2795 /* Determine (left) shift needed to put first set bit into bit posn 23
2796 * (since packFloat32() expects the binary point between bits 23 and 22);
2797 * this is the fast case for smallish numbers.
2798 */
2799 shiftcount = countLeadingZeros64(a) - 40;
2800 if (shiftcount >= 0) {
2801 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
2802 }
2803 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
2804 * expects the binary point between bits 30 and 29, hence the + 7.
2805 */
2806 shiftcount += 7;
2807 if (shiftcount < 0) {
2808 shift64RightJamming(a, -shiftcount, &a);
2809 } else {
2810 a <<= shiftcount;
2811 }
2812
ff32e16e 2813 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
6bb8e0f1
PM
2814}
2815
2816/*----------------------------------------------------------------------------
2817| Returns the result of converting the 64-bit unsigned integer `a'
2818| to the double-precision floating-point format. The conversion is performed
2819| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2820*----------------------------------------------------------------------------*/
2821
e5a41ffa 2822float64 uint64_to_float64(uint64_t a, float_status *status)
6bb8e0f1
PM
2823{
2824 int exp = 0x43C;
2825 int shiftcount;
2826
2827 if (a == 0) {
2828 return float64_zero;
2829 }
2830
2831 shiftcount = countLeadingZeros64(a) - 1;
2832 if (shiftcount < 0) {
2833 shift64RightJamming(a, -shiftcount, &a);
2834 } else {
2835 a <<= shiftcount;
2836 }
ff32e16e 2837 return roundAndPackFloat64(0, exp - shiftcount, a, status);
6bb8e0f1
PM
2838}
2839
2840/*----------------------------------------------------------------------------
2841| Returns the result of converting the 64-bit unsigned integer `a'
2842| to the quadruple-precision floating-point format. The conversion is performed
2843| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2844*----------------------------------------------------------------------------*/
2845
e5a41ffa 2846float128 uint64_to_float128(uint64_t a, float_status *status)
1e397ead
RH
2847{
2848 if (a == 0) {
2849 return float128_zero;
2850 }
ff32e16e 2851 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1e397ead
RH
2852}
2853
158142c2 2854
158142c2 2855
158142c2
FB
2856
2857/*----------------------------------------------------------------------------
2858| Returns the result of converting the single-precision floating-point value
2859| `a' to the double-precision floating-point format. The conversion is
2860| performed according to the IEC/IEEE Standard for Binary Floating-Point
2861| Arithmetic.
2862*----------------------------------------------------------------------------*/
2863
e5a41ffa 2864float64 float32_to_float64(float32 a, float_status *status)
158142c2
FB
2865{
2866 flag aSign;
0c48262d 2867 int aExp;
bb98fe42 2868 uint32_t aSig;
ff32e16e 2869 a = float32_squash_input_denormal(a, status);
158142c2
FB
2870
2871 aSig = extractFloat32Frac( a );
2872 aExp = extractFloat32Exp( a );
2873 aSign = extractFloat32Sign( a );
2874 if ( aExp == 0xFF ) {
ff32e16e
PM
2875 if (aSig) {
2876 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
2877 }
158142c2
FB
2878 return packFloat64( aSign, 0x7FF, 0 );
2879 }
2880 if ( aExp == 0 ) {
2881 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
2882 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2883 --aExp;
2884 }
bb98fe42 2885 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
2886
2887}
2888
158142c2
FB
2889/*----------------------------------------------------------------------------
2890| Returns the result of converting the single-precision floating-point value
2891| `a' to the extended double-precision floating-point format. The conversion
2892| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2893| Arithmetic.
2894*----------------------------------------------------------------------------*/
2895
e5a41ffa 2896floatx80 float32_to_floatx80(float32 a, float_status *status)
158142c2
FB
2897{
2898 flag aSign;
0c48262d 2899 int aExp;
bb98fe42 2900 uint32_t aSig;
158142c2 2901
ff32e16e 2902 a = float32_squash_input_denormal(a, status);
158142c2
FB
2903 aSig = extractFloat32Frac( a );
2904 aExp = extractFloat32Exp( a );
2905 aSign = extractFloat32Sign( a );
2906 if ( aExp == 0xFF ) {
ff32e16e
PM
2907 if (aSig) {
2908 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
2909 }
158142c2
FB
2910 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2911 }
2912 if ( aExp == 0 ) {
2913 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2914 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2915 }
2916 aSig |= 0x00800000;
bb98fe42 2917 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
2918
2919}
2920
158142c2
FB
2921/*----------------------------------------------------------------------------
2922| Returns the result of converting the single-precision floating-point value
2923| `a' to the quadruple-precision floating-point format. The conversion is
2924| performed according to the IEC/IEEE Standard for Binary Floating-Point
2925| Arithmetic.
2926*----------------------------------------------------------------------------*/
2927
e5a41ffa 2928float128 float32_to_float128(float32 a, float_status *status)
158142c2
FB
2929{
2930 flag aSign;
0c48262d 2931 int aExp;
bb98fe42 2932 uint32_t aSig;
158142c2 2933
ff32e16e 2934 a = float32_squash_input_denormal(a, status);
158142c2
FB
2935 aSig = extractFloat32Frac( a );
2936 aExp = extractFloat32Exp( a );
2937 aSign = extractFloat32Sign( a );
2938 if ( aExp == 0xFF ) {
ff32e16e
PM
2939 if (aSig) {
2940 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
2941 }
158142c2
FB
2942 return packFloat128( aSign, 0x7FFF, 0, 0 );
2943 }
2944 if ( aExp == 0 ) {
2945 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2946 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2947 --aExp;
2948 }
bb98fe42 2949 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
2950
2951}
2952
158142c2
FB
2953/*----------------------------------------------------------------------------
2954| Returns the remainder of the single-precision floating-point value `a'
2955| with respect to the corresponding value `b'. The operation is performed
2956| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2957*----------------------------------------------------------------------------*/
2958
e5a41ffa 2959float32 float32_rem(float32 a, float32 b, float_status *status)
158142c2 2960{
/* Computes the remainder of `a' with respect to `b'.  Special operands are
 * handled first; the general case reduces aSig against bSig and finally packs
 * the result with exponent bExp and sign aSign ^ zSign. */
ed086f3d 2961 flag aSign, zSign;
0c48262d 2962 int aExp, bExp, expDiff;
bb98fe42
AF
2963 uint32_t aSig, bSig;
2964 uint32_t q;
2965 uint64_t aSig64, bSig64, q64;
2966 uint32_t alternateASig;
2967 int32_t sigMean;
ff32e16e
PM
2968 a = float32_squash_input_denormal(a, status);
2969 b = float32_squash_input_denormal(b, status);
158142c2
FB
2970
2971 aSig = extractFloat32Frac( a );
2972 aExp = extractFloat32Exp( a );
2973 aSign = extractFloat32Sign( a );
2974 bSig = extractFloat32Frac( b );
2975 bExp = extractFloat32Exp( b );
158142c2
FB
/* `a' is a NaN or an infinity: propagate if either operand is a NaN,
 * otherwise raise invalid and return the default NaN. */
2976 if ( aExp == 0xFF ) {
2977 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 2978 return propagateFloat32NaN(a, b, status);
158142c2 2979 }
ff32e16e 2980 float_raise(float_flag_invalid, status);
af39bc8c 2981 return float32_default_nan(status);
158142c2
FB
2982 }
/* `b' is a NaN: propagate.  `b' is an infinity: the result is `a' itself. */
2983 if ( bExp == 0xFF ) {
ff32e16e
PM
2984 if (bSig) {
2985 return propagateFloat32NaN(a, b, status);
2986 }
158142c2
FB
2987 return a;
2988 }
/* `b' is zero: invalid.  Otherwise `b' is subnormal and gets normalized. */
2989 if ( bExp == 0 ) {
2990 if ( bSig == 0 ) {
ff32e16e 2991 float_raise(float_flag_invalid, status);
af39bc8c 2992 return float32_default_nan(status);
158142c2
FB
2993 }
2994 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2995 }
/* `a' is zero: returned unchanged.  Otherwise normalize the subnormal. */
2996 if ( aExp == 0 ) {
2997 if ( aSig == 0 ) return a;
2998 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2999 }
3000 expDiff = aExp - bExp;
/* Set bit 23 (the leading significand bit) in both operands. */
3001 aSig |= 0x00800000;
3002 bSig |= 0x00800000;
/* Exponent difference below 32: the quotient is computed in 32 bits. */
3003 if ( expDiff < 32 ) {
3004 aSig <<= 8;
3005 bSig <<= 8;
3006 if ( expDiff < 0 ) {
/* bExp exceeds aExp by more than one: `a' is returned unchanged. */
3007 if ( expDiff < -1 ) return a;
3008 aSig >>= 1;
3009 }
3010 q = ( bSig <= aSig );
3011 if ( q ) aSig -= bSig;
3012 if ( 0 < expDiff ) {
bb98fe42 3013 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
3014 q >>= 32 - expDiff;
3015 bSig >>= 2;
3016 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3017 }
3018 else {
3019 aSig >>= 2;
3020 bSig >>= 2;
3021 }
3022 }
/* Exponent difference of 32 or more: reduce in 64-bit steps, consuming 62
 * bits of exponent difference per loop iteration. */
3023 else {
3024 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
3025 aSig64 = ( (uint64_t) aSig )<<40;
3026 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
3027 expDiff -= 64;
3028 while ( 0 < expDiff ) {
3029 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
/* Lower the estimate by 2, clamping at zero. */
3030 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3031 aSig64 = - ( ( bSig * q64 )<<38 );
3032 expDiff -= 62;
3033 }
3034 expDiff += 64;
3035 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3036 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3037 q = q64>>( 64 - expDiff );
3038 bSig <<= 6;
3039 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3040 }
/* Keep subtracting bSig until aSig becomes negative when viewed as int32_t;
 * alternateASig holds the value from just before the last subtraction. */
3041 do {
3042 alternateASig = aSig;
3043 ++q;
3044 aSig -= bSig;
bb98fe42 3045 } while ( 0 <= (int32_t) aSig );
158142c2
FB
/* Pick between the two candidates: fall back to alternateASig when their
 * sum is negative, or when the sum is zero and q is odd. */
3046 sigMean = aSig + alternateASig;
3047 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3048 aSig = alternateASig;
3049 }
/* A negative remainder flips the result sign and is made positive. */
bb98fe42 3050 zSign = ( (int32_t) aSig < 0 );
158142c2 3051 if ( zSign ) aSig = - aSig;
ff32e16e 3052 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
3053}
3054
369be8f6 3055
158142c2
FB
3056/*----------------------------------------------------------------------------
3057| Returns the square root of the single-precision floating-point value `a'.
3058| The operation is performed according to the IEC/IEEE Standard for Binary
3059| Floating-Point Arithmetic.
3060*----------------------------------------------------------------------------*/
3061
e5a41ffa 3062float32 float32_sqrt(float32 a, float_status *status)
158142c2
FB
3063{
3064 flag aSign;
0c48262d 3065 int aExp, zExp;
bb98fe42
AF
3066 uint32_t aSig, zSig;
3067 uint64_t rem, term;
ff32e16e 3068 a = float32_squash_input_denormal(a, status);
158142c2
FB
3069
3070 aSig = extractFloat32Frac( a );
3071 aExp = extractFloat32Exp( a );
3072 aSign = extractFloat32Sign( a );
3073 if ( aExp == 0xFF ) {
ff32e16e
PM
3074 if (aSig) {
3075 return propagateFloat32NaN(a, float32_zero, status);
3076 }
158142c2 3077 if ( ! aSign ) return a;
ff32e16e 3078 float_raise(float_flag_invalid, status);
af39bc8c 3079 return float32_default_nan(status);
158142c2
FB
3080 }
3081 if ( aSign ) {
3082 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 3083 float_raise(float_flag_invalid, status);
af39bc8c 3084 return float32_default_nan(status);
158142c2
FB
3085 }
3086 if ( aExp == 0 ) {
f090c9d4 3087 if ( aSig == 0 ) return float32_zero;
158142c2
FB
3088 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3089 }
3090 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3091 aSig = ( aSig | 0x00800000 )<<8;
3092 zSig = estimateSqrt32( aExp, aSig ) + 2;
3093 if ( ( zSig & 0x7F ) <= 5 ) {
3094 if ( zSig < 2 ) {
3095 zSig = 0x7FFFFFFF;
3096 goto roundAndPack;
3097 }
3098 aSig >>= aExp & 1;
bb98fe42
AF
3099 term = ( (uint64_t) zSig ) * zSig;
3100 rem = ( ( (uint64_t) aSig )<<32 ) - term;
3101 while ( (int64_t) rem < 0 ) {
158142c2 3102 --zSig;
bb98fe42 3103 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
3104 }
3105 zSig |= ( rem != 0 );
3106 }
3107 shift32RightJamming( zSig, 1, &zSig );
3108 roundAndPack:
ff32e16e 3109 return roundAndPackFloat32(0, zExp, zSig, status);
158142c2
FB
3110
3111}
3112
8229c991
AJ
3113/*----------------------------------------------------------------------------
3114| Returns the binary exponential of the single-precision floating-point value
3115| `a'. The operation is performed according to the IEC/IEEE Standard for
3116| Binary Floating-Point Arithmetic.
3117|
3118| Uses the following identities:
3119|
3120| 1. -------------------------------------------------------------------------
3121| x x*ln(2)
3122| 2 = e
3123|
3124| 2. -------------------------------------------------------------------------
3125| 2 3 4 5 n
3126| x x x x x x x
3127| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3128| 1! 2! 3! 4! 5! n!
3129*----------------------------------------------------------------------------*/
3130
3131static const float64 float32_exp2_coefficients[15] =
3132{
d5138cf4
PM
3133 const_float64( 0x3ff0000000000000ll ), /* 1 */
3134 const_float64( 0x3fe0000000000000ll ), /* 2 */
3135 const_float64( 0x3fc5555555555555ll ), /* 3 */
3136 const_float64( 0x3fa5555555555555ll ), /* 4 */
3137 const_float64( 0x3f81111111111111ll ), /* 5 */
3138 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3139 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3140 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3141 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3142 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3143 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3144 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3145 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3146 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3147 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
3148};
3149
e5a41ffa 3150float32 float32_exp2(float32 a, float_status *status)
8229c991
AJ
3151{
3152 flag aSign;
0c48262d 3153 int aExp;
bb98fe42 3154 uint32_t aSig;
8229c991
AJ
3155 float64 r, x, xn;
3156 int i;
ff32e16e 3157 a = float32_squash_input_denormal(a, status);
8229c991
AJ
3158
3159 aSig = extractFloat32Frac( a );
3160 aExp = extractFloat32Exp( a );
3161 aSign = extractFloat32Sign( a );
3162
3163 if ( aExp == 0xFF) {
ff32e16e
PM
3164 if (aSig) {
3165 return propagateFloat32NaN(a, float32_zero, status);
3166 }
8229c991
AJ
3167 return (aSign) ? float32_zero : a;
3168 }
3169 if (aExp == 0) {
3170 if (aSig == 0) return float32_one;
3171 }
3172
ff32e16e 3173 float_raise(float_flag_inexact, status);
8229c991
AJ
3174
3175 /* ******************************* */
3176 /* using float64 for approximation */
3177 /* ******************************* */
ff32e16e
PM
3178 x = float32_to_float64(a, status);
3179 x = float64_mul(x, float64_ln2, status);
8229c991
AJ
3180
3181 xn = x;
3182 r = float64_one;
3183 for (i = 0 ; i < 15 ; i++) {
3184 float64 f;
3185
ff32e16e
PM
3186 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3187 r = float64_add(r, f, status);
8229c991 3188
ff32e16e 3189 xn = float64_mul(xn, x, status);
8229c991
AJ
3190 }
3191
3192 return float64_to_float32(r, status);
3193}
3194
374dfc33
AJ
3195/*----------------------------------------------------------------------------
3196| Returns the binary log of the single-precision floating-point value `a'.
3197| The operation is performed according to the IEC/IEEE Standard for Binary
3198| Floating-Point Arithmetic.
3199*----------------------------------------------------------------------------*/
e5a41ffa 3200float32 float32_log2(float32 a, float_status *status)
374dfc33
AJ
3201{
3202 flag aSign, zSign;
0c48262d 3203 int aExp;
bb98fe42 3204 uint32_t aSig, zSig, i;
374dfc33 3205
ff32e16e 3206 a = float32_squash_input_denormal(a, status);
374dfc33
AJ
3207 aSig = extractFloat32Frac( a );
3208 aExp = extractFloat32Exp( a );
3209 aSign = extractFloat32Sign( a );
3210
3211 if ( aExp == 0 ) {
3212 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3213 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3214 }
3215 if ( aSign ) {
ff32e16e 3216 float_raise(float_flag_invalid, status);
af39bc8c 3217 return float32_default_nan(status);
374dfc33
AJ
3218 }
3219 if ( aExp == 0xFF ) {
ff32e16e
PM
3220 if (aSig) {
3221 return propagateFloat32NaN(a, float32_zero, status);
3222 }
374dfc33
AJ
3223 return a;
3224 }
3225
3226 aExp -= 0x7F;
3227 aSig |= 0x00800000;
3228 zSign = aExp < 0;
3229 zSig = aExp << 23;
3230
3231 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 3232 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
3233 if ( aSig & 0x01000000 ) {
3234 aSig >>= 1;
3235 zSig |= i;
3236 }
3237 }
3238
3239 if ( zSign )
3240 zSig = -zSig;
3241
ff32e16e 3242 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
374dfc33
AJ
3243}
3244
158142c2
FB
3245/*----------------------------------------------------------------------------
3246| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
3247| the corresponding value `b', and 0 otherwise. The invalid exception is
3248| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
3249| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3250*----------------------------------------------------------------------------*/
3251
e5a41ffa 3252int float32_eq(float32 a, float32 b, float_status *status)
158142c2 3253{
b689362d 3254 uint32_t av, bv;
ff32e16e
PM
3255 a = float32_squash_input_denormal(a, status);
3256 b = float32_squash_input_denormal(b, status);
158142c2
FB
3257
3258 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3259 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3260 ) {
ff32e16e 3261 float_raise(float_flag_invalid, status);
158142c2
FB
3262 return 0;
3263 }
b689362d
AJ
3264 av = float32_val(a);
3265 bv = float32_val(b);
3266 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
3267}
3268
3269/*----------------------------------------------------------------------------
3270| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
3271| or equal to the corresponding value `b', and 0 otherwise. The invalid
3272| exception is raised if either operand is a NaN. The comparison is performed
3273| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3274*----------------------------------------------------------------------------*/
3275
e5a41ffa 3276int float32_le(float32 a, float32 b, float_status *status)
158142c2
FB
3277{
3278 flag aSign, bSign;
bb98fe42 3279 uint32_t av, bv;
ff32e16e
PM
3280 a = float32_squash_input_denormal(a, status);
3281 b = float32_squash_input_denormal(b, status);
158142c2
FB
3282
3283 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3284 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3285 ) {
ff32e16e 3286 float_raise(float_flag_invalid, status);
158142c2
FB
3287 return 0;
3288 }
3289 aSign = extractFloat32Sign( a );
3290 bSign = extractFloat32Sign( b );
f090c9d4
PB
3291 av = float32_val(a);
3292 bv = float32_val(b);
bb98fe42 3293 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3294 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3295
3296}
3297
3298/*----------------------------------------------------------------------------
3299| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
3300| the corresponding value `b', and 0 otherwise. The invalid exception is
3301| raised if either operand is a NaN. The comparison is performed according
3302| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3303*----------------------------------------------------------------------------*/
3304
e5a41ffa 3305int float32_lt(float32 a, float32 b, float_status *status)
158142c2
FB
3306{
3307 flag aSign, bSign;
bb98fe42 3308 uint32_t av, bv;
ff32e16e
PM
3309 a = float32_squash_input_denormal(a, status);
3310 b = float32_squash_input_denormal(b, status);
158142c2
FB
3311
3312 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3313 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3314 ) {
ff32e16e 3315 float_raise(float_flag_invalid, status);
158142c2
FB
3316 return 0;
3317 }
3318 aSign = extractFloat32Sign( a );
3319 bSign = extractFloat32Sign( b );
f090c9d4
PB
3320 av = float32_val(a);
3321 bv = float32_val(b);
bb98fe42 3322 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3323 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3324
3325}
3326
67b7861d
AJ
3327/*----------------------------------------------------------------------------
3328| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
3329| be compared, and 0 otherwise. The invalid exception is raised if either
3330| operand is a NaN. The comparison is performed according to the IEC/IEEE
3331| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
3332*----------------------------------------------------------------------------*/
3333
e5a41ffa 3334int float32_unordered(float32 a, float32 b, float_status *status)
67b7861d 3335{
ff32e16e
PM
3336 a = float32_squash_input_denormal(a, status);
3337 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
3338
3339 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3340 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3341 ) {
ff32e16e 3342 float_raise(float_flag_invalid, status);
67b7861d
AJ
3343 return 1;
3344 }
3345 return 0;
3346}
b689362d 3347
158142c2
FB
3348/*----------------------------------------------------------------------------
3349| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
3350| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3351| exception. The comparison is performed according to the IEC/IEEE Standard
3352| for Binary Floating-Point Arithmetic.
158142c2
FB
3353*----------------------------------------------------------------------------*/
3354
e5a41ffa 3355int float32_eq_quiet(float32 a, float32 b, float_status *status)
158142c2 3356{
ff32e16e
PM
3357 a = float32_squash_input_denormal(a, status);
3358 b = float32_squash_input_denormal(b, status);
158142c2
FB
3359
3360 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3361 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3362 ) {
af39bc8c
AM
3363 if (float32_is_signaling_nan(a, status)
3364 || float32_is_signaling_nan(b, status)) {
ff32e16e 3365 float_raise(float_flag_invalid, status);
b689362d 3366 }
158142c2
FB
3367 return 0;
3368 }
b689362d
AJ
3369 return ( float32_val(a) == float32_val(b) ) ||
3370 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
3371}
3372
3373/*----------------------------------------------------------------------------
3374| Returns 1 if the single-precision floating-point value `a' is less than or
3375| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3376| cause an exception. Otherwise, the comparison is performed according to the
3377| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3378*----------------------------------------------------------------------------*/
3379
e5a41ffa 3380int float32_le_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
3381{
3382 flag aSign, bSign;
bb98fe42 3383 uint32_t av, bv;
ff32e16e
PM
3384 a = float32_squash_input_denormal(a, status);
3385 b = float32_squash_input_denormal(b, status);
158142c2
FB
3386
3387 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3388 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3389 ) {
af39bc8c
AM
3390 if (float32_is_signaling_nan(a, status)
3391 || float32_is_signaling_nan(b, status)) {
ff32e16e 3392 float_raise(float_flag_invalid, status);
158142c2
FB
3393 }
3394 return 0;
3395 }
3396 aSign = extractFloat32Sign( a );
3397 bSign = extractFloat32Sign( b );
f090c9d4
PB
3398 av = float32_val(a);
3399 bv = float32_val(b);
bb98fe42 3400 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3401 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3402
3403}
3404
3405/*----------------------------------------------------------------------------
3406| Returns 1 if the single-precision floating-point value `a' is less than
3407| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3408| exception. Otherwise, the comparison is performed according to the IEC/IEEE
ab52f973 3409| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3410*----------------------------------------------------------------------------*/
3411
ab52f973 3412int float32_lt_quiet(float32 a, float32 b, float_status *status)
158142c2 3413{
ab52f973
AB
3414 flag aSign, bSign;
3415 uint32_t av, bv;
3416 a = float32_squash_input_denormal(a, status);
3417 b = float32_squash_input_denormal(b, status);
158142c2 3418
ab52f973
AB
3419 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3420 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3421 ) {
3422 if (float32_is_signaling_nan(a, status)
3423 || float32_is_signaling_nan(b, status)) {
ff32e16e 3424 float_raise(float_flag_invalid, status);
158142c2 3425 }
ab52f973 3426 return 0;
158142c2 3427 }
ab52f973
AB
3428 aSign = extractFloat32Sign( a );
3429 bSign = extractFloat32Sign( b );
3430 av = float32_val(a);
3431 bv = float32_val(b);
3432 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3433 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3434
3435}
3436
3437/*----------------------------------------------------------------------------
ab52f973
AB
3438| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3439| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3440| comparison is performed according to the IEC/IEEE Standard for Binary
3441| Floating-Point Arithmetic.
158142c2
FB
3442*----------------------------------------------------------------------------*/
3443
ab52f973 3444int float32_unordered_quiet(float32 a, float32 b, float_status *status)
158142c2 3445{
ab52f973
AB
3446 a = float32_squash_input_denormal(a, status);
3447 b = float32_squash_input_denormal(b, status);
158142c2 3448
ab52f973
AB
3449 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3450 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3451 ) {
3452 if (float32_is_signaling_nan(a, status)
3453 || float32_is_signaling_nan(b, status)) {
3454 float_raise(float_flag_invalid, status);
158142c2 3455 }
ab52f973 3456 return 1;
158142c2 3457 }
ab52f973 3458 return 0;
158142c2
FB
3459}
3460
ab52f973 3461
158142c2
FB
3462/*----------------------------------------------------------------------------
3463| Returns the result of converting the double-precision floating-point value
3464| `a' to the single-precision floating-point format. The conversion is
3465| performed according to the IEC/IEEE Standard for Binary Floating-Point
3466| Arithmetic.
3467*----------------------------------------------------------------------------*/
3468
e5a41ffa 3469float32 float64_to_float32(float64 a, float_status *status)
158142c2
FB
3470{
3471 flag aSign;
0c48262d 3472 int aExp;
bb98fe42
AF
3473 uint64_t aSig;
3474 uint32_t zSig;
ff32e16e 3475 a = float64_squash_input_denormal(a, status);
158142c2
FB
3476
3477 aSig = extractFloat64Frac( a );
3478 aExp = extractFloat64Exp( a );
3479 aSign = extractFloat64Sign( a );
3480 if ( aExp == 0x7FF ) {
ff32e16e
PM
3481 if (aSig) {
3482 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3483 }
158142c2
FB
3484 return packFloat32( aSign, 0xFF, 0 );
3485 }
3486 shift64RightJamming( aSig, 22, &aSig );
3487 zSig = aSig;
3488 if ( aExp || zSig ) {
3489 zSig |= 0x40000000;
3490 aExp -= 0x381;
3491 }
ff32e16e 3492 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
3493
3494}
3495
60011498
PB
3496
3497/*----------------------------------------------------------------------------
3498| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3499| half-precision floating-point value, returning the result. After being
3500| shifted into the proper positions, the three fields are simply added
3501| together to form the result. This means that any integer portion of `zSig'
3502| will be added into the exponent. Since a properly normalized significand
3503| will have an integer portion equal to 1, the `zExp' input should be 1 less
3504| than the desired result exponent whenever `zSig' is a complete, normalized
3505| significand.
3506*----------------------------------------------------------------------------*/
0c48262d 3507static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
60011498 3508{
bb4d4bb3 3509 return make_float16(
bb98fe42 3510 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3511}
3512
c4a1c5e7
PM
3513/*----------------------------------------------------------------------------
3514| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3515| and significand `zSig', and returns the proper half-precision floating-
3516| point value corresponding to the abstract input. Ordinarily, the abstract
3517| value is simply rounded and packed into the half-precision format, with
3518| the inexact exception raised if the abstract input cannot be represented
3519| exactly. However, if the abstract value is too large, the overflow and
3520| inexact exceptions are raised and an infinity or maximal finite value is
3521| returned. If the abstract value is too small, the input value is rounded to
3522| a subnormal number, and the underflow and inexact exceptions are raised if
3523| the abstract input cannot be represented exactly as a subnormal half-
3524| precision floating-point number.
3525| The `ieee' flag indicates whether to use IEEE standard half precision, or
3526| ARM-style "alternative representation", which omits the NaN and Inf
3527| encodings in order to raise the maximum representable exponent by one.
3528| The input significand `zSig' has its binary point between bits 22
3529| and 23, which is 13 bits to the left of the usual location. This shifted
3530| significand must be normalized or smaller. If `zSig' is not normalized,
3531| `zExp' must be 0; in that case, the result returned is a subnormal number,
3532| and it must not require rounding. In the usual case that `zSig' is
3533| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3534| Note the slightly odd position of the binary point in zSig compared with the
3535| other roundAndPackFloat functions. This should probably be fixed if we
3536| need to implement more float16 routines than just conversion.
3537| The handling of underflow and overflow follows the IEC/IEEE Standard for
3538| Binary Floating-Point Arithmetic.
3539*----------------------------------------------------------------------------*/
3540
0c48262d 3541static float16 roundAndPackFloat16(flag zSign, int zExp,
e5a41ffa
PM
3542 uint32_t zSig, flag ieee,
3543 float_status *status)
c4a1c5e7
PM
3544{
3545 int maxexp = ieee ? 29 : 30;
3546 uint32_t mask;
3547 uint32_t increment;
c4a1c5e7
PM
3548 bool rounding_bumps_exp;
3549 bool is_tiny = false;
3550
3551 /* Calculate the mask of bits of the mantissa which are not
3552 * representable in half-precision and will be lost.
3553 */
3554 if (zExp < 1) {
3555 /* Will be denormal in halfprec */
3556 mask = 0x00ffffff;
3557 if (zExp >= -11) {
3558 mask >>= 11 + zExp;
3559 }
3560 } else {
3561 /* Normal number in halfprec */
3562 mask = 0x00001fff;
3563 }
3564
a2f2d288 3565 switch (status->float_rounding_mode) {
c4a1c5e7
PM
3566 case float_round_nearest_even:
3567 increment = (mask + 1) >> 1;
3568 if ((zSig & mask) == increment) {
3569 increment = zSig & (increment << 1);
3570 }
3571 break;
f9288a76
PM
3572 case float_round_ties_away:
3573 increment = (mask + 1) >> 1;
3574 break;
c4a1c5e7
PM
3575 case float_round_up:
3576 increment = zSign ? 0 : mask;
3577 break;
3578 case float_round_down:
3579 increment = zSign ? mask : 0;
3580 break;
3581 default: /* round_to_zero */
3582 increment = 0;
3583 break;
3584 }
3585
3586 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3587
3588 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3589 if (ieee) {
ff32e16e 3590 float_raise(float_flag_overflow | float_flag_inexact, status);
c4a1c5e7
PM
3591 return packFloat16(zSign, 0x1f, 0);
3592 } else {
ff32e16e 3593 float_raise(float_flag_invalid, status);
c4a1c5e7
PM
3594 return packFloat16(zSign, 0x1f, 0x3ff);
3595 }
3596 }
3597
3598 if (zExp < 0) {
3599 /* Note that flush-to-zero does not affect half-precision results */
3600 is_tiny =
a2f2d288 3601 (status->float_detect_tininess == float_tininess_before_rounding)
c4a1c5e7
PM
3602 || (zExp < -1)
3603 || (!rounding_bumps_exp);
3604 }
3605 if (zSig & mask) {
ff32e16e 3606 float_raise(float_flag_inexact, status);
c4a1c5e7 3607 if (is_tiny) {
ff32e16e 3608 float_raise(float_flag_underflow, status);
c4a1c5e7
PM
3609 }
3610 }
3611
3612 zSig += increment;
3613 if (rounding_bumps_exp) {
3614 zSig >>= 1;
3615 zExp++;
3616 }
3617
3618 if (zExp < -10) {
3619 return packFloat16(zSign, 0, 0);
3620 }
3621 if (zExp < 0) {
3622 zSig >>= -zExp;
3623 zExp = 0;
3624 }
3625 return packFloat16(zSign, zExp, zSig >> 13);
3626}
3627
210cbd49
AB
3628/*----------------------------------------------------------------------------
3629| If `a' is denormal and we are in flush-to-zero mode then set the
3630| input-denormal exception and return zero. Otherwise just return the value.
3631*----------------------------------------------------------------------------*/
3632float16 float16_squash_input_denormal(float16 a, float_status *status)
3633{
3634 if (status->flush_inputs_to_zero) {
3635 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3636 float_raise(float_flag_input_denormal, status);
3637 return make_float16(float16_val(a) & 0x8000);
3638 }
3639 }
3640 return a;
3641}
3642
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /*
     * Left-shift distance derived from the leading-zero count of the
     * subnormal significand `aSig'; the shifted significand is stored in
     * *zSigPtr and the matching exponent in *zExpPtr.
     */
    int8_t leftShift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3650
60011498
PB
3651/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3652 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3 3653
e5a41ffa 3654float32 float16_to_float32(float16 a, flag ieee, float_status *status)
60011498
PB
3655{
3656 flag aSign;
0c48262d 3657 int aExp;
bb98fe42 3658 uint32_t aSig;
60011498 3659
bb4d4bb3
PM
3660 aSign = extractFloat16Sign(a);
3661 aExp = extractFloat16Exp(a);
3662 aSig = extractFloat16Frac(a);
60011498
PB
3663
3664 if (aExp == 0x1f && ieee) {
3665 if (aSig) {
ff32e16e 3666 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
60011498 3667 }
4be8eeac 3668 return packFloat32(aSign, 0xff, 0);
60011498
PB
3669 }
3670 if (aExp == 0) {
60011498
PB
3671 if (aSig == 0) {
3672 return packFloat32(aSign, 0, 0);
3673 }
3674
c4a1c5e7
PM
3675 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3676 aExp--;
60011498
PB
3677 }
3678 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3679}
3680
e5a41ffa 3681float16 float32_to_float16(float32 a, flag ieee, float_status *status)
60011498
PB
3682{
3683 flag aSign;
0c48262d 3684 int aExp;
bb98fe42 3685 uint32_t aSig;
38970efa 3686
ff32e16e 3687 a = float32_squash_input_denormal(a, status);
60011498
PB
3688
3689 aSig = extractFloat32Frac( a );
3690 aExp = extractFloat32Exp( a );
3691 aSign = extractFloat32Sign( a );
3692 if ( aExp == 0xFF ) {
3693 if (aSig) {
600e30d2 3694 /* Input is a NaN */
600e30d2 3695 if (!ieee) {
ff32e16e 3696 float_raise(float_flag_invalid, status);
600e30d2
PM
3697 return packFloat16(aSign, 0, 0);
3698 }
38970efa 3699 return commonNaNToFloat16(
ff32e16e 3700 float32ToCommonNaN(a, status), status);
60011498 3701 }
600e30d2
PM
3702 /* Infinity */
3703 if (!ieee) {
ff32e16e 3704 float_raise(float_flag_invalid, status);
600e30d2
PM
3705 return packFloat16(aSign, 0x1f, 0x3ff);
3706 }
3707 return packFloat16(aSign, 0x1f, 0);
60011498 3708 }
600e30d2 3709 if (aExp == 0 && aSig == 0) {
60011498
PB
3710 return packFloat16(aSign, 0, 0);
3711 }
38970efa
PM
3712 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3713 * even if the input is denormal; however this is harmless because
3714 * the largest possible single-precision denormal is still smaller
3715 * than the smallest representable half-precision denormal, and so we
3716 * will end up ignoring aSig and returning via the "always return zero"
3717 * codepath.
3718 */
60011498 3719 aSig |= 0x00800000;
c4a1c5e7 3720 aExp -= 0x71;
60011498 3721
ff32e16e 3722 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
60011498
PB
3723}
3724
e5a41ffa 3725float64 float16_to_float64(float16 a, flag ieee, float_status *status)
14c9a07e
PM
3726{
3727 flag aSign;
0c48262d 3728 int aExp;
14c9a07e
PM
3729 uint32_t aSig;
3730
3731 aSign = extractFloat16Sign(a);
3732 aExp = extractFloat16Exp(a);
3733 aSig = extractFloat16Frac(a);
3734
3735 if (aExp == 0x1f && ieee) {
3736 if (aSig) {
3737 return commonNaNToFloat64(
ff32e16e 3738 float16ToCommonNaN(a, status), status);
14c9a07e
PM
3739 }
3740 return packFloat64(aSign, 0x7ff, 0);
3741 }
3742 if (aExp == 0) {
3743 if (aSig == 0) {
3744 return packFloat64(aSign, 0, 0);
3745 }
3746
3747 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3748 aExp--;
3749 }
3750 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3751}
3752
e5a41ffa 3753float16 float64_to_float16(float64 a, flag ieee, float_status *status)
14c9a07e
PM
3754{
3755 flag aSign;
0c48262d 3756 int aExp;
14c9a07e
PM
3757 uint64_t aSig;
3758 uint32_t zSig;
3759
ff32e16e 3760 a = float64_squash_input_denormal(a, status);
14c9a07e
PM
3761
3762 aSig = extractFloat64Frac(a);
3763 aExp = extractFloat64Exp(a);
3764 aSign = extractFloat64Sign(a);
3765 if (aExp == 0x7FF) {
3766 if (aSig) {
3767 /* Input is a NaN */
3768 if (!ieee) {
ff32e16e 3769 float_raise(float_flag_invalid, status);
14c9a07e
PM
3770 return packFloat16(aSign, 0, 0);
3771 }
3772 return commonNaNToFloat16(
ff32e16e 3773 float64ToCommonNaN(a, status), status);
14c9a07e
PM
3774 }
3775 /* Infinity */
3776 if (!ieee) {
ff32e16e 3777 float_raise(float_flag_invalid, status);
14c9a07e
PM
3778 return packFloat16(aSign, 0x1f, 0x3ff);
3779 }
3780 return packFloat16(aSign, 0x1f, 0);
3781 }
3782 shift64RightJamming(aSig, 29, &aSig);
3783 zSig = aSig;
3784 if (aExp == 0 && zSig == 0) {
3785 return packFloat16(aSign, 0, 0);
3786 }
3787 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3788 * even if the input is denormal; however this is harmless because
3789 * the largest possible single-precision denormal is still smaller
3790 * than the smallest representable half-precision denormal, and so we
3791 * will end up ignoring aSig and returning via the "always return zero"
3792 * codepath.
3793 */
3794 zSig |= 0x00800000;
3795 aExp -= 0x3F1;
3796
ff32e16e 3797 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
14c9a07e
PM
3798}
3799
158142c2
FB
3800/*----------------------------------------------------------------------------
3801| Returns the result of converting the double-precision floating-point value
3802| `a' to the extended double-precision floating-point format. The conversion
3803| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3804| Arithmetic.
3805*----------------------------------------------------------------------------*/
3806
e5a41ffa 3807floatx80 float64_to_floatx80(float64 a, float_status *status)
158142c2
FB
3808{
3809 flag aSign;
0c48262d 3810 int aExp;
bb98fe42 3811 uint64_t aSig;
158142c2 3812
ff32e16e 3813 a = float64_squash_input_denormal(a, status);
158142c2
FB
3814 aSig = extractFloat64Frac( a );
3815 aExp = extractFloat64Exp( a );
3816 aSign = extractFloat64Sign( a );
3817 if ( aExp == 0x7FF ) {
ff32e16e
PM
3818 if (aSig) {
3819 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3820 }
158142c2
FB
3821 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3822 }
3823 if ( aExp == 0 ) {
3824 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3825 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3826 }
3827 return
3828 packFloatx80(
3829 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3830
3831}
3832
158142c2
FB
3833/*----------------------------------------------------------------------------
3834| Returns the result of converting the double-precision floating-point value
3835| `a' to the quadruple-precision floating-point format. The conversion is
3836| performed according to the IEC/IEEE Standard for Binary Floating-Point
3837| Arithmetic.
3838*----------------------------------------------------------------------------*/
3839
e5a41ffa 3840float128 float64_to_float128(float64 a, float_status *status)
158142c2
FB
3841{
3842 flag aSign;
0c48262d 3843 int aExp;
bb98fe42 3844 uint64_t aSig, zSig0, zSig1;
158142c2 3845
ff32e16e 3846 a = float64_squash_input_denormal(a, status);
158142c2
FB
3847 aSig = extractFloat64Frac( a );
3848 aExp = extractFloat64Exp( a );
3849 aSign = extractFloat64Sign( a );
3850 if ( aExp == 0x7FF ) {
ff32e16e
PM
3851 if (aSig) {
3852 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3853 }
158142c2
FB
3854 return packFloat128( aSign, 0x7FFF, 0, 0 );
3855 }
3856 if ( aExp == 0 ) {
3857 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3858 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3859 --aExp;
3860 }
3861 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3862 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3863
3864}
3865
158142c2
FB
3866
3867/*----------------------------------------------------------------------------
3868| Returns the remainder of the double-precision floating-point value `a'
3869| with respect to the corresponding value `b'. The operation is performed
3870| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3871*----------------------------------------------------------------------------*/
3872
e5a41ffa 3873float64 float64_rem(float64 a, float64 b, float_status *status)
158142c2 3874{
ed086f3d 3875 flag aSign, zSign;
0c48262d 3876 int aExp, bExp, expDiff;
bb98fe42
AF
3877 uint64_t aSig, bSig;
3878 uint64_t q, alternateASig;
3879 int64_t sigMean;
158142c2 3880
ff32e16e
PM
3881 a = float64_squash_input_denormal(a, status);
3882 b = float64_squash_input_denormal(b, status);
158142c2
FB
3883 aSig = extractFloat64Frac( a );
3884 aExp = extractFloat64Exp( a );
3885 aSign = extractFloat64Sign( a );
3886 bSig = extractFloat64Frac( b );
3887 bExp = extractFloat64Exp( b );
158142c2
FB
3888 if ( aExp == 0x7FF ) {
3889 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 3890 return propagateFloat64NaN(a, b, status);
158142c2 3891 }
ff32e16e 3892 float_raise(float_flag_invalid, status);
af39bc8c 3893 return float64_default_nan(status);
158142c2
FB
3894 }
3895 if ( bExp == 0x7FF ) {
ff32e16e
PM
3896 if (bSig) {
3897 return propagateFloat64NaN(a, b, status);
3898 }
158142c2
FB
3899 return a;
3900 }
3901 if ( bExp == 0 ) {
3902 if ( bSig == 0 ) {
ff32e16e 3903 float_raise(float_flag_invalid, status);
af39bc8c 3904 return float64_default_nan(status);
158142c2
FB
3905 }
3906 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3907 }
3908 if ( aExp == 0 ) {
3909 if ( aSig == 0 ) return a;
3910 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3911 }
3912 expDiff = aExp - bExp;
3913 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3914 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3915 if ( expDiff < 0 ) {
3916 if ( expDiff < -1 ) return a;
3917 aSig >>= 1;
3918 }
3919 q = ( bSig <= aSig );
3920 if ( q ) aSig -= bSig;
3921 expDiff -= 64;
3922 while ( 0 < expDiff ) {
3923 q = estimateDiv128To64( aSig, 0, bSig );
3924 q = ( 2 < q ) ? q - 2 : 0;
3925 aSig = - ( ( bSig>>2 ) * q );
3926 expDiff -= 62;
3927 }
3928 expDiff += 64;
3929 if ( 0 < expDiff ) {
3930 q = estimateDiv128To64( aSig, 0, bSig );
3931 q = ( 2 < q ) ? q - 2 : 0;
3932 q >>= 64 - expDiff;
3933 bSig >>= 2;
3934 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3935 }
3936 else {
3937 aSig >>= 2;
3938 bSig >>= 2;
3939 }
3940 do {
3941 alternateASig = aSig;
3942 ++q;
3943 aSig -= bSig;
bb98fe42 3944 } while ( 0 <= (int64_t) aSig );
158142c2
FB
3945 sigMean = aSig + alternateASig;
3946 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3947 aSig = alternateASig;
3948 }
bb98fe42 3949 zSign = ( (int64_t) aSig < 0 );
158142c2 3950 if ( zSign ) aSig = - aSig;
ff32e16e 3951 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
3952
3953}
3954
369be8f6 3955
158142c2
FB
3956/*----------------------------------------------------------------------------
3957| Returns the square root of the double-precision floating-point value `a'.
3958| The operation is performed according to the IEC/IEEE Standard for Binary
3959| Floating-Point Arithmetic.
3960*----------------------------------------------------------------------------*/
3961
e5a41ffa 3962float64 float64_sqrt(float64 a, float_status *status)
158142c2
FB
3963{
3964 flag aSign;
0c48262d 3965 int aExp, zExp;
bb98fe42
AF
3966 uint64_t aSig, zSig, doubleZSig;
3967 uint64_t rem0, rem1, term0, term1;
ff32e16e 3968 a = float64_squash_input_denormal(a, status);
158142c2
FB
3969
3970 aSig = extractFloat64Frac( a );
3971 aExp = extractFloat64Exp( a );
3972 aSign = extractFloat64Sign( a );
3973 if ( aExp == 0x7FF ) {
ff32e16e
PM
3974 if (aSig) {
3975 return propagateFloat64NaN(a, a, status);
3976 }
158142c2 3977 if ( ! aSign ) return a;
ff32e16e 3978 float_raise(float_flag_invalid, status);
af39bc8c 3979 return float64_default_nan(status);
158142c2
FB
3980 }
3981 if ( aSign ) {
3982 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 3983 float_raise(float_flag_invalid, status);
af39bc8c 3984 return float64_default_nan(status);
158142c2
FB
3985 }
3986 if ( aExp == 0 ) {
f090c9d4 3987 if ( aSig == 0 ) return float64_zero;
158142c2
FB
3988 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3989 }
3990 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3991 aSig |= LIT64( 0x0010000000000000 );
3992 zSig = estimateSqrt32( aExp, aSig>>21 );
3993 aSig <<= 9 - ( aExp & 1 );
3994 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3995 if ( ( zSig & 0x1FF ) <= 5 ) {
3996 doubleZSig = zSig<<1;
3997 mul64To128( zSig, zSig, &term0, &term1 );
3998 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3999 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4000 --zSig;
4001 doubleZSig -= 2;
4002 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4003 }
4004 zSig |= ( ( rem0 | rem1 ) != 0 );
4005 }
ff32e16e 4006 return roundAndPackFloat64(0, zExp, zSig, status);
158142c2
FB
4007
4008}
4009
374dfc33
AJ
4010/*----------------------------------------------------------------------------
4011| Returns the binary log of the double-precision floating-point value `a'.
4012| The operation is performed according to the IEC/IEEE Standard for Binary
4013| Floating-Point Arithmetic.
4014*----------------------------------------------------------------------------*/
e5a41ffa 4015float64 float64_log2(float64 a, float_status *status)
374dfc33
AJ
4016{
4017 flag aSign, zSign;
0c48262d 4018 int aExp;
bb98fe42 4019 uint64_t aSig, aSig0, aSig1, zSig, i;
ff32e16e 4020 a = float64_squash_input_denormal(a, status);
374dfc33
AJ
4021
4022 aSig = extractFloat64Frac( a );
4023 aExp = extractFloat64Exp( a );
4024 aSign = extractFloat64Sign( a );
4025
4026 if ( aExp == 0 ) {
4027 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4028 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4029 }
4030 if ( aSign ) {
ff32e16e 4031 float_raise(float_flag_invalid, status);
af39bc8c 4032 return float64_default_nan(status);
374dfc33
AJ
4033 }
4034 if ( aExp == 0x7FF ) {
ff32e16e
PM
4035 if (aSig) {
4036 return propagateFloat64NaN(a, float64_zero, status);
4037 }
374dfc33
AJ
4038 return a;
4039 }
4040
4041 aExp -= 0x3FF;
4042 aSig |= LIT64( 0x0010000000000000 );
4043 zSign = aExp < 0;
bb98fe42 4044 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4045 for (i = 1LL << 51; i > 0; i >>= 1) {
4046 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4047 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4048 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4049 aSig >>= 1;
4050 zSig |= i;
4051 }
4052 }
4053
4054 if ( zSign )
4055 zSig = -zSig;
ff32e16e 4056 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
374dfc33
AJ
4057}
4058
158142c2
FB
4059/*----------------------------------------------------------------------------
4060| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4061| corresponding value `b', and 0 otherwise. The invalid exception is raised
4062| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4063| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4064*----------------------------------------------------------------------------*/
4065
e5a41ffa 4066int float64_eq(float64 a, float64 b, float_status *status)
158142c2 4067{
bb98fe42 4068 uint64_t av, bv;
ff32e16e
PM
4069 a = float64_squash_input_denormal(a, status);
4070 b = float64_squash_input_denormal(b, status);
158142c2
FB
4071
4072 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4073 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4074 ) {
ff32e16e 4075 float_raise(float_flag_invalid, status);
158142c2
FB
4076 return 0;
4077 }
f090c9d4 4078 av = float64_val(a);
a1b91bb4 4079 bv = float64_val(b);
bb98fe42 4080 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4081
4082}
4083
4084/*----------------------------------------------------------------------------
4085| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4086| equal to the corresponding value `b', and 0 otherwise. The invalid
4087| exception is raised if either operand is a NaN. The comparison is performed
4088| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4089*----------------------------------------------------------------------------*/
4090
e5a41ffa 4091int float64_le(float64 a, float64 b, float_status *status)
158142c2
FB
4092{
4093 flag aSign, bSign;
bb98fe42 4094 uint64_t av, bv;
ff32e16e
PM
4095 a = float64_squash_input_denormal(a, status);
4096 b = float64_squash_input_denormal(b, status);
158142c2
FB
4097
4098 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4099 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4100 ) {
ff32e16e 4101 float_raise(float_flag_invalid, status);
158142c2
FB
4102 return 0;
4103 }
4104 aSign = extractFloat64Sign( a );
4105 bSign = extractFloat64Sign( b );
f090c9d4 4106 av = float64_val(a);
a1b91bb4 4107 bv = float64_val(b);
bb98fe42 4108 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4109 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4110
4111}
4112
4113/*----------------------------------------------------------------------------
4114| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4115| the corresponding value `b', and 0 otherwise. The invalid exception is
4116| raised if either operand is a NaN. The comparison is performed according
4117| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4118*----------------------------------------------------------------------------*/
4119
e5a41ffa 4120int float64_lt(float64 a, float64 b, float_status *status)
158142c2
FB
4121{
4122 flag aSign, bSign;
bb98fe42 4123 uint64_t av, bv;
158142c2 4124
ff32e16e
PM
4125 a = float64_squash_input_denormal(a, status);
4126 b = float64_squash_input_denormal(b, status);
158142c2
FB
4127 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4128 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4129 ) {
ff32e16e 4130 float_raise(float_flag_invalid, status);
158142c2
FB
4131 return 0;
4132 }
4133 aSign = extractFloat64Sign( a );
4134 bSign = extractFloat64Sign( b );
f090c9d4 4135 av = float64_val(a);
a1b91bb4 4136 bv = float64_val(b);
bb98fe42 4137 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4138 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4139
4140}
4141
67b7861d
AJ
4142/*----------------------------------------------------------------------------
4143| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4144| be compared, and 0 otherwise. The invalid exception is raised if either
4145| operand is a NaN. The comparison is performed according to the IEC/IEEE
4146| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4147*----------------------------------------------------------------------------*/
4148
e5a41ffa 4149int float64_unordered(float64 a, float64 b, float_status *status)
67b7861d 4150{
ff32e16e
PM
4151 a = float64_squash_input_denormal(a, status);
4152 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4153
4154 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4155 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4156 ) {
ff32e16e 4157 float_raise(float_flag_invalid, status);
67b7861d
AJ
4158 return 1;
4159 }
4160 return 0;
4161}
4162
158142c2
FB
4163/*----------------------------------------------------------------------------
4164| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4165| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4166| exception. The comparison is performed according to the IEC/IEEE Standard
4167| for Binary Floating-Point Arithmetic.
158142c2
FB
4168*----------------------------------------------------------------------------*/
4169
e5a41ffa 4170int float64_eq_quiet(float64 a, float64 b, float_status *status)
158142c2 4171{
bb98fe42 4172 uint64_t av, bv;
ff32e16e
PM
4173 a = float64_squash_input_denormal(a, status);
4174 b = float64_squash_input_denormal(b, status);
158142c2
FB
4175
4176 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4177 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4178 ) {
af39bc8c
AM
4179 if (float64_is_signaling_nan(a, status)
4180 || float64_is_signaling_nan(b, status)) {
ff32e16e 4181 float_raise(float_flag_invalid, status);
b689362d 4182 }
158142c2
FB
4183 return 0;
4184 }
f090c9d4 4185 av = float64_val(a);
a1b91bb4 4186 bv = float64_val(b);
bb98fe42 4187 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4188
4189}
4190
4191/*----------------------------------------------------------------------------
4192| Returns 1 if the double-precision floating-point value `a' is less than or
4193| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4194| cause an exception. Otherwise, the comparison is performed according to the
4195| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4196*----------------------------------------------------------------------------*/
4197
e5a41ffa 4198int float64_le_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4199{
4200 flag aSign, bSign;
bb98fe42 4201 uint64_t av, bv;
ff32e16e
PM
4202 a = float64_squash_input_denormal(a, status);
4203 b = float64_squash_input_denormal(b, status);
158142c2
FB
4204
4205 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4206 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4207 ) {
af39bc8c
AM
4208 if (float64_is_signaling_nan(a, status)
4209 || float64_is_signaling_nan(b, status)) {
ff32e16e 4210 float_raise(float_flag_invalid, status);
158142c2
FB
4211 }
4212 return 0;
4213 }
4214 aSign = extractFloat64Sign( a );
4215 bSign = extractFloat64Sign( b );
f090c9d4 4216 av = float64_val(a);
a1b91bb4 4217 bv = float64_val(b);
bb98fe42 4218 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4219 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4220
4221}
4222
4223/*----------------------------------------------------------------------------
4224| Returns 1 if the double-precision floating-point value `a' is less than
4225| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4226| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4227| Standard for Binary Floating-Point Arithmetic.
4228*----------------------------------------------------------------------------*/
4229
e5a41ffa 4230int float64_lt_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4231{
4232 flag aSign, bSign;
bb98fe42 4233 uint64_t av, bv;
ff32e16e
PM
4234 a = float64_squash_input_denormal(a, status);
4235 b = float64_squash_input_denormal(b, status);
158142c2
FB
4236
4237 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4238 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4239 ) {
af39bc8c
AM
4240 if (float64_is_signaling_nan(a, status)
4241 || float64_is_signaling_nan(b, status)) {
ff32e16e 4242 float_raise(float_flag_invalid, status);
158142c2
FB
4243 }
4244 return 0;
4245 }
4246 aSign = extractFloat64Sign( a );
4247 bSign = extractFloat64Sign( b );
f090c9d4 4248 av = float64_val(a);
a1b91bb4 4249 bv = float64_val(b);
bb98fe42 4250 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4251 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4252
4253}
4254
67b7861d
AJ
4255/*----------------------------------------------------------------------------
4256| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4257| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4258| comparison is performed according to the IEC/IEEE Standard for Binary
4259| Floating-Point Arithmetic.
4260*----------------------------------------------------------------------------*/
4261
e5a41ffa 4262int float64_unordered_quiet(float64 a, float64 b, float_status *status)
67b7861d 4263{
ff32e16e
PM
4264 a = float64_squash_input_denormal(a, status);
4265 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4266
4267 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4268 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4269 ) {
af39bc8c
AM
4270 if (float64_is_signaling_nan(a, status)
4271 || float64_is_signaling_nan(b, status)) {
ff32e16e 4272 float_raise(float_flag_invalid, status);
67b7861d
AJ
4273 }
4274 return 1;
4275 }
4276 return 0;
4277}
4278
158142c2
FB
4279/*----------------------------------------------------------------------------
4280| Returns the result of converting the extended double-precision floating-
4281| point value `a' to the 32-bit two's complement integer format. The
4282| conversion is performed according to the IEC/IEEE Standard for Binary
4283| Floating-Point Arithmetic---which means in particular that the conversion
4284| is rounded according to the current rounding mode. If `a' is a NaN, the
4285| largest positive integer is returned. Otherwise, if the conversion
4286| overflows, the largest integer with the same sign as `a' is returned.
4287*----------------------------------------------------------------------------*/
4288
f4014512 4289int32_t floatx80_to_int32(floatx80 a, float_status *status)
158142c2
FB
4290{
4291 flag aSign;
f4014512 4292 int32_t aExp, shiftCount;
bb98fe42 4293 uint64_t aSig;
158142c2 4294
d1eb8f2a
AD
4295 if (floatx80_invalid_encoding(a)) {
4296 float_raise(float_flag_invalid, status);
4297 return 1 << 31;
4298 }
158142c2
FB
4299 aSig = extractFloatx80Frac( a );
4300 aExp = extractFloatx80Exp( a );
4301 aSign = extractFloatx80Sign( a );
bb98fe42 4302 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4303 shiftCount = 0x4037 - aExp;
4304 if ( shiftCount <= 0 ) shiftCount = 1;
4305 shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 4306 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
4307
4308}
4309
4310/*----------------------------------------------------------------------------
4311| Returns the result of converting the extended double-precision floating-
4312| point value `a' to the 32-bit two's complement integer format. The
4313| conversion is performed according to the IEC/IEEE Standard for Binary
4314| Floating-Point Arithmetic, except that the conversion is always rounded
4315| toward zero. If `a' is a NaN, the largest positive integer is returned.
4316| Otherwise, if the conversion overflows, the largest integer with the same
4317| sign as `a' is returned.
4318*----------------------------------------------------------------------------*/
4319
f4014512 4320int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4321{
4322 flag aSign;
f4014512 4323 int32_t aExp, shiftCount;
bb98fe42 4324 uint64_t aSig, savedASig;
b3a6a2e0 4325 int32_t z;
158142c2 4326
d1eb8f2a
AD
4327 if (floatx80_invalid_encoding(a)) {
4328 float_raise(float_flag_invalid, status);
4329 return 1 << 31;
4330 }
158142c2
FB
4331 aSig = extractFloatx80Frac( a );
4332 aExp = extractFloatx80Exp( a );
4333 aSign = extractFloatx80Sign( a );
4334 if ( 0x401E < aExp ) {
bb98fe42 4335 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4336 goto invalid;
4337 }
4338 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4339 if (aExp || aSig) {
4340 status->float_exception_flags |= float_flag_inexact;
4341 }
158142c2
FB
4342 return 0;
4343 }
4344 shiftCount = 0x403E - aExp;
4345 savedASig = aSig;
4346 aSig >>= shiftCount;
4347 z = aSig;
4348 if ( aSign ) z = - z;
4349 if ( ( z < 0 ) ^ aSign ) {
4350 invalid:
ff32e16e 4351 float_raise(float_flag_invalid, status);
bb98fe42 4352 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4353 }
4354 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 4355 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4356 }
4357 return z;
4358
4359}
4360
4361/*----------------------------------------------------------------------------
4362| Returns the result of converting the extended double-precision floating-
4363| point value `a' to the 64-bit two's complement integer format. The
4364| conversion is performed according to the IEC/IEEE Standard for Binary
4365| Floating-Point Arithmetic---which means in particular that the conversion
4366| is rounded according to the current rounding mode. If `a' is a NaN,
4367| the largest positive integer is returned. Otherwise, if the conversion
4368| overflows, the largest integer with the same sign as `a' is returned.
4369*----------------------------------------------------------------------------*/
4370
f42c2224 4371int64_t floatx80_to_int64(floatx80 a, float_status *status)
158142c2
FB
4372{
4373 flag aSign;
f4014512 4374 int32_t aExp, shiftCount;
bb98fe42 4375 uint64_t aSig, aSigExtra;
158142c2 4376
d1eb8f2a
AD
4377 if (floatx80_invalid_encoding(a)) {
4378 float_raise(float_flag_invalid, status);
4379 return 1ULL << 63;
4380 }
158142c2
FB
4381 aSig = extractFloatx80Frac( a );
4382 aExp = extractFloatx80Exp( a );
4383 aSign = extractFloatx80Sign( a );
4384 shiftCount = 0x403E - aExp;
4385 if ( shiftCount <= 0 ) {
4386 if ( shiftCount ) {
ff32e16e 4387 float_raise(float_flag_invalid, status);
158142c2
FB
4388 if ( ! aSign
4389 || ( ( aExp == 0x7FFF )
4390 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4391 ) {
4392 return LIT64( 0x7FFFFFFFFFFFFFFF );
4393 }
bb98fe42 4394 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4395 }
4396 aSigExtra = 0;
4397 }
4398 else {
4399 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4400 }
ff32e16e 4401 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
4402
4403}
4404
4405/*----------------------------------------------------------------------------
4406| Returns the result of converting the extended double-precision floating-
4407| point value `a' to the 64-bit two's complement integer format. The
4408| conversion is performed according to the IEC/IEEE Standard for Binary
4409| Floating-Point Arithmetic, except that the conversion is always rounded
4410| toward zero. If `a' is a NaN, the largest positive integer is returned.
4411| Otherwise, if the conversion overflows, the largest integer with the same
4412| sign as `a' is returned.
4413*----------------------------------------------------------------------------*/
4414
f42c2224 4415int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4416{
4417 flag aSign;
f4014512 4418 int32_t aExp, shiftCount;
bb98fe42 4419 uint64_t aSig;
f42c2224 4420 int64_t z;
158142c2 4421
d1eb8f2a
AD
4422 if (floatx80_invalid_encoding(a)) {
4423 float_raise(float_flag_invalid, status);
4424 return 1ULL << 63;
4425 }
158142c2
FB
4426 aSig = extractFloatx80Frac( a );
4427 aExp = extractFloatx80Exp( a );
4428 aSign = extractFloatx80Sign( a );
4429 shiftCount = aExp - 0x403E;
4430 if ( 0 <= shiftCount ) {
4431 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4432 if ( ( a.high != 0xC03E ) || aSig ) {
ff32e16e 4433 float_raise(float_flag_invalid, status);
158142c2
FB
4434 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4435 return LIT64( 0x7FFFFFFFFFFFFFFF );
4436 }
4437 }
bb98fe42 4438 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4439 }
4440 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4441 if (aExp | aSig) {
4442 status->float_exception_flags |= float_flag_inexact;
4443 }
158142c2
FB
4444 return 0;
4445 }
4446 z = aSig>>( - shiftCount );
bb98fe42 4447 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 4448 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4449 }
4450 if ( aSign ) z = - z;
4451 return z;
4452
4453}
4454
4455/*----------------------------------------------------------------------------
4456| Returns the result of converting the extended double-precision floating-
4457| point value `a' to the single-precision floating-point format. The
4458| conversion is performed according to the IEC/IEEE Standard for Binary
4459| Floating-Point Arithmetic.
4460*----------------------------------------------------------------------------*/
4461
e5a41ffa 4462float32 floatx80_to_float32(floatx80 a, float_status *status)
158142c2
FB
4463{
4464 flag aSign;
f4014512 4465 int32_t aExp;
bb98fe42 4466 uint64_t aSig;
158142c2 4467
d1eb8f2a
AD
4468 if (floatx80_invalid_encoding(a)) {
4469 float_raise(float_flag_invalid, status);
4470 return float32_default_nan(status);
4471 }
158142c2
FB
4472 aSig = extractFloatx80Frac( a );
4473 aExp = extractFloatx80Exp( a );
4474 aSign = extractFloatx80Sign( a );
4475 if ( aExp == 0x7FFF ) {
bb98fe42 4476 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4477 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4478 }
4479 return packFloat32( aSign, 0xFF, 0 );
4480 }
4481 shift64RightJamming( aSig, 33, &aSig );
4482 if ( aExp || aSig ) aExp -= 0x3F81;
ff32e16e 4483 return roundAndPackFloat32(aSign, aExp, aSig, status);
158142c2
FB
4484
4485}
4486
4487/*----------------------------------------------------------------------------
4488| Returns the result of converting the extended double-precision floating-
4489| point value `a' to the double-precision floating-point format. The
4490| conversion is performed according to the IEC/IEEE Standard for Binary
4491| Floating-Point Arithmetic.
4492*----------------------------------------------------------------------------*/
4493
e5a41ffa 4494float64 floatx80_to_float64(floatx80 a, float_status *status)
158142c2
FB
4495{
4496 flag aSign;
f4014512 4497 int32_t aExp;
bb98fe42 4498 uint64_t aSig, zSig;
158142c2 4499
d1eb8f2a
AD
4500 if (floatx80_invalid_encoding(a)) {
4501 float_raise(float_flag_invalid, status);
4502 return float64_default_nan(status);
4503 }
158142c2
FB
4504 aSig = extractFloatx80Frac( a );
4505 aExp = extractFloatx80Exp( a );
4506 aSign = extractFloatx80Sign( a );
4507 if ( aExp == 0x7FFF ) {
bb98fe42 4508 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4509 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4510 }
4511 return packFloat64( aSign, 0x7FF, 0 );
4512 }
4513 shift64RightJamming( aSig, 1, &zSig );
4514 if ( aExp || aSig ) aExp -= 0x3C01;
ff32e16e 4515 return roundAndPackFloat64(aSign, aExp, zSig, status);
158142c2
FB
4516
4517}
4518
158142c2
FB
4519/*----------------------------------------------------------------------------
4520| Returns the result of converting the extended double-precision floating-
4521| point value `a' to the quadruple-precision floating-point format. The
4522| conversion is performed according to the IEC/IEEE Standard for Binary
4523| Floating-Point Arithmetic.
4524*----------------------------------------------------------------------------*/
4525
e5a41ffa 4526float128 floatx80_to_float128(floatx80 a, float_status *status)
158142c2
FB
4527{
4528 flag aSign;
0c48262d 4529 int aExp;
bb98fe42 4530 uint64_t aSig, zSig0, zSig1;
158142c2 4531
d1eb8f2a
AD
4532 if (floatx80_invalid_encoding(a)) {
4533 float_raise(float_flag_invalid, status);
4534 return float128_default_nan(status);
4535 }
158142c2
FB
4536 aSig = extractFloatx80Frac( a );
4537 aExp = extractFloatx80Exp( a );
4538 aSign = extractFloatx80Sign( a );
bb98fe42 4539 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4540 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4541 }
4542 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4543 return packFloat128( aSign, aExp, zSig0, zSig1 );
4544
4545}
4546
0f721292
LV
4547/*----------------------------------------------------------------------------
4548| Rounds the extended double-precision floating-point value `a'
4549| to the precision provided by floatx80_rounding_precision and returns the
4550| result as an extended double-precision floating-point value.
4551| The operation is performed according to the IEC/IEEE Standard for Binary
4552| Floating-Point Arithmetic.
4553*----------------------------------------------------------------------------*/
4554
4555floatx80 floatx80_round(floatx80 a, float_status *status)
4556{
4557 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4558 extractFloatx80Sign(a),
4559 extractFloatx80Exp(a),
4560 extractFloatx80Frac(a), 0, status);
4561}
4562
158142c2
FB
4563/*----------------------------------------------------------------------------
4564| Rounds the extended double-precision floating-point value `a' to an integer,
4565| and returns the result as an extended quadruple-precision floating-point
4566| value. The operation is performed according to the IEC/IEEE Standard for
4567| Binary Floating-Point Arithmetic.
4568*----------------------------------------------------------------------------*/
4569
e5a41ffa 4570floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
158142c2
FB
4571{
4572 flag aSign;
f4014512 4573 int32_t aExp;
bb98fe42 4574 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4575 floatx80 z;
4576
d1eb8f2a
AD
4577 if (floatx80_invalid_encoding(a)) {
4578 float_raise(float_flag_invalid, status);
4579 return floatx80_default_nan(status);
4580 }
158142c2
FB
4581 aExp = extractFloatx80Exp( a );
4582 if ( 0x403E <= aExp ) {
bb98fe42 4583 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
ff32e16e 4584 return propagateFloatx80NaN(a, a, status);
158142c2
FB
4585 }
4586 return a;
4587 }
4588 if ( aExp < 0x3FFF ) {
4589 if ( ( aExp == 0 )
bb98fe42 4590 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4591 return a;
4592 }
a2f2d288 4593 status->float_exception_flags |= float_flag_inexact;
158142c2 4594 aSign = extractFloatx80Sign( a );
a2f2d288 4595 switch (status->float_rounding_mode) {
158142c2 4596 case float_round_nearest_even:
bb98fe42 4597 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4598 ) {
4599 return
4600 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4601 }
4602 break;
f9288a76
PM
4603 case float_round_ties_away:
4604 if (aExp == 0x3FFE) {
4605 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4606 }
4607 break;
158142c2
FB
4608 case float_round_down:
4609 return
4610 aSign ?
4611 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4612 : packFloatx80( 0, 0, 0 );
4613 case float_round_up:
4614 return
4615 aSign ? packFloatx80( 1, 0, 0 )
4616 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4617 }
4618 return packFloatx80( aSign, 0, 0 );
4619 }
4620 lastBitMask = 1;
4621 lastBitMask <<= 0x403E - aExp;
4622 roundBitsMask = lastBitMask - 1;
4623 z = a;
a2f2d288 4624 switch (status->float_rounding_mode) {
dc355b76 4625 case float_round_nearest_even:
158142c2 4626 z.low += lastBitMask>>1;
dc355b76
PM
4627 if ((z.low & roundBitsMask) == 0) {
4628 z.low &= ~lastBitMask;
4629 }
4630 break;
f9288a76
PM
4631 case float_round_ties_away:
4632 z.low += lastBitMask >> 1;
4633 break;
dc355b76
PM
4634 case float_round_to_zero:
4635 break;
4636 case float_round_up:
4637 if (!extractFloatx80Sign(z)) {
4638 z.low += roundBitsMask;
4639 }
4640 break;
4641 case float_round_down:
4642 if (extractFloatx80Sign(z)) {
158142c2
FB
4643 z.low += roundBitsMask;
4644 }
dc355b76
PM
4645 break;
4646 default:
4647 abort();
158142c2
FB
4648 }
4649 z.low &= ~ roundBitsMask;
4650 if ( z.low == 0 ) {
4651 ++z.high;
4652 z.low = LIT64( 0x8000000000000000 );
4653 }
a2f2d288
PM
4654 if (z.low != a.low) {
4655 status->float_exception_flags |= float_flag_inexact;
4656 }
158142c2
FB
4657 return z;
4658
4659}
4660
4661/*----------------------------------------------------------------------------
4662| Returns the result of adding the absolute values of the extended double-
4663| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4664| negated before being returned. `zSign' is ignored if the result is a NaN.
4665| The addition is performed according to the IEC/IEEE Standard for Binary
4666| Floating-Point Arithmetic.
4667*----------------------------------------------------------------------------*/
4668
e5a41ffa
PM
4669static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4670 float_status *status)
158142c2 4671{
f4014512 4672 int32_t aExp, bExp, zExp;
bb98fe42 4673 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 4674 int32_t expDiff;
158142c2
FB
4675
4676 aSig = extractFloatx80Frac( a );
4677 aExp = extractFloatx80Exp( a );
4678 bSig = extractFloatx80Frac( b );
4679 bExp = extractFloatx80Exp( b );
4680 expDiff = aExp - bExp;
4681 if ( 0 < expDiff ) {
4682 if ( aExp == 0x7FFF ) {
ff32e16e
PM
4683 if ((uint64_t)(aSig << 1)) {
4684 return propagateFloatx80NaN(a, b, status);
4685 }
158142c2
FB
4686 return a;
4687 }
4688 if ( bExp == 0 ) --expDiff;
4689 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4690 zExp = aExp;
4691 }
4692 else if ( expDiff < 0 ) {
4693 if ( bExp == 0x7FFF ) {
ff32e16e
PM
4694 if ((uint64_t)(bSig << 1)) {
4695 return propagateFloatx80NaN(a, b, status);
4696 }
158142c2
FB
4697 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4698 }
4699 if ( aExp == 0 ) ++expDiff;
4700 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4701 zExp = bExp;
4702 }
4703 else {
4704 if ( aExp == 0x7FFF ) {
bb98fe42 4705 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 4706 return propagateFloatx80NaN(a, b, status);
158142c2
FB
4707 }
4708 return a;
4709 }
4710 zSig1 = 0;
4711 zSig0 = aSig + bSig;
4712 if ( aExp == 0 ) {
4713 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4714 goto roundAndPack;
4715 }
4716 zExp = aExp;
4717 goto shiftRight1;
4718 }
4719 zSig0 = aSig + bSig;
bb98fe42 4720 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
4721 shiftRight1:
4722 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4723 zSig0 |= LIT64( 0x8000000000000000 );
4724 ++zExp;
4725 roundAndPack:
a2f2d288 4726 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 4727 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
4728}
4729
4730/*----------------------------------------------------------------------------
4731| Returns the result of subtracting the absolute values of the extended
4732| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4733| difference is negated before being returned. `zSign' is ignored if the
4734| result is a NaN. The subtraction is performed according to the IEC/IEEE
4735| Standard for Binary Floating-Point Arithmetic.
4736*----------------------------------------------------------------------------*/
4737
e5a41ffa
PM
4738static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4739 float_status *status)
158142c2 4740{
f4014512 4741 int32_t aExp, bExp, zExp;
bb98fe42 4742 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 4743 int32_t expDiff;
158142c2
FB
4744
4745 aSig = extractFloatx80Frac( a );
4746 aExp = extractFloatx80Exp( a );
4747 bSig = extractFloatx80Frac( b );
4748 bExp = extractFloatx80Exp( b );
4749 expDiff = aExp - bExp;
4750 if ( 0 < expDiff ) goto aExpBigger;
4751 if ( expDiff < 0 ) goto bExpBigger;
4752 if ( aExp == 0x7FFF ) {
bb98fe42 4753 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 4754 return propagateFloatx80NaN(a, b, status);
158142c2 4755 }
ff32e16e 4756 float_raise(float_flag_invalid, status);
af39bc8c 4757 return floatx80_default_nan(status);
158142c2
FB
4758 }
4759 if ( aExp == 0 ) {
4760 aExp = 1;
4761 bExp = 1;
4762 }
4763 zSig1 = 0;
4764 if ( bSig < aSig ) goto aBigger;
4765 if ( aSig < bSig ) goto bBigger;
a2f2d288 4766 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
4767 bExpBigger:
4768 if ( bExp == 0x7FFF ) {
ff32e16e
PM
4769 if ((uint64_t)(bSig << 1)) {
4770 return propagateFloatx80NaN(a, b, status);
4771 }
158142c2
FB
4772 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4773 }
4774 if ( aExp == 0 ) ++expDiff;
4775 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4776 bBigger:
4777 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4778 zExp = bExp;
4779 zSign ^= 1;
4780 goto normalizeRoundAndPack;
4781 aExpBigger:
4782 if ( aExp == 0x7FFF ) {
ff32e16e
PM
4783 if ((uint64_t)(aSig << 1)) {
4784 return propagateFloatx80NaN(a, b, status);
4785 }
158142c2
FB
4786 return a;
4787 }
4788 if ( bExp == 0 ) --expDiff;
4789 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4790 aBigger:
4791 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4792 zExp = aExp;
4793 normalizeRoundAndPack:
a2f2d288 4794 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 4795 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
4796}
4797
4798/*----------------------------------------------------------------------------
4799| Returns the result of adding the extended double-precision floating-point
4800| values `a' and `b'. The operation is performed according to the IEC/IEEE
4801| Standard for Binary Floating-Point Arithmetic.
4802*----------------------------------------------------------------------------*/
4803
e5a41ffa 4804floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
4805{
4806 flag aSign, bSign;
4807
d1eb8f2a
AD
4808 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4809 float_raise(float_flag_invalid, status);
4810 return floatx80_default_nan(status);
4811 }
158142c2
FB
4812 aSign = extractFloatx80Sign( a );
4813 bSign = extractFloatx80Sign( b );
4814 if ( aSign == bSign ) {
ff32e16e 4815 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
4816 }
4817 else {
ff32e16e 4818 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
4819 }
4820
4821}
4822
4823/*----------------------------------------------------------------------------
4824| Returns the result of subtracting the extended double-precision floating-
4825| point values `a' and `b'. The operation is performed according to the
4826| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4827*----------------------------------------------------------------------------*/
4828
e5a41ffa 4829floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
4830{
4831 flag aSign, bSign;
4832
d1eb8f2a
AD
4833 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4834 float_raise(float_flag_invalid, status);
4835 return floatx80_default_nan(status);
4836 }
158142c2
FB
4837 aSign = extractFloatx80Sign( a );
4838 bSign = extractFloatx80Sign( b );
4839 if ( aSign == bSign ) {
ff32e16e 4840 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
4841 }
4842 else {
ff32e16e 4843 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
4844 }
4845
4846}
4847
4848/*----------------------------------------------------------------------------
4849| Returns the result of multiplying the extended double-precision floating-
4850| point values `a' and `b'. The operation is performed according to the
4851| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4852*----------------------------------------------------------------------------*/
4853
e5a41ffa 4854floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
4855{
4856 flag aSign, bSign, zSign;
f4014512 4857 int32_t aExp, bExp, zExp;
bb98fe42 4858 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 4859
d1eb8f2a
AD
4860 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4861 float_raise(float_flag_invalid, status);
4862 return floatx80_default_nan(status);
4863 }
158142c2
FB
4864 aSig = extractFloatx80Frac( a );
4865 aExp = extractFloatx80Exp( a );
4866 aSign = extractFloatx80Sign( a );
4867 bSig = extractFloatx80Frac( b );
4868 bExp = extractFloatx80Exp( b );
4869 bSign = extractFloatx80Sign( b );
4870 zSign = aSign ^ bSign;
4871 if ( aExp == 0x7FFF ) {
bb98fe42
AF
4872 if ( (uint64_t) ( aSig<<1 )
4873 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 4874 return propagateFloatx80NaN(a, b, status);
158142c2
FB
4875 }
4876 if ( ( bExp | bSig ) == 0 ) goto invalid;
4877 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4878 }
4879 if ( bExp == 0x7FFF ) {
ff32e16e
PM
4880 if ((uint64_t)(bSig << 1)) {
4881 return propagateFloatx80NaN(a, b, status);
4882 }
158142c2
FB
4883 if ( ( aExp | aSig ) == 0 ) {
4884 invalid:
ff32e16e 4885 float_raise(float_flag_invalid, status);
af39bc8c 4886 return floatx80_default_nan(status);
158142c2
FB
4887 }
4888 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4889 }
4890 if ( aExp == 0 ) {
4891 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4892 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4893 }
4894 if ( bExp == 0 ) {
4895 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4896 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4897 }
4898 zExp = aExp + bExp - 0x3FFE;
4899 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 4900 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
4901 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4902 --zExp;
4903 }
a2f2d288 4904 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 4905 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
4906}
4907
4908/*----------------------------------------------------------------------------
4909| Returns the result of dividing the extended double-precision floating-point
4910| value `a' by the corresponding value `b'. The operation is performed
4911| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4912*----------------------------------------------------------------------------*/
4913
e5a41ffa 4914floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
4915{
4916 flag aSign, bSign, zSign;
f4014512 4917 int32_t aExp, bExp, zExp;
bb98fe42
AF
4918 uint64_t aSig, bSig, zSig0, zSig1;
4919 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2 4920
d1eb8f2a
AD
4921 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4922 float_raise(float_flag_invalid, status);
4923 return floatx80_default_nan(status);
4924 }
158142c2
FB
4925 aSig = extractFloatx80Frac( a );
4926 aExp = extractFloatx80Exp( a );
4927 aSign = extractFloatx80Sign( a );
4928 bSig = extractFloatx80Frac( b );
4929 bExp = extractFloatx80Exp( b );
4930 bSign = extractFloatx80Sign( b );
4931 zSign = aSign ^ bSign;
4932 if ( aExp == 0x7FFF ) {
ff32e16e
PM
4933 if ((uint64_t)(aSig << 1)) {
4934 return propagateFloatx80NaN(a, b, status);
4935 }
158142c2 4936 if ( bExp == 0x7FFF ) {
ff32e16e
PM
4937 if ((uint64_t)(bSig << 1)) {
4938 return propagateFloatx80NaN(a, b, status);
4939 }
158142c2
FB
4940 goto invalid;
4941 }
4942 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4943 }
4944 if ( bExp == 0x7FFF ) {
ff32e16e
PM
4945 if ((uint64_t)(bSig << 1)) {
4946 return propagateFloatx80NaN(a, b, status);
4947 }
158142c2
FB
4948 return packFloatx80( zSign, 0, 0 );
4949 }
4950 if ( bExp == 0 ) {
4951 if ( bSig == 0 ) {
4952 if ( ( aExp | aSig ) == 0 ) {
4953 invalid:
ff32e16e 4954 float_raise(float_flag_invalid, status);
af39bc8c 4955 return floatx80_default_nan(status);
158142c2 4956 }
ff32e16e 4957 float_raise(float_flag_divbyzero, status);
158142c2
FB
4958 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4959 }
4960 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4961 }
4962 if ( aExp == 0 ) {
4963 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4964 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4965 }
4966 zExp = aExp - bExp + 0x3FFE;
4967 rem1 = 0;
4968 if ( bSig <= aSig ) {
4969 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4970 ++zExp;
4971 }
4972 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4973 mul64To128( bSig, zSig0, &term0, &term1 );
4974 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 4975 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4976 --zSig0;
4977 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4978 }
4979 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 4980 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
4981 mul64To128( bSig, zSig1, &term1, &term2 );
4982 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 4983 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
4984 --zSig1;
4985 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4986 }
4987 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4988 }
a2f2d288 4989 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 4990 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
4991}
4992
4993/*----------------------------------------------------------------------------
4994| Returns the remainder of the extended double-precision floating-point value
4995| `a' with respect to the corresponding value `b'. The operation is performed
4996| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4997*----------------------------------------------------------------------------*/
4998
e5a41ffa 4999floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
158142c2 5000{
ed086f3d 5001 flag aSign, zSign;
f4014512 5002 int32_t aExp, bExp, expDiff;
bb98fe42
AF
5003 uint64_t aSig0, aSig1, bSig;
5004 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2 5005
d1eb8f2a
AD
5006 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5007 float_raise(float_flag_invalid, status);
5008 return floatx80_default_nan(status);
5009 }
158142c2
FB
5010 aSig0 = extractFloatx80Frac( a );
5011 aExp = extractFloatx80Exp( a );
5012 aSign = extractFloatx80Sign( a );
5013 bSig = extractFloatx80Frac( b );
5014 bExp = extractFloatx80Exp( b );
158142c2 5015 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5016 if ( (uint64_t) ( aSig0<<1 )
5017 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5018 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5019 }
5020 goto invalid;
5021 }
5022 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5023 if ((uint64_t)(bSig << 1)) {
5024 return propagateFloatx80NaN(a, b, status);
5025 }
158142c2
FB
5026 return a;
5027 }
5028 if ( bExp == 0 ) {
5029 if ( bSig == 0 ) {
5030 invalid:
ff32e16e 5031 float_raise(float_flag_invalid, status);
af39bc8c 5032 return floatx80_default_nan(status);
158142c2
FB
5033 }
5034 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5035 }
5036 if ( aExp == 0 ) {
bb98fe42 5037 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5038 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5039 }
5040 bSig |= LIT64( 0x8000000000000000 );
5041 zSign = aSign;
5042 expDiff = aExp - bExp;
5043 aSig1 = 0;
5044 if ( expDiff < 0 ) {
5045 if ( expDiff < -1 ) return a;
5046 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5047 expDiff = 0;
5048 }
5049 q = ( bSig <= aSig0 );
5050 if ( q ) aSig0 -= bSig;
5051 expDiff -= 64;
5052 while ( 0 < expDiff ) {
5053 q = estimateDiv128To64( aSig0, aSig1, bSig );
5054 q = ( 2 < q ) ? q - 2 : 0;
5055 mul64To128( bSig, q, &term0, &term1 );
5056 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5057 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5058 expDiff -= 62;
5059 }
5060 expDiff += 64;
5061 if ( 0 < expDiff ) {
5062 q = estimateDiv128To64( aSig0, aSig1, bSig );
5063 q = ( 2 < q ) ? q - 2 : 0;
5064 q >>= 64 - expDiff;
5065 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5066 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5067 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5068 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5069 ++q;
5070 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5071 }
5072 }
5073 else {
5074 term1 = 0;
5075 term0 = bSig;
5076 }
5077 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5078 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5079 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5080 && ( q & 1 ) )
5081 ) {
5082 aSig0 = alternateASig0;
5083 aSig1 = alternateASig1;
5084 zSign = ! zSign;
5085 }
5086 return
5087 normalizeRoundAndPackFloatx80(
ff32e16e 5088 80, zSign, bExp + expDiff, aSig0, aSig1, status);
158142c2
FB
5089
5090}
5091
5092/*----------------------------------------------------------------------------
5093| Returns the square root of the extended double-precision floating-point
5094| value `a'. The operation is performed according to the IEC/IEEE Standard
5095| for Binary Floating-Point Arithmetic.
5096*----------------------------------------------------------------------------*/
5097
e5a41ffa 5098floatx80 floatx80_sqrt(floatx80 a, float_status *status)
158142c2
FB
5099{
5100 flag aSign;
f4014512 5101 int32_t aExp, zExp;
bb98fe42
AF
5102 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5103 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2 5104
d1eb8f2a
AD
5105 if (floatx80_invalid_encoding(a)) {
5106 float_raise(float_flag_invalid, status);
5107 return floatx80_default_nan(status);
5108 }
158142c2
FB
5109 aSig0 = extractFloatx80Frac( a );
5110 aExp = extractFloatx80Exp( a );
5111 aSign = extractFloatx80Sign( a );
5112 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5113 if ((uint64_t)(aSig0 << 1)) {
5114 return propagateFloatx80NaN(a, a, status);
5115 }
158142c2
FB
5116 if ( ! aSign ) return a;
5117 goto invalid;
5118 }
5119 if ( aSign ) {
5120 if ( ( aExp | aSig0 ) == 0 ) return a;
5121 invalid:
ff32e16e 5122 float_raise(float_flag_invalid, status);
af39bc8c 5123 return floatx80_default_nan(status);
158142c2
FB
5124 }
5125 if ( aExp == 0 ) {
5126 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5127 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5128 }
5129 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5130 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5131 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5132 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5133 doubleZSig0 = zSig0<<1;
5134 mul64To128( zSig0, zSig0, &term0, &term1 );
5135 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5136 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5137 --zSig0;
5138 doubleZSig0 -= 2;
5139 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5140 }
5141 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5142 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5143 if ( zSig1 == 0 ) zSig1 = 1;
5144 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5145 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5146 mul64To128( zSig1, zSig1, &term2, &term3 );
5147 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5148 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5149 --zSig1;
5150 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5151 term3 |= 1;
5152 term2 |= doubleZSig0;
5153 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5154 }
5155 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5156 }
5157 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5158 zSig0 |= doubleZSig0;
a2f2d288
PM
5159 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5160 0, zExp, zSig0, zSig1, status);
158142c2
FB
5161}
5162
5163/*----------------------------------------------------------------------------
b689362d
AJ
5164| Returns 1 if the extended double-precision floating-point value `a' is equal
5165| to the corresponding value `b', and 0 otherwise. The invalid exception is
5166| raised if either operand is a NaN. Otherwise, the comparison is performed
5167| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5168*----------------------------------------------------------------------------*/
5169
e5a41ffa 5170int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5171{
5172
d1eb8f2a
AD
5173 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5174 || (extractFloatx80Exp(a) == 0x7FFF
5175 && (uint64_t) (extractFloatx80Frac(a) << 1))
5176 || (extractFloatx80Exp(b) == 0x7FFF
5177 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5178 ) {
ff32e16e 5179 float_raise(float_flag_invalid, status);
158142c2
FB
5180 return 0;
5181 }
5182 return
5183 ( a.low == b.low )
5184 && ( ( a.high == b.high )
5185 || ( ( a.low == 0 )
bb98fe42 5186 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5187 );
5188
5189}
5190
5191/*----------------------------------------------------------------------------
5192| Returns 1 if the extended double-precision floating-point value `a' is
5193| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5194| invalid exception is raised if either operand is a NaN. The comparison is
5195| performed according to the IEC/IEEE Standard for Binary Floating-Point
5196| Arithmetic.
158142c2
FB
5197*----------------------------------------------------------------------------*/
5198
e5a41ffa 5199int floatx80_le(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5200{
5201 flag aSign, bSign;
5202
d1eb8f2a
AD
5203 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5204 || (extractFloatx80Exp(a) == 0x7FFF
5205 && (uint64_t) (extractFloatx80Frac(a) << 1))
5206 || (extractFloatx80Exp(b) == 0x7FFF
5207 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5208 ) {
ff32e16e 5209 float_raise(float_flag_invalid, status);
158142c2
FB
5210 return 0;
5211 }
5212 aSign = extractFloatx80Sign( a );
5213 bSign = extractFloatx80Sign( b );
5214 if ( aSign != bSign ) {
5215 return
5216 aSign
bb98fe42 5217 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5218 == 0 );
5219 }
5220 return
5221 aSign ? le128( b.high, b.low, a.high, a.low )
5222 : le128( a.high, a.low, b.high, b.low );
5223
5224}
5225
5226/*----------------------------------------------------------------------------
5227| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5228| less than the corresponding value `b', and 0 otherwise. The invalid
5229| exception is raised if either operand is a NaN. The comparison is performed
5230| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5231*----------------------------------------------------------------------------*/
5232
e5a41ffa 5233int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5234{
5235 flag aSign, bSign;
5236
d1eb8f2a
AD
5237 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5238 || (extractFloatx80Exp(a) == 0x7FFF
5239 && (uint64_t) (extractFloatx80Frac(a) << 1))
5240 || (extractFloatx80Exp(b) == 0x7FFF
5241 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5242 ) {
ff32e16e 5243 float_raise(float_flag_invalid, status);
158142c2
FB
5244 return 0;
5245 }
5246 aSign = extractFloatx80Sign( a );
5247 bSign = extractFloatx80Sign( b );
5248 if ( aSign != bSign ) {
5249 return
5250 aSign
bb98fe42 5251 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5252 != 0 );
5253 }
5254 return
5255 aSign ? lt128( b.high, b.low, a.high, a.low )
5256 : lt128( a.high, a.low, b.high, b.low );
5257
5258}
5259
67b7861d
AJ
5260/*----------------------------------------------------------------------------
5261| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5262| cannot be compared, and 0 otherwise. The invalid exception is raised if
5263| either operand is a NaN. The comparison is performed according to the
5264| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d 5265*----------------------------------------------------------------------------*/
e5a41ffa 5266int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
67b7861d 5267{
d1eb8f2a
AD
5268 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5269 || (extractFloatx80Exp(a) == 0x7FFF
5270 && (uint64_t) (extractFloatx80Frac(a) << 1))
5271 || (extractFloatx80Exp(b) == 0x7FFF
5272 && (uint64_t) (extractFloatx80Frac(b) << 1))
67b7861d 5273 ) {
ff32e16e 5274 float_raise(float_flag_invalid, status);
67b7861d
AJ
5275 return 1;
5276 }
5277 return 0;
5278}
5279
158142c2 5280/*----------------------------------------------------------------------------
b689362d 5281| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5282| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5283| cause an exception. The comparison is performed according to the IEC/IEEE
5284| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5285*----------------------------------------------------------------------------*/
5286
e5a41ffa 5287int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5288{
5289
d1eb8f2a
AD
5290 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5291 float_raise(float_flag_invalid, status);
5292 return 0;
5293 }
158142c2 5294 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5295 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5296 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5297 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5298 ) {
af39bc8c
AM
5299 if (floatx80_is_signaling_nan(a, status)
5300 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5301 float_raise(float_flag_invalid, status);
b689362d 5302 }
158142c2
FB
5303 return 0;
5304 }
5305 return
5306 ( a.low == b.low )
5307 && ( ( a.high == b.high )
5308 || ( ( a.low == 0 )
bb98fe42 5309 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5310 );
5311
5312}
5313
5314/*----------------------------------------------------------------------------
5315| Returns 1 if the extended double-precision floating-point value `a' is less
5316| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5317| do not cause an exception. Otherwise, the comparison is performed according
5318| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5319*----------------------------------------------------------------------------*/
5320
e5a41ffa 5321int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5322{
5323 flag aSign, bSign;
5324
d1eb8f2a
AD
5325 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5326 float_raise(float_flag_invalid, status);
5327 return 0;
5328 }
158142c2 5329 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5330 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5331 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5332 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5333 ) {
af39bc8c
AM
5334 if (floatx80_is_signaling_nan(a, status)
5335 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5336 float_raise(float_flag_invalid, status);
158142c2
FB
5337 }
5338 return 0;
5339 }
5340 aSign = extractFloatx80Sign( a );
5341 bSign = extractFloatx80Sign( b );
5342 if ( aSign != bSign ) {
5343 return
5344 aSign
bb98fe42 5345 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5346 == 0 );
5347 }
5348 return
5349 aSign ? le128( b.high, b.low, a.high, a.low )
5350 : le128( a.high, a.low, b.high, b.low );
5351
5352}
5353
5354/*----------------------------------------------------------------------------
5355| Returns 1 if the extended double-precision floating-point value `a' is less
5356| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5357| an exception. Otherwise, the comparison is performed according to the
5358| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5359*----------------------------------------------------------------------------*/
5360
e5a41ffa 5361int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5362{
5363 flag aSign, bSign;
5364
d1eb8f2a
AD
5365 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5366 float_raise(float_flag_invalid, status);
5367 return 0;
5368 }
158142c2 5369 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5370 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5371 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5372 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5373 ) {
af39bc8c
AM
5374 if (floatx80_is_signaling_nan(a, status)
5375 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5376 float_raise(float_flag_invalid, status);
158142c2
FB
5377 }
5378 return 0;
5379 }
5380 aSign = extractFloatx80Sign( a );
5381 bSign = extractFloatx80Sign( b );
5382 if ( aSign != bSign ) {
5383 return
5384 aSign
bb98fe42 5385 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5386 != 0 );
5387 }
5388 return
5389 aSign ? lt128( b.high, b.low, a.high, a.low )
5390 : lt128( a.high, a.low, b.high, b.low );
5391
5392}
5393
67b7861d
AJ
5394/*----------------------------------------------------------------------------
5395| Returns 1 if the extended double-precision floating-point values `a' and `b'
5396| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5397| The comparison is performed according to the IEC/IEEE Standard for Binary
5398| Floating-Point Arithmetic.
5399*----------------------------------------------------------------------------*/
e5a41ffa 5400int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
67b7861d 5401{
d1eb8f2a
AD
5402 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5403 float_raise(float_flag_invalid, status);
5404 return 1;
5405 }
67b7861d
AJ
5406 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5407 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5408 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5409 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5410 ) {
af39bc8c
AM
5411 if (floatx80_is_signaling_nan(a, status)
5412 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5413 float_raise(float_flag_invalid, status);
67b7861d
AJ
5414 }
5415 return 1;
5416 }
5417 return 0;
5418}
5419
158142c2
FB
5420/*----------------------------------------------------------------------------
5421| Returns the result of converting the quadruple-precision floating-point
5422| value `a' to the 32-bit two's complement integer format. The conversion
5423| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5424| Arithmetic---which means in particular that the conversion is rounded
5425| according to the current rounding mode. If `a' is a NaN, the largest
5426| positive integer is returned. Otherwise, if the conversion overflows, the
5427| largest integer with the same sign as `a' is returned.
5428*----------------------------------------------------------------------------*/
5429
f4014512 5430int32_t float128_to_int32(float128 a, float_status *status)
158142c2
FB
5431{
5432 flag aSign;
f4014512 5433 int32_t aExp, shiftCount;
bb98fe42 5434 uint64_t aSig0, aSig1;
158142c2
FB
5435
5436 aSig1 = extractFloat128Frac1( a );
5437 aSig0 = extractFloat128Frac0( a );
5438 aExp = extractFloat128Exp( a );
5439 aSign = extractFloat128Sign( a );
5440 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5441 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5442 aSig0 |= ( aSig1 != 0 );
5443 shiftCount = 0x4028 - aExp;
5444 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
ff32e16e 5445 return roundAndPackInt32(aSign, aSig0, status);
158142c2
FB
5446
5447}
5448
5449/*----------------------------------------------------------------------------
5450| Returns the result of converting the quadruple-precision floating-point
5451| value `a' to the 32-bit two's complement integer format. The conversion
5452| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5453| Arithmetic, except that the conversion is always rounded toward zero. If
5454| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5455| conversion overflows, the largest integer with the same sign as `a' is
5456| returned.
5457*----------------------------------------------------------------------------*/
5458
f4014512 5459int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
158142c2
FB
5460{
5461 flag aSign;
f4014512 5462 int32_t aExp, shiftCount;
bb98fe42 5463 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5464 int32_t z;
158142c2
FB
5465
5466 aSig1 = extractFloat128Frac1( a );
5467 aSig0 = extractFloat128Frac0( a );
5468 aExp = extractFloat128Exp( a );
5469 aSign = extractFloat128Sign( a );
5470 aSig0 |= ( aSig1 != 0 );
5471 if ( 0x401E < aExp ) {
5472 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5473 goto invalid;
5474 }
5475 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
5476 if (aExp || aSig0) {
5477 status->float_exception_flags |= float_flag_inexact;
5478 }
158142c2
FB
5479 return 0;
5480 }
5481 aSig0 |= LIT64( 0x0001000000000000 );
5482 shiftCount = 0x402F - aExp;
5483 savedASig = aSig0;
5484 aSig0 >>= shiftCount;
5485 z = aSig0;
5486 if ( aSign ) z = - z;
5487 if ( ( z < 0 ) ^ aSign ) {
5488 invalid:
ff32e16e 5489 float_raise(float_flag_invalid, status);
bb98fe42 5490 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5491 }
5492 if ( ( aSig0<<shiftCount ) != savedASig ) {
a2f2d288 5493 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5494 }
5495 return z;
5496
5497}
5498
5499/*----------------------------------------------------------------------------
5500| Returns the result of converting the quadruple-precision floating-point
5501| value `a' to the 64-bit two's complement integer format. The conversion
5502| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5503| Arithmetic---which means in particular that the conversion is rounded
5504| according to the current rounding mode. If `a' is a NaN, the largest
5505| positive integer is returned. Otherwise, if the conversion overflows, the
5506| largest integer with the same sign as `a' is returned.
5507*----------------------------------------------------------------------------*/
5508
f42c2224 5509int64_t float128_to_int64(float128 a, float_status *status)
158142c2
FB
5510{
5511 flag aSign;
f4014512 5512 int32_t aExp, shiftCount;
bb98fe42 5513 uint64_t aSig0, aSig1;
158142c2
FB
5514
5515 aSig1 = extractFloat128Frac1( a );
5516 aSig0 = extractFloat128Frac0( a );
5517 aExp = extractFloat128Exp( a );
5518 aSign = extractFloat128Sign( a );
5519 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5520 shiftCount = 0x402F - aExp;
5521 if ( shiftCount <= 0 ) {
5522 if ( 0x403E < aExp ) {
ff32e16e 5523 float_raise(float_flag_invalid, status);
158142c2
FB
5524 if ( ! aSign
5525 || ( ( aExp == 0x7FFF )
5526 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5527 )
5528 ) {
5529 return LIT64( 0x7FFFFFFFFFFFFFFF );
5530 }
bb98fe42 5531 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5532 }
5533 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5534 }
5535 else {
5536 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5537 }
ff32e16e 5538 return roundAndPackInt64(aSign, aSig0, aSig1, status);
158142c2
FB
5539
5540}
5541
5542/*----------------------------------------------------------------------------
5543| Returns the result of converting the quadruple-precision floating-point
5544| value `a' to the 64-bit two's complement integer format. The conversion
5545| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5546| Arithmetic, except that the conversion is always rounded toward zero.
5547| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5548| the conversion overflows, the largest integer with the same sign as `a' is
5549| returned.
5550*----------------------------------------------------------------------------*/
5551
f42c2224 5552int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
158142c2
FB
5553{
5554 flag aSign;
f4014512 5555 int32_t aExp, shiftCount;
bb98fe42 5556 uint64_t aSig0, aSig1;
f42c2224 5557 int64_t z;
158142c2
FB
5558
5559 aSig1 = extractFloat128Frac1( a );
5560 aSig0 = extractFloat128Frac0( a );
5561 aExp = extractFloat128Exp( a );
5562 aSign = extractFloat128Sign( a );
5563 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5564 shiftCount = aExp - 0x402F;
5565 if ( 0 < shiftCount ) {
5566 if ( 0x403E <= aExp ) {
5567 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5568 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5569 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
a2f2d288
PM
5570 if (aSig1) {
5571 status->float_exception_flags |= float_flag_inexact;
5572 }
158142c2
FB
5573 }
5574 else {
ff32e16e 5575 float_raise(float_flag_invalid, status);
158142c2
FB
5576 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5577 return LIT64( 0x7FFFFFFFFFFFFFFF );
5578 }
5579 }
bb98fe42 5580 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5581 }
5582 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 5583 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
a2f2d288 5584 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5585 }
5586 }
5587 else {
5588 if ( aExp < 0x3FFF ) {
5589 if ( aExp | aSig0 | aSig1 ) {
a2f2d288 5590 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5591 }
5592 return 0;
5593 }
5594 z = aSig0>>( - shiftCount );
5595 if ( aSig1
bb98fe42 5596 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
a2f2d288 5597 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5598 }
5599 }
5600 if ( aSign ) z = - z;
5601 return z;
5602
5603}
5604
2e6d8568
BR
5605/*----------------------------------------------------------------------------
5606| Returns the result of converting the quadruple-precision floating-point value
5607| `a' to the 64-bit unsigned integer format. The conversion is
5608| performed according to the IEC/IEEE Standard for Binary Floating-Point
5609| Arithmetic---which means in particular that the conversion is rounded
5610| according to the current rounding mode. If `a' is a NaN, the largest
5611| positive integer is returned. If the conversion overflows, the
5612| largest unsigned integer is returned. If 'a' is negative, the value is
5613| rounded and zero is returned; negative values that do not round to zero
5614| will raise the inexact exception.
5615*----------------------------------------------------------------------------*/
5616
5617uint64_t float128_to_uint64(float128 a, float_status *status)
5618{
5619 flag aSign;
5620 int aExp;
5621 int shiftCount;
5622 uint64_t aSig0, aSig1;
5623
5624 aSig0 = extractFloat128Frac0(a);
5625 aSig1 = extractFloat128Frac1(a);
5626 aExp = extractFloat128Exp(a);
5627 aSign = extractFloat128Sign(a);
5628 if (aSign && (aExp > 0x3FFE)) {
5629 float_raise(float_flag_invalid, status);
5630 if (float128_is_any_nan(a)) {
5631 return LIT64(0xFFFFFFFFFFFFFFFF);
5632 } else {
5633 return 0;
5634 }
5635 }
5636 if (aExp) {
5637 aSig0 |= LIT64(0x0001000000000000);
5638 }
5639 shiftCount = 0x402F - aExp;
5640 if (shiftCount <= 0) {
5641 if (0x403E < aExp) {
5642 float_raise(float_flag_invalid, status);
5643 return LIT64(0xFFFFFFFFFFFFFFFF);
5644 }
5645 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5646 } else {
5647 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5648 }
5649 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5650}
5651
5652uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5653{
5654 uint64_t v;
5655 signed char current_rounding_mode = status->float_rounding_mode;
5656
5657 set_float_rounding_mode(float_round_to_zero, status);
5658 v = float128_to_uint64(a, status);
5659 set_float_rounding_mode(current_rounding_mode, status);
5660
5661 return v;
5662}
5663
158142c2
FB
5664/*----------------------------------------------------------------------------
5665| Returns the result of converting the quadruple-precision floating-point
fd425037
BR
5666| value `a' to the 32-bit unsigned integer format. The conversion
5667| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5668| Arithmetic except that the conversion is always rounded toward zero.
5669| If `a' is a NaN, the largest positive integer is returned. Otherwise,
5670| if the conversion overflows, the largest unsigned integer is returned.
5671| If 'a' is negative, the value is rounded and zero is returned; negative
5672| values that do not round to zero will raise the inexact exception.
5673*----------------------------------------------------------------------------*/
5674
5675uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5676{
5677 uint64_t v;
5678 uint32_t res;
5679 int old_exc_flags = get_float_exception_flags(status);
5680
5681 v = float128_to_uint64_round_to_zero(a, status);
5682 if (v > 0xffffffff) {
5683 res = 0xffffffff;
5684 } else {
5685 return v;
5686 }
5687 set_float_exception_flags(old_exc_flags, status);
5688 float_raise(float_flag_invalid, status);
5689 return res;
5690}
5691
5692/*----------------------------------------------------------------------------
5693| Returns the result of converting the quadruple-precision floating-point
158142c2
FB
5694| value `a' to the single-precision floating-point format. The conversion
5695| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5696| Arithmetic.
5697*----------------------------------------------------------------------------*/
5698
e5a41ffa 5699float32 float128_to_float32(float128 a, float_status *status)
158142c2
FB
5700{
5701 flag aSign;
f4014512 5702 int32_t aExp;
bb98fe42
AF
5703 uint64_t aSig0, aSig1;
5704 uint32_t zSig;
158142c2
FB
5705
5706 aSig1 = extractFloat128Frac1( a );
5707 aSig0 = extractFloat128Frac0( a );
5708 aExp = extractFloat128Exp( a );
5709 aSign = extractFloat128Sign( a );
5710 if ( aExp == 0x7FFF ) {
5711 if ( aSig0 | aSig1 ) {
ff32e16e 5712 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
158142c2
FB
5713 }
5714 return packFloat32( aSign, 0xFF, 0 );
5715 }
5716 aSig0 |= ( aSig1 != 0 );
5717 shift64RightJamming( aSig0, 18, &aSig0 );
5718 zSig = aSig0;
5719 if ( aExp || zSig ) {
5720 zSig |= 0x40000000;
5721 aExp -= 0x3F81;
5722 }
ff32e16e 5723 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
5724
5725}
5726
5727/*----------------------------------------------------------------------------
5728| Returns the result of converting the quadruple-precision floating-point
5729| value `a' to the double-precision floating-point format. The conversion
5730| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5731| Arithmetic.
5732*----------------------------------------------------------------------------*/
5733
e5a41ffa 5734float64 float128_to_float64(float128 a, float_status *status)
158142c2
FB
5735{
5736 flag aSign;
f4014512 5737 int32_t aExp;
bb98fe42 5738 uint64_t aSig0, aSig1;
158142c2
FB
5739
5740 aSig1 = extractFloat128Frac1( a );
5741 aSig0 = extractFloat128Frac0( a );
5742 aExp = extractFloat128Exp( a );
5743 aSign = extractFloat128Sign( a );
5744 if ( aExp == 0x7FFF ) {
5745 if ( aSig0 | aSig1 ) {
ff32e16e 5746 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
158142c2
FB
5747 }
5748 return packFloat64( aSign, 0x7FF, 0 );
5749 }
5750 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5751 aSig0 |= ( aSig1 != 0 );
5752 if ( aExp || aSig0 ) {
5753 aSig0 |= LIT64( 0x4000000000000000 );
5754 aExp -= 0x3C01;
5755 }
ff32e16e 5756 return roundAndPackFloat64(aSign, aExp, aSig0, status);
158142c2
FB
5757
5758}
5759
158142c2
FB
5760/*----------------------------------------------------------------------------
5761| Returns the result of converting the quadruple-precision floating-point
5762| value `a' to the extended double-precision floating-point format. The
5763| conversion is performed according to the IEC/IEEE Standard for Binary
5764| Floating-Point Arithmetic.
5765*----------------------------------------------------------------------------*/
5766
e5a41ffa 5767floatx80 float128_to_floatx80(float128 a, float_status *status)
158142c2
FB
5768{
5769 flag aSign;
f4014512 5770 int32_t aExp;
bb98fe42 5771 uint64_t aSig0, aSig1;
158142c2
FB
5772
5773 aSig1 = extractFloat128Frac1( a );
5774 aSig0 = extractFloat128Frac0( a );
5775 aExp = extractFloat128Exp( a );
5776 aSign = extractFloat128Sign( a );
5777 if ( aExp == 0x7FFF ) {
5778 if ( aSig0 | aSig1 ) {
ff32e16e 5779 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
158142c2
FB
5780 }
5781 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5782 }
5783 if ( aExp == 0 ) {
5784 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5785 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5786 }
5787 else {
5788 aSig0 |= LIT64( 0x0001000000000000 );
5789 }
5790 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
ff32e16e 5791 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
158142c2
FB
5792
5793}
5794
158142c2
FB
5795/*----------------------------------------------------------------------------
5796| Rounds the quadruple-precision floating-point value `a' to an integer, and
5797| returns the result as a quadruple-precision floating-point value. The
5798| operation is performed according to the IEC/IEEE Standard for Binary
5799| Floating-Point Arithmetic.
5800*----------------------------------------------------------------------------*/
5801
e5a41ffa 5802float128 float128_round_to_int(float128 a, float_status *status)
158142c2
FB
5803{
5804 flag aSign;
f4014512 5805 int32_t aExp;
bb98fe42 5806 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5807 float128 z;
5808
5809 aExp = extractFloat128Exp( a );
5810 if ( 0x402F <= aExp ) {
5811 if ( 0x406F <= aExp ) {
5812 if ( ( aExp == 0x7FFF )
5813 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5814 ) {
ff32e16e 5815 return propagateFloat128NaN(a, a, status);
158142c2
FB
5816 }
5817 return a;
5818 }
5819 lastBitMask = 1;
5820 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5821 roundBitsMask = lastBitMask - 1;
5822 z = a;
a2f2d288 5823 switch (status->float_rounding_mode) {
dc355b76 5824 case float_round_nearest_even:
158142c2
FB
5825 if ( lastBitMask ) {
5826 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5827 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5828 }
5829 else {
bb98fe42 5830 if ( (int64_t) z.low < 0 ) {
158142c2 5831 ++z.high;
bb98fe42 5832 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
5833 }
5834 }
dc355b76 5835 break;
f9288a76
PM
5836 case float_round_ties_away:
5837 if (lastBitMask) {
5838 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5839 } else {
5840 if ((int64_t) z.low < 0) {
5841 ++z.high;
5842 }
5843 }
5844 break;
dc355b76
PM
5845 case float_round_to_zero:
5846 break;
5847 case float_round_up:
5848 if (!extractFloat128Sign(z)) {
5849 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5850 }
5851 break;
5852 case float_round_down:
5853 if (extractFloat128Sign(z)) {
5854 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 5855 }
dc355b76
PM
5856 break;
5857 default:
5858 abort();
158142c2
FB
5859 }
5860 z.low &= ~ roundBitsMask;
5861 }
5862 else {
5863 if ( aExp < 0x3FFF ) {
bb98fe42 5864 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
a2f2d288 5865 status->float_exception_flags |= float_flag_inexact;
158142c2 5866 aSign = extractFloat128Sign( a );
a2f2d288 5867 switch (status->float_rounding_mode) {
158142c2
FB
5868 case float_round_nearest_even:
5869 if ( ( aExp == 0x3FFE )
5870 && ( extractFloat128Frac0( a )
5871 | extractFloat128Frac1( a ) )
5872 ) {
5873 return packFloat128( aSign, 0x3FFF, 0, 0 );
5874 }
5875 break;
f9288a76
PM
5876 case float_round_ties_away:
5877 if (aExp == 0x3FFE) {
5878 return packFloat128(aSign, 0x3FFF, 0, 0);
5879 }
5880 break;
158142c2
FB
5881 case float_round_down:
5882 return
5883 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5884 : packFloat128( 0, 0, 0, 0 );
5885 case float_round_up:
5886 return
5887 aSign ? packFloat128( 1, 0, 0, 0 )
5888 : packFloat128( 0, 0x3FFF, 0, 0 );
5889 }
5890 return packFloat128( aSign, 0, 0, 0 );
5891 }
5892 lastBitMask = 1;
5893 lastBitMask <<= 0x402F - aExp;
5894 roundBitsMask = lastBitMask - 1;
5895 z.low = 0;
5896 z.high = a.high;
a2f2d288 5897 switch (status->float_rounding_mode) {
dc355b76 5898 case float_round_nearest_even:
158142c2
FB
5899 z.high += lastBitMask>>1;
5900 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5901 z.high &= ~ lastBitMask;
5902 }
dc355b76 5903 break;
f9288a76
PM
5904 case float_round_ties_away:
5905 z.high += lastBitMask>>1;
5906 break;
dc355b76
PM
5907 case float_round_to_zero:
5908 break;
5909 case float_round_up:
5910 if (!extractFloat128Sign(z)) {
158142c2
FB
5911 z.high |= ( a.low != 0 );
5912 z.high += roundBitsMask;
5913 }
dc355b76
PM
5914 break;
5915 case float_round_down:
5916 if (extractFloat128Sign(z)) {
5917 z.high |= (a.low != 0);
5918 z.high += roundBitsMask;
5919 }
5920 break;
5921 default:
5922 abort();
158142c2
FB
5923 }
5924 z.high &= ~ roundBitsMask;
5925 }
5926 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
a2f2d288 5927 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5928 }
5929 return z;
5930
5931}
5932
5933/*----------------------------------------------------------------------------
5934| Returns the result of adding the absolute values of the quadruple-precision
5935| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5936| before being returned. `zSign' is ignored if the result is a NaN.
5937| The addition is performed according to the IEC/IEEE Standard for Binary
5938| Floating-Point Arithmetic.
5939*----------------------------------------------------------------------------*/
5940
e5a41ffa
PM
5941static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
5942 float_status *status)
158142c2 5943{
f4014512 5944 int32_t aExp, bExp, zExp;
bb98fe42 5945 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
f4014512 5946 int32_t expDiff;
158142c2
FB
5947
5948 aSig1 = extractFloat128Frac1( a );
5949 aSig0 = extractFloat128Frac0( a );
5950 aExp = extractFloat128Exp( a );
5951 bSig1 = extractFloat128Frac1( b );
5952 bSig0 = extractFloat128Frac0( b );
5953 bExp = extractFloat128Exp( b );
5954 expDiff = aExp - bExp;
5955 if ( 0 < expDiff ) {
5956 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5957 if (aSig0 | aSig1) {
5958 return propagateFloat128NaN(a, b, status);
5959 }
158142c2
FB
5960 return a;
5961 }
5962 if ( bExp == 0 ) {
5963 --expDiff;
5964 }
5965 else {
5966 bSig0 |= LIT64( 0x0001000000000000 );
5967 }
5968 shift128ExtraRightJamming(
5969 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5970 zExp = aExp;
5971 }
5972 else if ( expDiff < 0 ) {
5973 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5974 if (bSig0 | bSig1) {
5975 return propagateFloat128NaN(a, b, status);
5976 }
158142c2
FB
5977 return packFloat128( zSign, 0x7FFF, 0, 0 );
5978 }
5979 if ( aExp == 0 ) {
5980 ++expDiff;
5981 }
5982 else {
5983 aSig0 |= LIT64( 0x0001000000000000 );
5984 }
5985 shift128ExtraRightJamming(
5986 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5987 zExp = bExp;
5988 }
5989 else {
5990 if ( aExp == 0x7FFF ) {
5991 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 5992 return propagateFloat128NaN(a, b, status);
158142c2
FB
5993 }
5994 return a;
5995 }
5996 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 5997 if ( aExp == 0 ) {
a2f2d288 5998 if (status->flush_to_zero) {
e6afc87f 5999 if (zSig0 | zSig1) {
ff32e16e 6000 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
6001 }
6002 return packFloat128(zSign, 0, 0, 0);
6003 }
fe76d976
PB
6004 return packFloat128( zSign, 0, zSig0, zSig1 );
6005 }
158142c2
FB
6006 zSig2 = 0;
6007 zSig0 |= LIT64( 0x0002000000000000 );
6008 zExp = aExp;
6009 goto shiftRight1;
6010 }
6011 aSig0 |= LIT64( 0x0001000000000000 );
6012 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6013 --zExp;
6014 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6015 ++zExp;
6016 shiftRight1:
6017 shift128ExtraRightJamming(
6018 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6019 roundAndPack:
ff32e16e 6020 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6021
6022}
6023
6024/*----------------------------------------------------------------------------
6025| Returns the result of subtracting the absolute values of the quadruple-
6026| precision floating-point values `a' and `b'. If `zSign' is 1, the
6027| difference is negated before being returned. `zSign' is ignored if the
6028| result is a NaN. The subtraction is performed according to the IEC/IEEE
6029| Standard for Binary Floating-Point Arithmetic.
6030*----------------------------------------------------------------------------*/
6031
e5a41ffa
PM
6032static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6033 float_status *status)
158142c2 6034{
f4014512 6035 int32_t aExp, bExp, zExp;
bb98fe42 6036 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
f4014512 6037 int32_t expDiff;
158142c2
FB
6038
6039 aSig1 = extractFloat128Frac1( a );
6040 aSig0 = extractFloat128Frac0( a );
6041 aExp = extractFloat128Exp( a );
6042 bSig1 = extractFloat128Frac1( b );
6043 bSig0 = extractFloat128Frac0( b );
6044 bExp = extractFloat128Exp( b );
6045 expDiff = aExp - bExp;
6046 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6047 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6048 if ( 0 < expDiff ) goto aExpBigger;
6049 if ( expDiff < 0 ) goto bExpBigger;
6050 if ( aExp == 0x7FFF ) {
6051 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6052 return propagateFloat128NaN(a, b, status);
158142c2 6053 }
ff32e16e 6054 float_raise(float_flag_invalid, status);
af39bc8c 6055 return float128_default_nan(status);
158142c2
FB
6056 }
6057 if ( aExp == 0 ) {
6058 aExp = 1;
6059 bExp = 1;
6060 }
6061 if ( bSig0 < aSig0 ) goto aBigger;
6062 if ( aSig0 < bSig0 ) goto bBigger;
6063 if ( bSig1 < aSig1 ) goto aBigger;
6064 if ( aSig1 < bSig1 ) goto bBigger;
a2f2d288
PM
6065 return packFloat128(status->float_rounding_mode == float_round_down,
6066 0, 0, 0);
158142c2
FB
6067 bExpBigger:
6068 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6069 if (bSig0 | bSig1) {
6070 return propagateFloat128NaN(a, b, status);
6071 }
158142c2
FB
6072 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6073 }
6074 if ( aExp == 0 ) {
6075 ++expDiff;
6076 }
6077 else {
6078 aSig0 |= LIT64( 0x4000000000000000 );
6079 }
6080 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6081 bSig0 |= LIT64( 0x4000000000000000 );
6082 bBigger:
6083 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6084 zExp = bExp;
6085 zSign ^= 1;
6086 goto normalizeRoundAndPack;
6087 aExpBigger:
6088 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6089 if (aSig0 | aSig1) {
6090 return propagateFloat128NaN(a, b, status);
6091 }
158142c2
FB
6092 return a;
6093 }
6094 if ( bExp == 0 ) {
6095 --expDiff;
6096 }
6097 else {
6098 bSig0 |= LIT64( 0x4000000000000000 );
6099 }
6100 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6101 aSig0 |= LIT64( 0x4000000000000000 );
6102 aBigger:
6103 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6104 zExp = aExp;
6105 normalizeRoundAndPack:
6106 --zExp;
ff32e16e
PM
6107 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6108 status);
158142c2
FB
6109
6110}
6111
6112/*----------------------------------------------------------------------------
6113| Returns the result of adding the quadruple-precision floating-point values
6114| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6115| for Binary Floating-Point Arithmetic.
6116*----------------------------------------------------------------------------*/
6117
e5a41ffa 6118float128 float128_add(float128 a, float128 b, float_status *status)
158142c2
FB
6119{
6120 flag aSign, bSign;
6121
6122 aSign = extractFloat128Sign( a );
6123 bSign = extractFloat128Sign( b );
6124 if ( aSign == bSign ) {
ff32e16e 6125 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6126 }
6127 else {
ff32e16e 6128 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6129 }
6130
6131}
6132
6133/*----------------------------------------------------------------------------
6134| Returns the result of subtracting the quadruple-precision floating-point
6135| values `a' and `b'. The operation is performed according to the IEC/IEEE
6136| Standard for Binary Floating-Point Arithmetic.
6137*----------------------------------------------------------------------------*/
6138
e5a41ffa 6139float128 float128_sub(float128 a, float128 b, float_status *status)
158142c2
FB
6140{
6141 flag aSign, bSign;
6142
6143 aSign = extractFloat128Sign( a );
6144 bSign = extractFloat128Sign( b );
6145 if ( aSign == bSign ) {
ff32e16e 6146 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6147 }
6148 else {
ff32e16e 6149 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6150 }
6151
6152}
6153
6154/*----------------------------------------------------------------------------
6155| Returns the result of multiplying the quadruple-precision floating-point
6156| values `a' and `b'. The operation is performed according to the IEC/IEEE
6157| Standard for Binary Floating-Point Arithmetic.
6158*----------------------------------------------------------------------------*/
6159
e5a41ffa 6160float128 float128_mul(float128 a, float128 b, float_status *status)
158142c2
FB
6161{
6162 flag aSign, bSign, zSign;
f4014512 6163 int32_t aExp, bExp, zExp;
bb98fe42 6164 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6165
6166 aSig1 = extractFloat128Frac1( a );
6167 aSig0 = extractFloat128Frac0( a );
6168 aExp = extractFloat128Exp( a );
6169 aSign = extractFloat128Sign( a );
6170 bSig1 = extractFloat128Frac1( b );
6171 bSig0 = extractFloat128Frac0( b );
6172 bExp = extractFloat128Exp( b );
6173 bSign = extractFloat128Sign( b );
6174 zSign = aSign ^ bSign;
6175 if ( aExp == 0x7FFF ) {
6176 if ( ( aSig0 | aSig1 )
6177 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6178 return propagateFloat128NaN(a, b, status);
158142c2
FB
6179 }
6180 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6181 return packFloat128( zSign, 0x7FFF, 0, 0 );
6182 }
6183 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6184 if (bSig0 | bSig1) {
6185 return propagateFloat128NaN(a, b, status);
6186 }
158142c2
FB
6187 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6188 invalid:
ff32e16e 6189 float_raise(float_flag_invalid, status);
af39bc8c 6190 return float128_default_nan(status);
158142c2
FB
6191 }
6192 return packFloat128( zSign, 0x7FFF, 0, 0 );
6193 }
6194 if ( aExp == 0 ) {
6195 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6196 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6197 }
6198 if ( bExp == 0 ) {
6199 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6200 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6201 }
6202 zExp = aExp + bExp - 0x4000;
6203 aSig0 |= LIT64( 0x0001000000000000 );
6204 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6205 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6206 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6207 zSig2 |= ( zSig3 != 0 );
6208 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6209 shift128ExtraRightJamming(
6210 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6211 ++zExp;
6212 }
ff32e16e 6213 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6214
6215}
6216
6217/*----------------------------------------------------------------------------
6218| Returns the result of dividing the quadruple-precision floating-point value
6219| `a' by the corresponding value `b'. The operation is performed according to
6220| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6221*----------------------------------------------------------------------------*/
6222
e5a41ffa 6223float128 float128_div(float128 a, float128 b, float_status *status)
158142c2
FB
6224{
6225 flag aSign, bSign, zSign;
f4014512 6226 int32_t aExp, bExp, zExp;
bb98fe42
AF
6227 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6228 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6229
6230 aSig1 = extractFloat128Frac1( a );
6231 aSig0 = extractFloat128Frac0( a );
6232 aExp = extractFloat128Exp( a );
6233 aSign = extractFloat128Sign( a );
6234 bSig1 = extractFloat128Frac1( b );
6235 bSig0 = extractFloat128Frac0( b );
6236 bExp = extractFloat128Exp( b );
6237 bSign = extractFloat128Sign( b );
6238 zSign = aSign ^ bSign;
6239 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6240 if (aSig0 | aSig1) {
6241 return propagateFloat128NaN(a, b, status);
6242 }
158142c2 6243 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6244 if (bSig0 | bSig1) {
6245 return propagateFloat128NaN(a, b, status);
6246 }
158142c2
FB
6247 goto invalid;
6248 }
6249 return packFloat128( zSign, 0x7FFF, 0, 0 );
6250 }
6251 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6252 if (bSig0 | bSig1) {
6253 return propagateFloat128NaN(a, b, status);
6254 }
158142c2
FB
6255 return packFloat128( zSign, 0, 0, 0 );
6256 }
6257 if ( bExp == 0 ) {
6258 if ( ( bSig0 | bSig1 ) == 0 ) {
6259 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6260 invalid:
ff32e16e 6261 float_raise(float_flag_invalid, status);
af39bc8c 6262 return float128_default_nan(status);
158142c2 6263 }
ff32e16e 6264 float_raise(float_flag_divbyzero, status);
158142c2
FB
6265 return packFloat128( zSign, 0x7FFF, 0, 0 );
6266 }
6267 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6268 }
6269 if ( aExp == 0 ) {
6270 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6271 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6272 }
6273 zExp = aExp - bExp + 0x3FFD;
6274 shortShift128Left(
6275 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6276 shortShift128Left(
6277 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6278 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6279 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6280 ++zExp;
6281 }
6282 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6283 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6284 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6285 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6286 --zSig0;
6287 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6288 }
6289 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6290 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6291 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6292 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6293 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6294 --zSig1;
6295 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6296 }
6297 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6298 }
6299 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
ff32e16e 6300 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6301
6302}
6303
6304/*----------------------------------------------------------------------------
6305| Returns the remainder of the quadruple-precision floating-point value `a'
6306| with respect to the corresponding value `b'. The operation is performed
6307| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6308*----------------------------------------------------------------------------*/
6309
e5a41ffa 6310float128 float128_rem(float128 a, float128 b, float_status *status)
158142c2 6311{
ed086f3d 6312 flag aSign, zSign;
f4014512 6313 int32_t aExp, bExp, expDiff;
bb98fe42
AF
6314 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6315 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6316 int64_t sigMean0;
158142c2
FB
6317
6318 aSig1 = extractFloat128Frac1( a );
6319 aSig0 = extractFloat128Frac0( a );
6320 aExp = extractFloat128Exp( a );
6321 aSign = extractFloat128Sign( a );
6322 bSig1 = extractFloat128Frac1( b );
6323 bSig0 = extractFloat128Frac0( b );
6324 bExp = extractFloat128Exp( b );
158142c2
FB
6325 if ( aExp == 0x7FFF ) {
6326 if ( ( aSig0 | aSig1 )
6327 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6328 return propagateFloat128NaN(a, b, status);
158142c2
FB
6329 }
6330 goto invalid;
6331 }
6332 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6333 if (bSig0 | bSig1) {
6334 return propagateFloat128NaN(a, b, status);
6335 }
158142c2
FB
6336 return a;
6337 }
6338 if ( bExp == 0 ) {
6339 if ( ( bSig0 | bSig1 ) == 0 ) {
6340 invalid:
ff32e16e 6341 float_raise(float_flag_invalid, status);
af39bc8c 6342 return float128_default_nan(status);
158142c2
FB
6343 }
6344 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6345 }
6346 if ( aExp == 0 ) {
6347 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6348 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6349 }
6350 expDiff = aExp - bExp;
6351 if ( expDiff < -1 ) return a;
6352 shortShift128Left(
6353 aSig0 | LIT64( 0x0001000000000000 ),
6354 aSig1,
6355 15 - ( expDiff < 0 ),
6356 &aSig0,
6357 &aSig1
6358 );
6359 shortShift128Left(
6360 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6361 q = le128( bSig0, bSig1, aSig0, aSig1 );
6362 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6363 expDiff -= 64;
6364 while ( 0 < expDiff ) {
6365 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6366 q = ( 4 < q ) ? q - 4 : 0;
6367 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6368 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6369 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6370 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6371 expDiff -= 61;
6372 }
6373 if ( -64 < expDiff ) {
6374 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6375 q = ( 4 < q ) ? q - 4 : 0;
6376 q >>= - expDiff;
6377 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6378 expDiff += 52;
6379 if ( expDiff < 0 ) {
6380 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6381 }
6382 else {
6383 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6384 }
6385 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6386 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6387 }
6388 else {
6389 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6390 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6391 }
6392 do {
6393 alternateASig0 = aSig0;
6394 alternateASig1 = aSig1;
6395 ++q;
6396 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6397 } while ( 0 <= (int64_t) aSig0 );
158142c2 6398 add128(
bb98fe42 6399 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6400 if ( ( sigMean0 < 0 )
6401 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6402 aSig0 = alternateASig0;
6403 aSig1 = alternateASig1;
6404 }
bb98fe42 6405 zSign = ( (int64_t) aSig0 < 0 );
158142c2 6406 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
ff32e16e
PM
6407 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6408 status);
158142c2
FB
6409}
6410
6411/*----------------------------------------------------------------------------
6412| Returns the square root of the quadruple-precision floating-point value `a'.
6413| The operation is performed according to the IEC/IEEE Standard for Binary
6414| Floating-Point Arithmetic.
6415*----------------------------------------------------------------------------*/
6416
e5a41ffa 6417float128 float128_sqrt(float128 a, float_status *status)
158142c2
FB
6418{
6419 flag aSign;
f4014512 6420 int32_t aExp, zExp;
bb98fe42
AF
6421 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6422 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6423
6424 aSig1 = extractFloat128Frac1( a );
6425 aSig0 = extractFloat128Frac0( a );
6426 aExp = extractFloat128Exp( a );
6427 aSign = extractFloat128Sign( a );
6428 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6429 if (aSig0 | aSig1) {
6430 return propagateFloat128NaN(a, a, status);
6431 }
158142c2
FB
6432 if ( ! aSign ) return a;
6433 goto invalid;
6434 }
6435 if ( aSign ) {
6436 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6437 invalid:
ff32e16e 6438 float_raise(float_flag_invalid, status);
af39bc8c 6439 return float128_default_nan(status);
158142c2
FB
6440 }
6441 if ( aExp == 0 ) {
6442 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6443 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6444 }
6445 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6446 aSig0 |= LIT64( 0x0001000000000000 );
6447 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6448 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6449 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6450 doubleZSig0 = zSig0<<1;
6451 mul64To128( zSig0, zSig0, &term0, &term1 );
6452 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6453 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6454 --zSig0;
6455 doubleZSig0 -= 2;
6456 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6457 }
6458 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6459 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6460 if ( zSig1 == 0 ) zSig1 = 1;
6461 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6462 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6463 mul64To128( zSig1, zSig1, &term2, &term3 );
6464 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6465 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6466 --zSig1;
6467 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6468 term3 |= 1;
6469 term2 |= doubleZSig0;
6470 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6471 }
6472 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6473 }
6474 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
ff32e16e 6475 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6476
6477}
6478
6479/*----------------------------------------------------------------------------
6480| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6481| the corresponding value `b', and 0 otherwise. The invalid exception is
6482| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6483| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6484*----------------------------------------------------------------------------*/
6485
e5a41ffa 6486int float128_eq(float128 a, float128 b, float_status *status)
158142c2
FB
6487{
6488
6489 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6490 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6491 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6492 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6493 ) {
ff32e16e 6494 float_raise(float_flag_invalid, status);
158142c2
FB
6495 return 0;
6496 }
6497 return
6498 ( a.low == b.low )
6499 && ( ( a.high == b.high )
6500 || ( ( a.low == 0 )
bb98fe42 6501 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6502 );
6503
6504}
6505
6506/*----------------------------------------------------------------------------
6507| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6508| or equal to the corresponding value `b', and 0 otherwise. The invalid
6509| exception is raised if either operand is a NaN. The comparison is performed
6510| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6511*----------------------------------------------------------------------------*/
6512
e5a41ffa 6513int float128_le(float128 a, float128 b, float_status *status)
158142c2
FB
6514{
6515 flag aSign, bSign;
6516
6517 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6518 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6519 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6520 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6521 ) {
ff32e16e 6522 float_raise(float_flag_invalid, status);
158142c2
FB
6523 return 0;
6524 }
6525 aSign = extractFloat128Sign( a );
6526 bSign = extractFloat128Sign( b );
6527 if ( aSign != bSign ) {
6528 return
6529 aSign
bb98fe42 6530 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6531 == 0 );
6532 }
6533 return
6534 aSign ? le128( b.high, b.low, a.high, a.low )
6535 : le128( a.high, a.low, b.high, b.low );
6536
6537}
6538
6539/*----------------------------------------------------------------------------
6540| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6541| the corresponding value `b', and 0 otherwise. The invalid exception is
6542| raised if either operand is a NaN. The comparison is performed according
6543| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6544*----------------------------------------------------------------------------*/
6545
e5a41ffa 6546int float128_lt(float128 a, float128 b, float_status *status)
158142c2
FB
6547{
6548 flag aSign, bSign;
6549
6550 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6551 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6552 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6553 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6554 ) {
ff32e16e 6555 float_raise(float_flag_invalid, status);
158142c2
FB
6556 return 0;
6557 }
6558 aSign = extractFloat128Sign( a );
6559 bSign = extractFloat128Sign( b );
6560 if ( aSign != bSign ) {
6561 return
6562 aSign
bb98fe42 6563 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6564 != 0 );
6565 }
6566 return
6567 aSign ? lt128( b.high, b.low, a.high, a.low )
6568 : lt128( a.high, a.low, b.high, b.low );
6569
6570}
6571
67b7861d
AJ
6572/*----------------------------------------------------------------------------
6573| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6574| be compared, and 0 otherwise. The invalid exception is raised if either
6575| operand is a NaN. The comparison is performed according to the IEC/IEEE
6576| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6577*----------------------------------------------------------------------------*/
6578
e5a41ffa 6579int float128_unordered(float128 a, float128 b, float_status *status)
67b7861d
AJ
6580{
6581 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6582 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6583 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6584 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6585 ) {
ff32e16e 6586 float_raise(float_flag_invalid, status);
67b7861d
AJ
6587 return 1;
6588 }
6589 return 0;
6590}
6591
158142c2
FB
6592/*----------------------------------------------------------------------------
6593| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6594| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6595| exception. The comparison is performed according to the IEC/IEEE Standard
6596| for Binary Floating-Point Arithmetic.
158142c2
FB
6597*----------------------------------------------------------------------------*/
6598
e5a41ffa 6599int float128_eq_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6600{
6601
6602 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6603 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6604 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6605 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6606 ) {
af39bc8c
AM
6607 if (float128_is_signaling_nan(a, status)
6608 || float128_is_signaling_nan(b, status)) {
ff32e16e 6609 float_raise(float_flag_invalid, status);
b689362d 6610 }
158142c2
FB
6611 return 0;
6612 }
6613 return
6614 ( a.low == b.low )
6615 && ( ( a.high == b.high )
6616 || ( ( a.low == 0 )
bb98fe42 6617 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6618 );
6619
6620}
6621
6622/*----------------------------------------------------------------------------
6623| Returns 1 if the quadruple-precision floating-point value `a' is less than
6624| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6625| cause an exception. Otherwise, the comparison is performed according to the
6626| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6627*----------------------------------------------------------------------------*/
6628
e5a41ffa 6629int float128_le_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6630{
6631 flag aSign, bSign;
6632
6633 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6634 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6635 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6636 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6637 ) {
af39bc8c
AM
6638 if (float128_is_signaling_nan(a, status)
6639 || float128_is_signaling_nan(b, status)) {
ff32e16e 6640 float_raise(float_flag_invalid, status);
158142c2
FB
6641 }
6642 return 0;
6643 }
6644 aSign = extractFloat128Sign( a );
6645 bSign = extractFloat128Sign( b );
6646 if ( aSign != bSign ) {
6647 return
6648 aSign
bb98fe42 6649 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6650 == 0 );
6651 }
6652 return
6653 aSign ? le128( b.high, b.low, a.high, a.low )
6654 : le128( a.high, a.low, b.high, b.low );
6655
6656}
6657
6658/*----------------------------------------------------------------------------
6659| Returns 1 if the quadruple-precision floating-point value `a' is less than
6660| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6661| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6662| Standard for Binary Floating-Point Arithmetic.
6663*----------------------------------------------------------------------------*/
6664
e5a41ffa 6665int float128_lt_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6666{
6667 flag aSign, bSign;
6668
6669 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6670 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6671 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6672 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6673 ) {
af39bc8c
AM
6674 if (float128_is_signaling_nan(a, status)
6675 || float128_is_signaling_nan(b, status)) {
ff32e16e 6676 float_raise(float_flag_invalid, status);
158142c2
FB
6677 }
6678 return 0;
6679 }
6680 aSign = extractFloat128Sign( a );
6681 bSign = extractFloat128Sign( b );
6682 if ( aSign != bSign ) {
6683 return
6684 aSign
bb98fe42 6685 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6686 != 0 );
6687 }
6688 return
6689 aSign ? lt128( b.high, b.low, a.high, a.low )
6690 : lt128( a.high, a.low, b.high, b.low );
6691
6692}
6693
67b7861d
AJ
6694/*----------------------------------------------------------------------------
6695| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6696| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6697| comparison is performed according to the IEC/IEEE Standard for Binary
6698| Floating-Point Arithmetic.
6699*----------------------------------------------------------------------------*/
6700
e5a41ffa 6701int float128_unordered_quiet(float128 a, float128 b, float_status *status)
67b7861d
AJ
6702{
6703 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6704 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6705 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6706 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6707 ) {
af39bc8c
AM
6708 if (float128_is_signaling_nan(a, status)
6709 || float128_is_signaling_nan(b, status)) {
ff32e16e 6710 float_raise(float_flag_invalid, status);
67b7861d
AJ
6711 }
6712 return 1;
6713 }
6714 return 0;
6715}
6716
1d6bda35 6717/* misc functions */
e5a41ffa 6718float32 uint32_to_float32(uint32_t a, float_status *status)
1d6bda35 6719{
ff32e16e 6720 return int64_to_float32(a, status);
1d6bda35
FB
6721}
6722
e5a41ffa 6723float64 uint32_to_float64(uint32_t a, float_status *status)
1d6bda35 6724{
ff32e16e 6725 return int64_to_float64(a, status);
1d6bda35
FB
6726}
6727
75d62a58 6728
75d62a58 6729
1d6bda35 6730#define COMPARE(s, nan_exp) \
e5a41ffa
PM
6731static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
6732 int is_quiet, float_status *status) \
1d6bda35
FB
6733{ \
6734 flag aSign, bSign; \
bb98fe42 6735 uint ## s ## _t av, bv; \
ff32e16e
PM
6736 a = float ## s ## _squash_input_denormal(a, status); \
6737 b = float ## s ## _squash_input_denormal(b, status); \
1d6bda35
FB
6738 \
6739 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6740 extractFloat ## s ## Frac( a ) ) || \
6741 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6742 extractFloat ## s ## Frac( b ) )) { \
6743 if (!is_quiet || \
af39bc8c
AM
6744 float ## s ## _is_signaling_nan(a, status) || \
6745 float ## s ## _is_signaling_nan(b, status)) { \
ff32e16e 6746 float_raise(float_flag_invalid, status); \
1d6bda35
FB
6747 } \
6748 return float_relation_unordered; \
6749 } \
6750 aSign = extractFloat ## s ## Sign( a ); \
6751 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 6752 av = float ## s ## _val(a); \
cd8a2533 6753 bv = float ## s ## _val(b); \
1d6bda35 6754 if ( aSign != bSign ) { \
bb98fe42 6755 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
6756 /* zero case */ \
6757 return float_relation_equal; \
6758 } else { \
6759 return 1 - (2 * aSign); \
6760 } \
6761 } else { \
f090c9d4 6762 if (av == bv) { \
1d6bda35
FB
6763 return float_relation_equal; \
6764 } else { \
f090c9d4 6765 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
6766 } \
6767 } \
6768} \
6769 \
e5a41ffa 6770int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
1d6bda35 6771{ \
ff32e16e 6772 return float ## s ## _compare_internal(a, b, 0, status); \
1d6bda35
FB
6773} \
6774 \
e5a41ffa
PM
6775int float ## s ## _compare_quiet(float ## s a, float ## s b, \
6776 float_status *status) \
1d6bda35 6777{ \
ff32e16e 6778 return float ## s ## _compare_internal(a, b, 1, status); \
1d6bda35
FB
6779}
6780
6781COMPARE(32, 0xff)
6782COMPARE(64, 0x7ff)
9ee6e8bb 6783
e5a41ffa
PM
6784static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6785 int is_quiet, float_status *status)
f6714d36
AJ
6786{
6787 flag aSign, bSign;
6788
d1eb8f2a
AD
6789 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6790 float_raise(float_flag_invalid, status);
6791 return float_relation_unordered;
6792 }
f6714d36
AJ
6793 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6794 ( extractFloatx80Frac( a )<<1 ) ) ||
6795 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6796 ( extractFloatx80Frac( b )<<1 ) )) {
6797 if (!is_quiet ||
af39bc8c
AM
6798 floatx80_is_signaling_nan(a, status) ||
6799 floatx80_is_signaling_nan(b, status)) {
ff32e16e 6800 float_raise(float_flag_invalid, status);
f6714d36
AJ
6801 }
6802 return float_relation_unordered;
6803 }
6804 aSign = extractFloatx80Sign( a );
6805 bSign = extractFloatx80Sign( b );
6806 if ( aSign != bSign ) {
6807
6808 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6809 ( ( a.low | b.low ) == 0 ) ) {
6810 /* zero case */
6811 return float_relation_equal;
6812 } else {
6813 return 1 - (2 * aSign);
6814 }
6815 } else {
6816 if (a.low == b.low && a.high == b.high) {
6817 return float_relation_equal;
6818 } else {
6819 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6820 }
6821 }
6822}
6823
e5a41ffa 6824int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
f6714d36 6825{
ff32e16e 6826 return floatx80_compare_internal(a, b, 0, status);
f6714d36
AJ
6827}
6828
e5a41ffa 6829int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
f6714d36 6830{
ff32e16e 6831 return floatx80_compare_internal(a, b, 1, status);
f6714d36
AJ
6832}
6833
e5a41ffa
PM
6834static inline int float128_compare_internal(float128 a, float128 b,
6835 int is_quiet, float_status *status)
1f587329
BS
6836{
6837 flag aSign, bSign;
6838
6839 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6840 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6841 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6842 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6843 if (!is_quiet ||
af39bc8c
AM
6844 float128_is_signaling_nan(a, status) ||
6845 float128_is_signaling_nan(b, status)) {
ff32e16e 6846 float_raise(float_flag_invalid, status);
1f587329
BS
6847 }
6848 return float_relation_unordered;
6849 }
6850 aSign = extractFloat128Sign( a );
6851 bSign = extractFloat128Sign( b );
6852 if ( aSign != bSign ) {
6853 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6854 /* zero case */
6855 return float_relation_equal;
6856 } else {
6857 return 1 - (2 * aSign);
6858 }
6859 } else {
6860 if (a.low == b.low && a.high == b.high) {
6861 return float_relation_equal;
6862 } else {
6863 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6864 }
6865 }
6866}
6867
e5a41ffa 6868int float128_compare(float128 a, float128 b, float_status *status)
1f587329 6869{
ff32e16e 6870 return float128_compare_internal(a, b, 0, status);
1f587329
BS
6871}
6872
e5a41ffa 6873int float128_compare_quiet(float128 a, float128 b, float_status *status)
1f587329 6874{
ff32e16e 6875 return float128_compare_internal(a, b, 1, status);
1f587329
BS
6876}
6877
274f1b04
PM
6878/* min() and max() functions. These can't be implemented as
6879 * 'compare and pick one input' because that would mishandle
6880 * NaNs and +0 vs -0.
e17ab310
WN
6881 *
6882 * minnum() and maxnum() functions. These are similar to the min()
6883 * and max() functions but if one of the arguments is a QNaN and
6884 * the other is numerical then the numerical argument is returned.
6885 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
6886 * and maxNum() operations. min() and max() are the typical min/max
6887 * semantics provided by many CPUs which predate that specification.
2d31e060
LA
6888 *
6889 * minnummag() and maxnummag() functions correspond to minNumMag()
6890 * and maxNumMag() from IEEE 754-2008.
274f1b04 6891 */
e70614ea 6892#define MINMAX(s) \
a49db98d 6893static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
2d31e060 6894 int ismin, int isieee, \
e5a41ffa
PM
6895 int ismag, \
6896 float_status *status) \
274f1b04
PM
6897{ \
6898 flag aSign, bSign; \
2d31e060 6899 uint ## s ## _t av, bv, aav, abv; \
ff32e16e
PM
6900 a = float ## s ## _squash_input_denormal(a, status); \
6901 b = float ## s ## _squash_input_denormal(b, status); \
274f1b04
PM
6902 if (float ## s ## _is_any_nan(a) || \
6903 float ## s ## _is_any_nan(b)) { \
e17ab310 6904 if (isieee) { \
af39bc8c 6905 if (float ## s ## _is_quiet_nan(a, status) && \
e17ab310
WN
6906 !float ## s ##_is_any_nan(b)) { \
6907 return b; \
af39bc8c
AM
6908 } else if (float ## s ## _is_quiet_nan(b, status) && \
6909 !float ## s ## _is_any_nan(a)) { \
e17ab310
WN
6910 return a; \
6911 } \
6912 } \
ff32e16e 6913 return propagateFloat ## s ## NaN(a, b, status); \
274f1b04
PM
6914 } \
6915 aSign = extractFloat ## s ## Sign(a); \
6916 bSign = extractFloat ## s ## Sign(b); \
6917 av = float ## s ## _val(a); \
6918 bv = float ## s ## _val(b); \
2d31e060
LA
6919 if (ismag) { \
6920 aav = float ## s ## _abs(av); \
6921 abv = float ## s ## _abs(bv); \
6922 if (aav != abv) { \
6923 if (ismin) { \
6924 return (aav < abv) ? a : b; \
6925 } else { \
6926 return (aav < abv) ? b : a; \
6927 } \
6928 } \
6929 } \
274f1b04
PM
6930 if (aSign != bSign) { \
6931 if (ismin) { \
6932 return aSign ? a : b; \
6933 } else { \
6934 return aSign ? b : a; \
6935 } \
6936 } else { \
6937 if (ismin) { \
6938 return (aSign ^ (av < bv)) ? a : b; \
6939 } else { \
6940 return (aSign ^ (av < bv)) ? b : a; \
6941 } \
6942 } \
6943} \
6944 \
e5a41ffa
PM
6945float ## s float ## s ## _min(float ## s a, float ## s b, \
6946 float_status *status) \
274f1b04 6947{ \
ff32e16e 6948 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
274f1b04
PM
6949} \
6950 \
e5a41ffa
PM
6951float ## s float ## s ## _max(float ## s a, float ## s b, \
6952 float_status *status) \
274f1b04 6953{ \
ff32e16e 6954 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
e17ab310
WN
6955} \
6956 \
e5a41ffa
PM
6957float ## s float ## s ## _minnum(float ## s a, float ## s b, \
6958 float_status *status) \
e17ab310 6959{ \
ff32e16e 6960 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
e17ab310
WN
6961} \
6962 \
e5a41ffa
PM
6963float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
6964 float_status *status) \
e17ab310 6965{ \
ff32e16e 6966 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
2d31e060
LA
6967} \
6968 \
e5a41ffa
PM
6969float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
6970 float_status *status) \
2d31e060 6971{ \
ff32e16e 6972 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
2d31e060
LA
6973} \
6974 \
e5a41ffa
PM
6975float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
6976 float_status *status) \
2d31e060 6977{ \
ff32e16e 6978 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
274f1b04
PM
6979}
6980
e70614ea
WN
6981MINMAX(32)
6982MINMAX(64)
274f1b04
PM
6983
6984
9ee6e8bb 6985/* Multiply A by 2 raised to the power N. */
e5a41ffa 6986float32 float32_scalbn(float32 a, int n, float_status *status)
9ee6e8bb
PB
6987{
6988 flag aSign;
326b9e98 6989 int16_t aExp;
bb98fe42 6990 uint32_t aSig;
9ee6e8bb 6991
ff32e16e 6992 a = float32_squash_input_denormal(a, status);
9ee6e8bb
PB
6993 aSig = extractFloat32Frac( a );
6994 aExp = extractFloat32Exp( a );
6995 aSign = extractFloat32Sign( a );
6996
6997 if ( aExp == 0xFF ) {
326b9e98 6998 if ( aSig ) {
ff32e16e 6999 return propagateFloat32NaN(a, a, status);
326b9e98 7000 }
9ee6e8bb
PB
7001 return a;
7002 }
3c85c37f 7003 if (aExp != 0) {
69397542 7004 aSig |= 0x00800000;
3c85c37f 7005 } else if (aSig == 0) {
69397542 7006 return a;
3c85c37f
PM
7007 } else {
7008 aExp++;
7009 }
69397542 7010
326b9e98
AJ
7011 if (n > 0x200) {
7012 n = 0x200;
7013 } else if (n < -0x200) {
7014 n = -0x200;
7015 }
7016
69397542
PB
7017 aExp += n - 1;
7018 aSig <<= 7;
ff32e16e 7019 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
9ee6e8bb
PB
7020}
7021
e5a41ffa 7022float64 float64_scalbn(float64 a, int n, float_status *status)
9ee6e8bb
PB
7023{
7024 flag aSign;
326b9e98 7025 int16_t aExp;
bb98fe42 7026 uint64_t aSig;
9ee6e8bb 7027
ff32e16e 7028 a = float64_squash_input_denormal(a, status);
9ee6e8bb
PB
7029 aSig = extractFloat64Frac( a );
7030 aExp = extractFloat64Exp( a );
7031 aSign = extractFloat64Sign( a );
7032
7033 if ( aExp == 0x7FF ) {
326b9e98 7034 if ( aSig ) {
ff32e16e 7035 return propagateFloat64NaN(a, a, status);
326b9e98 7036 }
9ee6e8bb
PB
7037 return a;
7038 }
3c85c37f 7039 if (aExp != 0) {
69397542 7040 aSig |= LIT64( 0x0010000000000000 );
3c85c37f 7041 } else if (aSig == 0) {
69397542 7042 return a;
3c85c37f
PM
7043 } else {
7044 aExp++;
7045 }
69397542 7046
326b9e98
AJ
7047 if (n > 0x1000) {
7048 n = 0x1000;
7049 } else if (n < -0x1000) {
7050 n = -0x1000;
7051 }
7052
69397542
PB
7053 aExp += n - 1;
7054 aSig <<= 10;
ff32e16e 7055 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
9ee6e8bb
PB
7056}
7057
e5a41ffa 7058floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
9ee6e8bb
PB
7059{
7060 flag aSign;
326b9e98 7061 int32_t aExp;
bb98fe42 7062 uint64_t aSig;
9ee6e8bb 7063
d1eb8f2a
AD
7064 if (floatx80_invalid_encoding(a)) {
7065 float_raise(float_flag_invalid, status);
7066 return floatx80_default_nan(status);
7067 }
9ee6e8bb
PB
7068 aSig = extractFloatx80Frac( a );
7069 aExp = extractFloatx80Exp( a );
7070 aSign = extractFloatx80Sign( a );
7071
326b9e98
AJ
7072 if ( aExp == 0x7FFF ) {
7073 if ( aSig<<1 ) {
ff32e16e 7074 return propagateFloatx80NaN(a, a, status);
326b9e98 7075 }
9ee6e8bb
PB
7076 return a;
7077 }
326b9e98 7078
3c85c37f
PM
7079 if (aExp == 0) {
7080 if (aSig == 0) {
7081 return a;
7082 }
7083 aExp++;
7084 }
69397542 7085
326b9e98
AJ
7086 if (n > 0x10000) {
7087 n = 0x10000;
7088 } else if (n < -0x10000) {
7089 n = -0x10000;
7090 }
7091
9ee6e8bb 7092 aExp += n;
a2f2d288
PM
7093 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7094 aSign, aExp, aSig, 0, status);
9ee6e8bb 7095}
9ee6e8bb 7096
e5a41ffa 7097float128 float128_scalbn(float128 a, int n, float_status *status)
9ee6e8bb
PB
7098{
7099 flag aSign;
326b9e98 7100 int32_t aExp;
bb98fe42 7101 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7102
7103 aSig1 = extractFloat128Frac1( a );
7104 aSig0 = extractFloat128Frac0( a );
7105 aExp = extractFloat128Exp( a );
7106 aSign = extractFloat128Sign( a );
7107 if ( aExp == 0x7FFF ) {
326b9e98 7108 if ( aSig0 | aSig1 ) {
ff32e16e 7109 return propagateFloat128NaN(a, a, status);
326b9e98 7110 }
9ee6e8bb
PB
7111 return a;
7112 }
3c85c37f 7113 if (aExp != 0) {
69397542 7114 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7115 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7116 return a;
3c85c37f
PM
7117 } else {
7118 aExp++;
7119 }
69397542 7120
326b9e98
AJ
7121 if (n > 0x10000) {
7122 n = 0x10000;
7123 } else if (n < -0x10000) {
7124 n = -0x10000;
7125 }
7126
69397542
PB
7127 aExp += n - 1;
7128 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
ff32e16e 7129 , status);
9ee6e8bb
PB
7130
7131}