]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
fpu/softfloat: re-factor div
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
16017c48
PM
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
8d725fac 16 */
158142c2 17
a7d1ac78
PM
18/*
19===============================================================================
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
158142c2
FB
22
23Written by John R. Hauser. This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704. Funding was partially provided by the
26National Science Foundation under grant MIP-9311980. The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek. More information
a7d1ac78 30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
158142c2
FB
31arithmetic/SoftFloat.html'.
32
a7d1ac78
PM
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
158142c2
FB
38
39Derivative works are acceptable, even for commercial purposes, so long as
a7d1ac78
PM
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
158142c2 43
a7d1ac78
PM
44===============================================================================
45*/
158142c2 46
16017c48
PM
47/* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78/* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
2ac8bd03
PM
82/* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
d38ea87a 85#include "qemu/osdep.h"
6fff2167 86#include "qemu/bitops.h"
6b4c305c 87#include "fpu/softfloat.h"
158142c2 88
dc355b76 89/* We only need stdlib for abort() */
dc355b76 90
158142c2
FB
91/*----------------------------------------------------------------------------
92| Primitive arithmetic functions, including multi-word arithmetic, and
93| division and square root approximations. (Can be specialized to target if
94| desired.)
95*----------------------------------------------------------------------------*/
96#include "softfloat-macros.h"
97
98/*----------------------------------------------------------------------------
99| Functions and definitions to determine: (1) whether tininess for underflow
100| is detected before or after rounding by default, (2) what (if anything)
101| happens when exceptions are raised, (3) how signaling NaNs are distinguished
102| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103| are propagated from function inputs to output. These details are target-
104| specific.
105*----------------------------------------------------------------------------*/
106#include "softfloat-specialize.h"
107
bb4d4bb3
PM
108/*----------------------------------------------------------------------------
109| Returns the fraction bits of the half-precision floating-point value `a'.
110*----------------------------------------------------------------------------*/
111
a49db98d 112static inline uint32_t extractFloat16Frac(float16 a)
bb4d4bb3
PM
113{
114 return float16_val(a) & 0x3ff;
115}
116
117/*----------------------------------------------------------------------------
118| Returns the exponent bits of the half-precision floating-point value `a'.
119*----------------------------------------------------------------------------*/
120
0c48262d 121static inline int extractFloat16Exp(float16 a)
bb4d4bb3
PM
122{
123 return (float16_val(a) >> 10) & 0x1f;
124}
125
126/*----------------------------------------------------------------------------
127| Returns the sign bit of the single-precision floating-point value `a'.
128*----------------------------------------------------------------------------*/
129
a49db98d 130static inline flag extractFloat16Sign(float16 a)
bb4d4bb3
PM
131{
132 return float16_val(a)>>15;
133}
134
d97544c9
AB
135/*----------------------------------------------------------------------------
136| Returns the fraction bits of the single-precision floating-point value `a'.
137*----------------------------------------------------------------------------*/
138
139static inline uint32_t extractFloat32Frac(float32 a)
140{
141 return float32_val(a) & 0x007FFFFF;
142}
143
144/*----------------------------------------------------------------------------
145| Returns the exponent bits of the single-precision floating-point value `a'.
146*----------------------------------------------------------------------------*/
147
148static inline int extractFloat32Exp(float32 a)
149{
150 return (float32_val(a) >> 23) & 0xFF;
151}
152
153/*----------------------------------------------------------------------------
154| Returns the sign bit of the single-precision floating-point value `a'.
155*----------------------------------------------------------------------------*/
156
157static inline flag extractFloat32Sign(float32 a)
158{
159 return float32_val(a) >> 31;
160}
161
162/*----------------------------------------------------------------------------
163| Returns the fraction bits of the double-precision floating-point value `a'.
164*----------------------------------------------------------------------------*/
165
166static inline uint64_t extractFloat64Frac(float64 a)
167{
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169}
170
171/*----------------------------------------------------------------------------
172| Returns the exponent bits of the double-precision floating-point value `a'.
173*----------------------------------------------------------------------------*/
174
175static inline int extractFloat64Exp(float64 a)
176{
177 return (float64_val(a) >> 52) & 0x7FF;
178}
179
180/*----------------------------------------------------------------------------
181| Returns the sign bit of the double-precision floating-point value `a'.
182*----------------------------------------------------------------------------*/
183
184static inline flag extractFloat64Sign(float64 a)
185{
186 return float64_val(a) >> 63;
187}
188
a90119b5
AB
/*
 * Classification of a decomposed floating-point number.  The enumerators
 * are ordered so that every NaN flavour sorts at or after
 * float_class_qnan; "cls >= float_class_qnan" therefore matches any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,   /* freshly unpacked, not yet classified */
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,   /* all NaNs from here */
    float_class_snan         = 5,
    float_class_dnan         = 6,   /* packed as the format's default NaN */
    float_class_msnan        = 7,   /* maybe silenced when packed */
} FloatClass;
204
205/*
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
210 *
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
214 */
215
216typedef struct {
217 uint64_t frac;
218 int32_t exp;
219 FloatClass cls;
220 bool sign;
221} FloatParts;
222
223#define DECOMPOSED_BINARY_POINT (64 - 2)
224#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225#define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
226
/* Parameters describing one floating-point format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction:
 *   frac_lsb: least significant bit of the fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 */
typedef struct {
    /* exponent field */
    int exp_size;
    int exp_bias;
    int exp_max;
    /* fraction field */
    int frac_size;
    int frac_shift;
    /* rounding helpers derived from frac_size */
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
} FloatFmt;
249
/*
 * Expand to the designated initializers of a FloatFmt for a format with
 * E exponent bits and F fraction bits.  Both arguments are parenthesized
 * in the expansion so that expression arguments (e.g. "12 - 2") are
 * evaluated as a unit.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
261
262static const FloatFmt float16_params = {
263 FLOAT_PARAMS(5, 10)
264};
265
266static const FloatFmt float32_params = {
267 FLOAT_PARAMS(8, 23)
268};
269
270static const FloatFmt float64_params = {
271 FLOAT_PARAMS(11, 52)
272};
273
6fff2167
AB
274/* Unpack a float to parts, but do not canonicalize. */
275static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276{
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
278
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
284 };
285}
286
287static inline FloatParts float16_unpack_raw(float16 f)
288{
289 return unpack_raw(float16_params, f);
290}
291
292static inline FloatParts float32_unpack_raw(float32 f)
293{
294 return unpack_raw(float32_params, f);
295}
296
297static inline FloatParts float64_unpack_raw(float64 f)
298{
299 return unpack_raw(float64_params, f);
300}
301
302/* Pack a float from parts, but do not canonicalize. */
303static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304{
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
308}
309
310static inline float16 float16_pack_raw(FloatParts p)
311{
312 return make_float16(pack_raw(float16_params, p));
313}
314
315static inline float32 float32_pack_raw(FloatParts p)
316{
317 return make_float32(pack_raw(float32_params, p));
318}
319
320static inline float64 float64_pack_raw(FloatParts p)
321{
322 return make_float64(pack_raw(float64_params, p));
323}
324
325/* Canonicalize EXP and FRAC, setting CLS. */
326static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
328{
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
332 } else {
333#ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
335#else
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
339 } else {
340 part.cls = float_class_qnan;
341 }
342#endif
343 }
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
350 part.frac = 0;
351 } else {
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355 part.frac <<= shift;
356 }
357 } else {
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361 }
362 return part;
363}
364
365/* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369 */
370
371static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
373{
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
379 uint64_t frac, inc;
380 int exp, flags = 0;
381 bool overflow_norm;
382
383 frac = p.frac;
384 exp = p.exp;
385
386 switch (p.cls) {
387 case float_class_normal:
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392 break;
393 case float_round_ties_away:
394 overflow_norm = false;
395 inc = frac_lsbm1;
396 break;
397 case float_round_to_zero:
398 overflow_norm = true;
399 inc = 0;
400 break;
401 case float_round_up:
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
404 break;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
408 break;
409 default:
410 g_assert_not_reached();
411 }
412
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
417 frac += inc;
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419 frac >>= 1;
420 exp++;
421 }
422 }
423 frac >>= frac_shift;
424
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
427 if (overflow_norm) {
428 exp = exp_max - 1;
429 frac = -1;
430 } else {
431 p.cls = float_class_inf;
432 goto do_inf;
433 }
434 }
435 } else if (s->flush_to_zero) {
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
438 goto do_zero;
439 } else {
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
442 || (exp < 0)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
450 ? frac_lsbm1 : 0);
451 }
452 flags |= float_flag_inexact;
453 frac += inc;
454 }
455
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457 frac >>= frac_shift;
458
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
461 }
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
464 }
465 }
466 break;
467
468 case float_class_zero:
469 do_zero:
470 exp = 0;
471 frac = 0;
472 break;
473
474 case float_class_inf:
475 do_inf:
476 exp = exp_max;
477 frac = 0;
478 break;
479
480 case float_class_qnan:
481 case float_class_snan:
482 exp = exp_max;
483 break;
484
485 default:
486 g_assert_not_reached();
487 }
488
489 float_raise(flags, s);
490 p.exp = exp;
491 p.frac = frac;
492 return p;
493}
494
495static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496{
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
498}
499
500static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501{
502 switch (p.cls) {
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
507 default:
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
510 }
511}
512
513static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514{
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
516}
517
518static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519{
520 switch (p.cls) {
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
525 default:
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
528 }
529}
530
531static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532{
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
534}
535
536static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537{
538 switch (p.cls) {
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
543 default:
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
546 }
547}
548
549/* Simple helpers for checking if what NaN we have */
550static bool is_nan(FloatClass c)
551{
552 return unlikely(c >= float_class_qnan);
553}
554static bool is_snan(FloatClass c)
555{
556 return c == float_class_snan;
557}
558static bool is_qnan(FloatClass c)
559{
560 return c == float_class_qnan;
561}
562
563static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
564{
565 if (is_snan(a.cls) || is_snan(b.cls)) {
566 s->float_exception_flags |= float_flag_invalid;
567 }
568
569 if (s->default_nan_mode) {
570 a.cls = float_class_dnan;
571 } else {
572 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
573 is_qnan(b.cls), is_snan(b.cls),
574 a.frac > b.frac ||
575 (a.frac == b.frac && a.sign < b.sign))) {
576 a = b;
577 }
578 a.cls = float_class_msnan;
579 }
580 return a;
581}
582
583/*
584 * Returns the result of adding or subtracting the values of the
585 * floating-point values `a' and `b'. The operation is performed
586 * according to the IEC/IEEE Standard for Binary Floating-Point
587 * Arithmetic.
588 */
589
590static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
591 float_status *s)
592{
593 bool a_sign = a.sign;
594 bool b_sign = b.sign ^ subtract;
595
596 if (a_sign != b_sign) {
597 /* Subtraction */
598
599 if (a.cls == float_class_normal && b.cls == float_class_normal) {
600 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
601 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
602 a.frac = a.frac - b.frac;
603 } else {
604 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
605 a.frac = b.frac - a.frac;
606 a.exp = b.exp;
607 a_sign ^= 1;
608 }
609
610 if (a.frac == 0) {
611 a.cls = float_class_zero;
612 a.sign = s->float_rounding_mode == float_round_down;
613 } else {
614 int shift = clz64(a.frac) - 1;
615 a.frac = a.frac << shift;
616 a.exp = a.exp - shift;
617 a.sign = a_sign;
618 }
619 return a;
620 }
621 if (is_nan(a.cls) || is_nan(b.cls)) {
622 return pick_nan(a, b, s);
623 }
624 if (a.cls == float_class_inf) {
625 if (b.cls == float_class_inf) {
626 float_raise(float_flag_invalid, s);
627 a.cls = float_class_dnan;
628 }
629 return a;
630 }
631 if (a.cls == float_class_zero && b.cls == float_class_zero) {
632 a.sign = s->float_rounding_mode == float_round_down;
633 return a;
634 }
635 if (a.cls == float_class_zero || b.cls == float_class_inf) {
636 b.sign = a_sign ^ 1;
637 return b;
638 }
639 if (b.cls == float_class_zero) {
640 return a;
641 }
642 } else {
643 /* Addition */
644 if (a.cls == float_class_normal && b.cls == float_class_normal) {
645 if (a.exp > b.exp) {
646 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
647 } else if (a.exp < b.exp) {
648 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
649 a.exp = b.exp;
650 }
651 a.frac += b.frac;
652 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
653 a.frac >>= 1;
654 a.exp += 1;
655 }
656 return a;
657 }
658 if (is_nan(a.cls) || is_nan(b.cls)) {
659 return pick_nan(a, b, s);
660 }
661 if (a.cls == float_class_inf || b.cls == float_class_zero) {
662 return a;
663 }
664 if (b.cls == float_class_inf || a.cls == float_class_zero) {
665 b.sign = b_sign;
666 return b;
667 }
668 }
669 g_assert_not_reached();
670}
671
672/*
673 * Returns the result of adding or subtracting the floating-point
674 * values `a' and `b'. The operation is performed according to the
675 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
676 */
677
678float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
679 float_status *status)
680{
681 FloatParts pa = float16_unpack_canonical(a, status);
682 FloatParts pb = float16_unpack_canonical(b, status);
683 FloatParts pr = addsub_floats(pa, pb, false, status);
684
685 return float16_round_pack_canonical(pr, status);
686}
687
688float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
689 float_status *status)
690{
691 FloatParts pa = float32_unpack_canonical(a, status);
692 FloatParts pb = float32_unpack_canonical(b, status);
693 FloatParts pr = addsub_floats(pa, pb, false, status);
694
695 return float32_round_pack_canonical(pr, status);
696}
697
698float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
699 float_status *status)
700{
701 FloatParts pa = float64_unpack_canonical(a, status);
702 FloatParts pb = float64_unpack_canonical(b, status);
703 FloatParts pr = addsub_floats(pa, pb, false, status);
704
705 return float64_round_pack_canonical(pr, status);
706}
707
708float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
709 float_status *status)
710{
711 FloatParts pa = float16_unpack_canonical(a, status);
712 FloatParts pb = float16_unpack_canonical(b, status);
713 FloatParts pr = addsub_floats(pa, pb, true, status);
714
715 return float16_round_pack_canonical(pr, status);
716}
717
718float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
719 float_status *status)
720{
721 FloatParts pa = float32_unpack_canonical(a, status);
722 FloatParts pb = float32_unpack_canonical(b, status);
723 FloatParts pr = addsub_floats(pa, pb, true, status);
724
725 return float32_round_pack_canonical(pr, status);
726}
727
728float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
729 float_status *status)
730{
731 FloatParts pa = float64_unpack_canonical(a, status);
732 FloatParts pb = float64_unpack_canonical(b, status);
733 FloatParts pr = addsub_floats(pa, pb, true, status);
734
735 return float64_round_pack_canonical(pr, status);
736}
737
74d707e2
AB
738/*
739 * Returns the result of multiplying the floating-point values `a' and
740 * `b'. The operation is performed according to the IEC/IEEE Standard
741 * for Binary Floating-Point Arithmetic.
742 */
743
744static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
745{
746 bool sign = a.sign ^ b.sign;
747
748 if (a.cls == float_class_normal && b.cls == float_class_normal) {
749 uint64_t hi, lo;
750 int exp = a.exp + b.exp;
751
752 mul64To128(a.frac, b.frac, &hi, &lo);
753 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
754 if (lo & DECOMPOSED_OVERFLOW_BIT) {
755 shift64RightJamming(lo, 1, &lo);
756 exp += 1;
757 }
758
759 /* Re-use a */
760 a.exp = exp;
761 a.sign = sign;
762 a.frac = lo;
763 return a;
764 }
765 /* handle all the NaN cases */
766 if (is_nan(a.cls) || is_nan(b.cls)) {
767 return pick_nan(a, b, s);
768 }
769 /* Inf * Zero == NaN */
770 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
771 (a.cls == float_class_zero && b.cls == float_class_inf)) {
772 s->float_exception_flags |= float_flag_invalid;
773 a.cls = float_class_dnan;
774 a.sign = sign;
775 return a;
776 }
777 /* Multiply by 0 or Inf */
778 if (a.cls == float_class_inf || a.cls == float_class_zero) {
779 a.sign = sign;
780 return a;
781 }
782 if (b.cls == float_class_inf || b.cls == float_class_zero) {
783 b.sign = sign;
784 return b;
785 }
786 g_assert_not_reached();
787}
788
789float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
790 float_status *status)
791{
792 FloatParts pa = float16_unpack_canonical(a, status);
793 FloatParts pb = float16_unpack_canonical(b, status);
794 FloatParts pr = mul_floats(pa, pb, status);
795
796 return float16_round_pack_canonical(pr, status);
797}
798
799float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
800 float_status *status)
801{
802 FloatParts pa = float32_unpack_canonical(a, status);
803 FloatParts pb = float32_unpack_canonical(b, status);
804 FloatParts pr = mul_floats(pa, pb, status);
805
806 return float32_round_pack_canonical(pr, status);
807}
808
809float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
810 float_status *status)
811{
812 FloatParts pa = float64_unpack_canonical(a, status);
813 FloatParts pb = float64_unpack_canonical(b, status);
814 FloatParts pr = mul_floats(pa, pb, status);
815
816 return float64_round_pack_canonical(pr, status);
817}
818
cf07323d
AB
819/*
820 * Returns the result of dividing the floating-point value `a' by the
821 * corresponding value `b'. The operation is performed according to
822 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
823 */
824
825static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
826{
827 bool sign = a.sign ^ b.sign;
828
829 if (a.cls == float_class_normal && b.cls == float_class_normal) {
830 uint64_t temp_lo, temp_hi;
831 int exp = a.exp - b.exp;
832 if (a.frac < b.frac) {
833 exp -= 1;
834 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
835 &temp_hi, &temp_lo);
836 } else {
837 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
838 &temp_hi, &temp_lo);
839 }
840 /* LSB of quot is set if inexact which roundandpack will use
841 * to set flags. Yet again we re-use a for the result */
842 a.frac = div128To64(temp_lo, temp_hi, b.frac);
843 a.sign = sign;
844 a.exp = exp;
845 return a;
846 }
847 /* handle all the NaN cases */
848 if (is_nan(a.cls) || is_nan(b.cls)) {
849 return pick_nan(a, b, s);
850 }
851 /* 0/0 or Inf/Inf */
852 if (a.cls == b.cls
853 &&
854 (a.cls == float_class_inf || a.cls == float_class_zero)) {
855 s->float_exception_flags |= float_flag_invalid;
856 a.cls = float_class_dnan;
857 return a;
858 }
859 /* Div 0 => Inf */
860 if (b.cls == float_class_zero) {
861 s->float_exception_flags |= float_flag_divbyzero;
862 a.cls = float_class_inf;
863 a.sign = sign;
864 return a;
865 }
866 /* Inf / x or 0 / x */
867 if (a.cls == float_class_inf || a.cls == float_class_zero) {
868 a.sign = sign;
869 return a;
870 }
871 /* Div by Inf */
872 if (b.cls == float_class_inf) {
873 a.cls = float_class_zero;
874 a.sign = sign;
875 return a;
876 }
877 g_assert_not_reached();
878}
879
880float16 float16_div(float16 a, float16 b, float_status *status)
881{
882 FloatParts pa = float16_unpack_canonical(a, status);
883 FloatParts pb = float16_unpack_canonical(b, status);
884 FloatParts pr = div_floats(pa, pb, status);
885
886 return float16_round_pack_canonical(pr, status);
887}
888
889float32 float32_div(float32 a, float32 b, float_status *status)
890{
891 FloatParts pa = float32_unpack_canonical(a, status);
892 FloatParts pb = float32_unpack_canonical(b, status);
893 FloatParts pr = div_floats(pa, pb, status);
894
895 return float32_round_pack_canonical(pr, status);
896}
897
898float64 float64_div(float64 a, float64 b, float_status *status)
899{
900 FloatParts pa = float64_unpack_canonical(a, status);
901 FloatParts pb = float64_unpack_canonical(b, status);
902 FloatParts pr = div_floats(pa, pb, status);
903
904 return float64_round_pack_canonical(pr, status);
905}
906
158142c2
FB
907/*----------------------------------------------------------------------------
908| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
909| and 7, and returns the properly rounded 32-bit integer corresponding to the
910| input. If `zSign' is 1, the input is negated before being converted to an
911| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
912| is simply rounded to an integer, with the inexact exception raised if the
913| input cannot be represented exactly as an integer. However, if the fixed-
914| point input is too large, the invalid exception is raised and the largest
915| positive or negative integer is returned.
916*----------------------------------------------------------------------------*/
917
f4014512 918static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
158142c2 919{
8f506c70 920 int8_t roundingMode;
158142c2 921 flag roundNearestEven;
8f506c70 922 int8_t roundIncrement, roundBits;
760e1416 923 int32_t z;
158142c2 924
a2f2d288 925 roundingMode = status->float_rounding_mode;
158142c2 926 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
927 switch (roundingMode) {
928 case float_round_nearest_even:
f9288a76 929 case float_round_ties_away:
dc355b76
PM
930 roundIncrement = 0x40;
931 break;
932 case float_round_to_zero:
933 roundIncrement = 0;
934 break;
935 case float_round_up:
936 roundIncrement = zSign ? 0 : 0x7f;
937 break;
938 case float_round_down:
939 roundIncrement = zSign ? 0x7f : 0;
940 break;
941 default:
942 abort();
158142c2
FB
943 }
944 roundBits = absZ & 0x7F;
945 absZ = ( absZ + roundIncrement )>>7;
946 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
947 z = absZ;
948 if ( zSign ) z = - z;
949 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
ff32e16e 950 float_raise(float_flag_invalid, status);
bb98fe42 951 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2 952 }
a2f2d288
PM
953 if (roundBits) {
954 status->float_exception_flags |= float_flag_inexact;
955 }
158142c2
FB
956 return z;
957
958}
959
960/*----------------------------------------------------------------------------
961| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
962| `absZ1', with binary point between bits 63 and 64 (between the input words),
963| and returns the properly rounded 64-bit integer corresponding to the input.
964| If `zSign' is 1, the input is negated before being converted to an integer.
965| Ordinarily, the fixed-point input is simply rounded to an integer, with
966| the inexact exception raised if the input cannot be represented exactly as
967| an integer. However, if the fixed-point input is too large, the invalid
968| exception is raised and the largest positive or negative integer is
969| returned.
970*----------------------------------------------------------------------------*/
971
f42c2224 972static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
e5a41ffa 973 float_status *status)
158142c2 974{
8f506c70 975 int8_t roundingMode;
158142c2 976 flag roundNearestEven, increment;
760e1416 977 int64_t z;
158142c2 978
a2f2d288 979 roundingMode = status->float_rounding_mode;
158142c2 980 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
981 switch (roundingMode) {
982 case float_round_nearest_even:
f9288a76 983 case float_round_ties_away:
dc355b76
PM
984 increment = ((int64_t) absZ1 < 0);
985 break;
986 case float_round_to_zero:
987 increment = 0;
988 break;
989 case float_round_up:
990 increment = !zSign && absZ1;
991 break;
992 case float_round_down:
993 increment = zSign && absZ1;
994 break;
995 default:
996 abort();
158142c2
FB
997 }
998 if ( increment ) {
999 ++absZ0;
1000 if ( absZ0 == 0 ) goto overflow;
bb98fe42 1001 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
1002 }
1003 z = absZ0;
1004 if ( zSign ) z = - z;
1005 if ( z && ( ( z < 0 ) ^ zSign ) ) {
1006 overflow:
ff32e16e 1007 float_raise(float_flag_invalid, status);
158142c2 1008 return
bb98fe42 1009 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
1010 : LIT64( 0x7FFFFFFFFFFFFFFF );
1011 }
a2f2d288
PM
1012 if (absZ1) {
1013 status->float_exception_flags |= float_flag_inexact;
1014 }
158142c2
FB
1015 return z;
1016
1017}
1018
fb3ea83a
TM
1019/*----------------------------------------------------------------------------
1020| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1021| `absZ1', with binary point between bits 63 and 64 (between the input words),
1022| and returns the properly rounded 64-bit unsigned integer corresponding to the
1023| input. Ordinarily, the fixed-point input is simply rounded to an integer,
1024| with the inexact exception raised if the input cannot be represented exactly
1025| as an integer. However, if the fixed-point input is too large, the invalid
1026| exception is raised and the largest unsigned integer is returned.
1027*----------------------------------------------------------------------------*/
1028
f42c2224 1029static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
e5a41ffa 1030 uint64_t absZ1, float_status *status)
fb3ea83a 1031{
8f506c70 1032 int8_t roundingMode;
fb3ea83a
TM
1033 flag roundNearestEven, increment;
1034
a2f2d288 1035 roundingMode = status->float_rounding_mode;
fb3ea83a 1036 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
1037 switch (roundingMode) {
1038 case float_round_nearest_even:
f9288a76 1039 case float_round_ties_away:
dc355b76
PM
1040 increment = ((int64_t)absZ1 < 0);
1041 break;
1042 case float_round_to_zero:
1043 increment = 0;
1044 break;
1045 case float_round_up:
1046 increment = !zSign && absZ1;
1047 break;
1048 case float_round_down:
1049 increment = zSign && absZ1;
1050 break;
1051 default:
1052 abort();
fb3ea83a
TM
1053 }
1054 if (increment) {
1055 ++absZ0;
1056 if (absZ0 == 0) {
ff32e16e 1057 float_raise(float_flag_invalid, status);
fb3ea83a
TM
1058 return LIT64(0xFFFFFFFFFFFFFFFF);
1059 }
1060 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
1061 }
1062
1063 if (zSign && absZ0) {
ff32e16e 1064 float_raise(float_flag_invalid, status);
fb3ea83a
TM
1065 return 0;
1066 }
1067
1068 if (absZ1) {
a2f2d288 1069 status->float_exception_flags |= float_flag_inexact;
fb3ea83a
TM
1070 }
1071 return absZ0;
1072}
1073
37d18660
PM
1074/*----------------------------------------------------------------------------
1075| If `a' is denormal and we are in flush-to-zero mode then set the
1076| input-denormal exception and return zero. Otherwise just return the value.
1077*----------------------------------------------------------------------------*/
e5a41ffa 1078float32 float32_squash_input_denormal(float32 a, float_status *status)
37d18660 1079{
a2f2d288 1080 if (status->flush_inputs_to_zero) {
37d18660 1081 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
ff32e16e 1082 float_raise(float_flag_input_denormal, status);
37d18660
PM
1083 return make_float32(float32_val(a) & 0x80000000);
1084 }
1085 }
1086 return a;
1087}
1088
158142c2
FB
1089/*----------------------------------------------------------------------------
1090| Normalizes the subnormal single-precision floating-point value represented
1091| by the denormalized significand `aSig'. The normalized exponent and
1092| significand are stored at the locations pointed to by `zExpPtr' and
1093| `zSigPtr', respectively.
1094*----------------------------------------------------------------------------*/
1095
static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Shift so that exactly 8 leading zero bits remain above the top set bit. */
    int8_t shift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
1106
1107/*----------------------------------------------------------------------------
1108| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1109| single-precision floating-point value, returning the result. After being
1110| shifted into the proper positions, the three fields are simply added
1111| together to form the result. This means that any integer portion of `zSig'
1112| will be added into the exponent. Since a properly normalized significand
1113| will have an integer portion equal to 1, the `zExp' input should be 1 less
1114| than the desired result exponent whenever `zSig' is a complete, normalized
1115| significand.
1116*----------------------------------------------------------------------------*/
1117
0c48262d 1118static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
158142c2
FB
1119{
1120
f090c9d4 1121 return make_float32(
bb98fe42 1122 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
1123
1124}
1125
1126/*----------------------------------------------------------------------------
1127| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1128| and significand `zSig', and returns the proper single-precision floating-
1129| point value corresponding to the abstract input. Ordinarily, the abstract
1130| value is simply rounded and packed into the single-precision format, with
1131| the inexact exception raised if the abstract input cannot be represented
1132| exactly. However, if the abstract value is too large, the overflow and
1133| inexact exceptions are raised and an infinity or maximal finite value is
1134| returned. If the abstract value is too small, the input value is rounded to
1135| a subnormal number, and the underflow and inexact exceptions are raised if
1136| the abstract input cannot be represented exactly as a subnormal single-
1137| precision floating-point number.
1138| The input significand `zSig' has its binary point between bits 30
1139| and 29, which is 7 bits to the left of the usual location. This shifted
1140| significand must be normalized or smaller. If `zSig' is not normalized,
1141| `zExp' must be 0; in that case, the result returned is a subnormal number,
1142| and it must not require rounding. In the usual case that `zSig' is
1143| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1144| The handling of underflow and overflow follows the IEC/IEEE Standard for
1145| Binary Floating-Point Arithmetic.
1146*----------------------------------------------------------------------------*/
1147
0c48262d 1148static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 1149 float_status *status)
158142c2 1150{
8f506c70 1151 int8_t roundingMode;
158142c2 1152 flag roundNearestEven;
8f506c70 1153 int8_t roundIncrement, roundBits;
158142c2
FB
1154 flag isTiny;
1155
a2f2d288 1156 roundingMode = status->float_rounding_mode;
158142c2 1157 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1158 switch (roundingMode) {
1159 case float_round_nearest_even:
f9288a76 1160 case float_round_ties_away:
dc355b76
PM
1161 roundIncrement = 0x40;
1162 break;
1163 case float_round_to_zero:
1164 roundIncrement = 0;
1165 break;
1166 case float_round_up:
1167 roundIncrement = zSign ? 0 : 0x7f;
1168 break;
1169 case float_round_down:
1170 roundIncrement = zSign ? 0x7f : 0;
1171 break;
1172 default:
1173 abort();
1174 break;
158142c2
FB
1175 }
1176 roundBits = zSig & 0x7F;
bb98fe42 1177 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
1178 if ( ( 0xFD < zExp )
1179 || ( ( zExp == 0xFD )
bb98fe42 1180 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 1181 ) {
ff32e16e 1182 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 1183 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
1184 }
1185 if ( zExp < 0 ) {
a2f2d288 1186 if (status->flush_to_zero) {
ff32e16e 1187 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1188 return packFloat32(zSign, 0, 0);
1189 }
158142c2 1190 isTiny =
a2f2d288
PM
1191 (status->float_detect_tininess
1192 == float_tininess_before_rounding)
158142c2
FB
1193 || ( zExp < -1 )
1194 || ( zSig + roundIncrement < 0x80000000 );
1195 shift32RightJamming( zSig, - zExp, &zSig );
1196 zExp = 0;
1197 roundBits = zSig & 0x7F;
ff32e16e
PM
1198 if (isTiny && roundBits) {
1199 float_raise(float_flag_underflow, status);
1200 }
158142c2
FB
1201 }
1202 }
a2f2d288
PM
1203 if (roundBits) {
1204 status->float_exception_flags |= float_flag_inexact;
1205 }
158142c2
FB
1206 zSig = ( zSig + roundIncrement )>>7;
1207 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1208 if ( zSig == 0 ) zExp = 0;
1209 return packFloat32( zSign, zExp, zSig );
1210
1211}
1212
1213/*----------------------------------------------------------------------------
1214| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1215| and significand `zSig', and returns the proper single-precision floating-
1216| point value corresponding to the abstract input. This routine is just like
1217| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
1218| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1219| floating-point exponent.
1220*----------------------------------------------------------------------------*/
1221
1222static float32
0c48262d 1223 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 1224 float_status *status)
158142c2 1225{
8f506c70 1226 int8_t shiftCount;
158142c2
FB
1227
1228 shiftCount = countLeadingZeros32( zSig ) - 1;
ff32e16e
PM
1229 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
1230 status);
158142c2
FB
1231
1232}
1233
37d18660
PM
1234/*----------------------------------------------------------------------------
1235| If `a' is denormal and we are in flush-to-zero mode then set the
1236| input-denormal exception and return zero. Otherwise just return the value.
1237*----------------------------------------------------------------------------*/
e5a41ffa 1238float64 float64_squash_input_denormal(float64 a, float_status *status)
37d18660 1239{
a2f2d288 1240 if (status->flush_inputs_to_zero) {
37d18660 1241 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
ff32e16e 1242 float_raise(float_flag_input_denormal, status);
37d18660
PM
1243 return make_float64(float64_val(a) & (1ULL << 63));
1244 }
1245 }
1246 return a;
1247}
1248
158142c2
FB
1249/*----------------------------------------------------------------------------
1250| Normalizes the subnormal double-precision floating-point value represented
1251| by the denormalized significand `aSig'. The normalized exponent and
1252| significand are stored at the locations pointed to by `zExpPtr' and
1253| `zSigPtr', respectively.
1254*----------------------------------------------------------------------------*/
1255
static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Shift so that exactly 11 leading zero bits remain above the top set bit. */
    int8_t shift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
1266
1267/*----------------------------------------------------------------------------
1268| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1269| double-precision floating-point value, returning the result. After being
1270| shifted into the proper positions, the three fields are simply added
1271| together to form the result. This means that any integer portion of `zSig'
1272| will be added into the exponent. Since a properly normalized significand
1273| will have an integer portion equal to 1, the `zExp' input should be 1 less
1274| than the desired result exponent whenever `zSig' is a complete, normalized
1275| significand.
1276*----------------------------------------------------------------------------*/
1277
0c48262d 1278static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
158142c2
FB
1279{
1280
f090c9d4 1281 return make_float64(
bb98fe42 1282 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
1283
1284}
1285
1286/*----------------------------------------------------------------------------
1287| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1288| and significand `zSig', and returns the proper double-precision floating-
1289| point value corresponding to the abstract input. Ordinarily, the abstract
1290| value is simply rounded and packed into the double-precision format, with
1291| the inexact exception raised if the abstract input cannot be represented
1292| exactly. However, if the abstract value is too large, the overflow and
1293| inexact exceptions are raised and an infinity or maximal finite value is
a7d1ac78
PM
1294| returned. If the abstract value is too small, the input value is rounded to
1295| a subnormal number, and the underflow and inexact exceptions are raised if
1296| the abstract input cannot be represented exactly as a subnormal double-
158142c2
FB
1297| precision floating-point number.
1298| The input significand `zSig' has its binary point between bits 62
1299| and 61, which is 10 bits to the left of the usual location. This shifted
1300| significand must be normalized or smaller. If `zSig' is not normalized,
1301| `zExp' must be 0; in that case, the result returned is a subnormal number,
1302| and it must not require rounding. In the usual case that `zSig' is
1303| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1304| The handling of underflow and overflow follows the IEC/IEEE Standard for
1305| Binary Floating-Point Arithmetic.
1306*----------------------------------------------------------------------------*/
1307
0c48262d 1308static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 1309 float_status *status)
158142c2 1310{
8f506c70 1311 int8_t roundingMode;
158142c2 1312 flag roundNearestEven;
0c48262d 1313 int roundIncrement, roundBits;
158142c2
FB
1314 flag isTiny;
1315
a2f2d288 1316 roundingMode = status->float_rounding_mode;
158142c2 1317 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1318 switch (roundingMode) {
1319 case float_round_nearest_even:
f9288a76 1320 case float_round_ties_away:
dc355b76
PM
1321 roundIncrement = 0x200;
1322 break;
1323 case float_round_to_zero:
1324 roundIncrement = 0;
1325 break;
1326 case float_round_up:
1327 roundIncrement = zSign ? 0 : 0x3ff;
1328 break;
1329 case float_round_down:
1330 roundIncrement = zSign ? 0x3ff : 0;
1331 break;
9ee6f678
BR
1332 case float_round_to_odd:
1333 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1334 break;
dc355b76
PM
1335 default:
1336 abort();
158142c2
FB
1337 }
1338 roundBits = zSig & 0x3FF;
bb98fe42 1339 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
1340 if ( ( 0x7FD < zExp )
1341 || ( ( zExp == 0x7FD )
bb98fe42 1342 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 1343 ) {
9ee6f678
BR
1344 bool overflow_to_inf = roundingMode != float_round_to_odd &&
1345 roundIncrement != 0;
ff32e16e 1346 float_raise(float_flag_overflow | float_flag_inexact, status);
9ee6f678 1347 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
158142c2
FB
1348 }
1349 if ( zExp < 0 ) {
a2f2d288 1350 if (status->flush_to_zero) {
ff32e16e 1351 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1352 return packFloat64(zSign, 0, 0);
1353 }
158142c2 1354 isTiny =
a2f2d288
PM
1355 (status->float_detect_tininess
1356 == float_tininess_before_rounding)
158142c2
FB
1357 || ( zExp < -1 )
1358 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
1359 shift64RightJamming( zSig, - zExp, &zSig );
1360 zExp = 0;
1361 roundBits = zSig & 0x3FF;
ff32e16e
PM
1362 if (isTiny && roundBits) {
1363 float_raise(float_flag_underflow, status);
1364 }
9ee6f678
BR
1365 if (roundingMode == float_round_to_odd) {
1366 /*
1367 * For round-to-odd case, the roundIncrement depends on
1368 * zSig which just changed.
1369 */
1370 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1371 }
158142c2
FB
1372 }
1373 }
a2f2d288
PM
1374 if (roundBits) {
1375 status->float_exception_flags |= float_flag_inexact;
1376 }
158142c2
FB
1377 zSig = ( zSig + roundIncrement )>>10;
1378 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
1379 if ( zSig == 0 ) zExp = 0;
1380 return packFloat64( zSign, zExp, zSig );
1381
1382}
1383
1384/*----------------------------------------------------------------------------
1385| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1386| and significand `zSig', and returns the proper double-precision floating-
1387| point value corresponding to the abstract input. This routine is just like
1388| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
1389| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1390| floating-point exponent.
1391*----------------------------------------------------------------------------*/
1392
1393static float64
0c48262d 1394 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 1395 float_status *status)
158142c2 1396{
8f506c70 1397 int8_t shiftCount;
158142c2
FB
1398
1399 shiftCount = countLeadingZeros64( zSig ) - 1;
ff32e16e
PM
1400 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
1401 status);
158142c2
FB
1402
1403}
1404
158142c2
FB
1405/*----------------------------------------------------------------------------
1406| Returns the fraction bits of the extended double-precision floating-point
1407| value `a'.
1408*----------------------------------------------------------------------------*/
1409
a49db98d 1410static inline uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
1411{
1412
1413 return a.low;
1414
1415}
1416
1417/*----------------------------------------------------------------------------
1418| Returns the exponent bits of the extended double-precision floating-point
1419| value `a'.
1420*----------------------------------------------------------------------------*/
1421
f4014512 1422static inline int32_t extractFloatx80Exp( floatx80 a )
158142c2
FB
1423{
1424
1425 return a.high & 0x7FFF;
1426
1427}
1428
1429/*----------------------------------------------------------------------------
1430| Returns the sign bit of the extended double-precision floating-point value
1431| `a'.
1432*----------------------------------------------------------------------------*/
1433
a49db98d 1434static inline flag extractFloatx80Sign( floatx80 a )
158142c2
FB
1435{
1436
1437 return a.high>>15;
1438
1439}
1440
1441/*----------------------------------------------------------------------------
1442| Normalizes the subnormal extended double-precision floating-point value
1443| represented by the denormalized significand `aSig'. The normalized exponent
1444| and significand are stored at the locations pointed to by `zExpPtr' and
1445| `zSigPtr', respectively.
1446*----------------------------------------------------------------------------*/
1447
static void
normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr)
{
    /* Shift the top set bit of the significand up to bit 63. */
    int8_t shift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
1458
1459/*----------------------------------------------------------------------------
1460| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
1461| extended double-precision floating-point value, returning the result.
1462*----------------------------------------------------------------------------*/
1463
f4014512 1464static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
158142c2
FB
1465{
1466 floatx80 z;
1467
1468 z.low = zSig;
bb98fe42 1469 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
1470 return z;
1471
1472}
1473
1474/*----------------------------------------------------------------------------
1475| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1476| and extended significand formed by the concatenation of `zSig0' and `zSig1',
1477| and returns the proper extended double-precision floating-point value
1478| corresponding to the abstract input. Ordinarily, the abstract value is
1479| rounded and packed into the extended double-precision format, with the
1480| inexact exception raised if the abstract input cannot be represented
1481| exactly. However, if the abstract value is too large, the overflow and
1482| inexact exceptions are raised and an infinity or maximal finite value is
1483| returned. If the abstract value is too small, the input value is rounded to
1484| a subnormal number, and the underflow and inexact exceptions are raised if
1485| the abstract input cannot be represented exactly as a subnormal extended
1486| double-precision floating-point number.
1487| If `roundingPrecision' is 32 or 64, the result is rounded to the same
1488| number of bits as single or double precision, respectively. Otherwise, the
1489| result is rounded to the full precision of the extended double-precision
1490| format.
1491| The input significand must be normalized or smaller. If the input
1492| significand is not normalized, `zExp' must be 0; in that case, the result
1493| returned is a subnormal number, and it must not require rounding. The
1494| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
1495| Floating-Point Arithmetic.
1496*----------------------------------------------------------------------------*/
1497
8f506c70 1498static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
f4014512 1499 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
e5a41ffa 1500 float_status *status)
158142c2 1501{
8f506c70 1502 int8_t roundingMode;
158142c2 1503 flag roundNearestEven, increment, isTiny;
f42c2224 1504 int64_t roundIncrement, roundMask, roundBits;
158142c2 1505
a2f2d288 1506 roundingMode = status->float_rounding_mode;
158142c2
FB
1507 roundNearestEven = ( roundingMode == float_round_nearest_even );
1508 if ( roundingPrecision == 80 ) goto precision80;
1509 if ( roundingPrecision == 64 ) {
1510 roundIncrement = LIT64( 0x0000000000000400 );
1511 roundMask = LIT64( 0x00000000000007FF );
1512 }
1513 else if ( roundingPrecision == 32 ) {
1514 roundIncrement = LIT64( 0x0000008000000000 );
1515 roundMask = LIT64( 0x000000FFFFFFFFFF );
1516 }
1517 else {
1518 goto precision80;
1519 }
1520 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
1521 switch (roundingMode) {
1522 case float_round_nearest_even:
f9288a76 1523 case float_round_ties_away:
dc355b76
PM
1524 break;
1525 case float_round_to_zero:
1526 roundIncrement = 0;
1527 break;
1528 case float_round_up:
1529 roundIncrement = zSign ? 0 : roundMask;
1530 break;
1531 case float_round_down:
1532 roundIncrement = zSign ? roundMask : 0;
1533 break;
1534 default:
1535 abort();
158142c2
FB
1536 }
1537 roundBits = zSig0 & roundMask;
bb98fe42 1538 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
1539 if ( ( 0x7FFE < zExp )
1540 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
1541 ) {
1542 goto overflow;
1543 }
1544 if ( zExp <= 0 ) {
a2f2d288 1545 if (status->flush_to_zero) {
ff32e16e 1546 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1547 return packFloatx80(zSign, 0, 0);
1548 }
158142c2 1549 isTiny =
a2f2d288
PM
1550 (status->float_detect_tininess
1551 == float_tininess_before_rounding)
158142c2
FB
1552 || ( zExp < 0 )
1553 || ( zSig0 <= zSig0 + roundIncrement );
1554 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
1555 zExp = 0;
1556 roundBits = zSig0 & roundMask;
ff32e16e
PM
1557 if (isTiny && roundBits) {
1558 float_raise(float_flag_underflow, status);
1559 }
a2f2d288
PM
1560 if (roundBits) {
1561 status->float_exception_flags |= float_flag_inexact;
1562 }
158142c2 1563 zSig0 += roundIncrement;
bb98fe42 1564 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
1565 roundIncrement = roundMask + 1;
1566 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1567 roundMask |= roundIncrement;
1568 }
1569 zSig0 &= ~ roundMask;
1570 return packFloatx80( zSign, zExp, zSig0 );
1571 }
1572 }
a2f2d288
PM
1573 if (roundBits) {
1574 status->float_exception_flags |= float_flag_inexact;
1575 }
158142c2
FB
1576 zSig0 += roundIncrement;
1577 if ( zSig0 < roundIncrement ) {
1578 ++zExp;
1579 zSig0 = LIT64( 0x8000000000000000 );
1580 }
1581 roundIncrement = roundMask + 1;
1582 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1583 roundMask |= roundIncrement;
1584 }
1585 zSig0 &= ~ roundMask;
1586 if ( zSig0 == 0 ) zExp = 0;
1587 return packFloatx80( zSign, zExp, zSig0 );
1588 precision80:
dc355b76
PM
1589 switch (roundingMode) {
1590 case float_round_nearest_even:
f9288a76 1591 case float_round_ties_away:
dc355b76
PM
1592 increment = ((int64_t)zSig1 < 0);
1593 break;
1594 case float_round_to_zero:
1595 increment = 0;
1596 break;
1597 case float_round_up:
1598 increment = !zSign && zSig1;
1599 break;
1600 case float_round_down:
1601 increment = zSign && zSig1;
1602 break;
1603 default:
1604 abort();
158142c2 1605 }
bb98fe42 1606 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
1607 if ( ( 0x7FFE < zExp )
1608 || ( ( zExp == 0x7FFE )
1609 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
1610 && increment
1611 )
1612 ) {
1613 roundMask = 0;
1614 overflow:
ff32e16e 1615 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
1616 if ( ( roundingMode == float_round_to_zero )
1617 || ( zSign && ( roundingMode == float_round_up ) )
1618 || ( ! zSign && ( roundingMode == float_round_down ) )
1619 ) {
1620 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
1621 }
1622 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1623 }
1624 if ( zExp <= 0 ) {
1625 isTiny =
a2f2d288
PM
1626 (status->float_detect_tininess
1627 == float_tininess_before_rounding)
158142c2
FB
1628 || ( zExp < 0 )
1629 || ! increment
1630 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
1631 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
1632 zExp = 0;
ff32e16e
PM
1633 if (isTiny && zSig1) {
1634 float_raise(float_flag_underflow, status);
1635 }
a2f2d288
PM
1636 if (zSig1) {
1637 status->float_exception_flags |= float_flag_inexact;
1638 }
dc355b76
PM
1639 switch (roundingMode) {
1640 case float_round_nearest_even:
f9288a76 1641 case float_round_ties_away:
dc355b76
PM
1642 increment = ((int64_t)zSig1 < 0);
1643 break;
1644 case float_round_to_zero:
1645 increment = 0;
1646 break;
1647 case float_round_up:
1648 increment = !zSign && zSig1;
1649 break;
1650 case float_round_down:
1651 increment = zSign && zSig1;
1652 break;
1653 default:
1654 abort();
158142c2
FB
1655 }
1656 if ( increment ) {
1657 ++zSig0;
1658 zSig0 &=
bb98fe42
AF
1659 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
1660 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
1661 }
1662 return packFloatx80( zSign, zExp, zSig0 );
1663 }
1664 }
a2f2d288
PM
1665 if (zSig1) {
1666 status->float_exception_flags |= float_flag_inexact;
1667 }
158142c2
FB
1668 if ( increment ) {
1669 ++zSig0;
1670 if ( zSig0 == 0 ) {
1671 ++zExp;
1672 zSig0 = LIT64( 0x8000000000000000 );
1673 }
1674 else {
bb98fe42 1675 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
1676 }
1677 }
1678 else {
1679 if ( zSig0 == 0 ) zExp = 0;
1680 }
1681 return packFloatx80( zSign, zExp, zSig0 );
1682
1683}
1684
1685/*----------------------------------------------------------------------------
1686| Takes an abstract floating-point value having sign `zSign', exponent
1687| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
1688| and returns the proper extended double-precision floating-point value
1689| corresponding to the abstract input. This routine is just like
1690| `roundAndPackFloatx80' except that the input significand does not have to be
1691| normalized.
1692*----------------------------------------------------------------------------*/
1693
8f506c70 1694static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
f4014512 1695 flag zSign, int32_t zExp,
e5a41ffa
PM
1696 uint64_t zSig0, uint64_t zSig1,
1697 float_status *status)
158142c2 1698{
8f506c70 1699 int8_t shiftCount;
158142c2
FB
1700
1701 if ( zSig0 == 0 ) {
1702 zSig0 = zSig1;
1703 zSig1 = 0;
1704 zExp -= 64;
1705 }
1706 shiftCount = countLeadingZeros64( zSig0 );
1707 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1708 zExp -= shiftCount;
ff32e16e
PM
1709 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
1710 zSig0, zSig1, status);
158142c2
FB
1711
1712}
1713
158142c2
FB
1714/*----------------------------------------------------------------------------
1715| Returns the least-significant 64 fraction bits of the quadruple-precision
1716| floating-point value `a'.
1717*----------------------------------------------------------------------------*/
1718
a49db98d 1719static inline uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
1720{
1721
1722 return a.low;
1723
1724}
1725
1726/*----------------------------------------------------------------------------
1727| Returns the most-significant 48 fraction bits of the quadruple-precision
1728| floating-point value `a'.
1729*----------------------------------------------------------------------------*/
1730
a49db98d 1731static inline uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
1732{
1733
1734 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1735
1736}
1737
1738/*----------------------------------------------------------------------------
1739| Returns the exponent bits of the quadruple-precision floating-point value
1740| `a'.
1741*----------------------------------------------------------------------------*/
1742
f4014512 1743static inline int32_t extractFloat128Exp( float128 a )
158142c2
FB
1744{
1745
1746 return ( a.high>>48 ) & 0x7FFF;
1747
1748}
1749
1750/*----------------------------------------------------------------------------
1751| Returns the sign bit of the quadruple-precision floating-point value `a'.
1752*----------------------------------------------------------------------------*/
1753
a49db98d 1754static inline flag extractFloat128Sign( float128 a )
158142c2
FB
1755{
1756
1757 return a.high>>63;
1758
1759}
1760
1761/*----------------------------------------------------------------------------
1762| Normalizes the subnormal quadruple-precision floating-point value
1763| represented by the denormalized significand formed by the concatenation of
1764| `aSig0' and `aSig1'. The normalized exponent is stored at the location
1765| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1766| significand are stored at the location pointed to by `zSig0Ptr', and the
1767| least significant 64 bits of the normalized significand are stored at the
1768| location pointed to by `zSig1Ptr'.
1769*----------------------------------------------------------------------------*/
1770
/*
 * Normalize the subnormal significand aSig0:aSig1.
 *
 * Writes the shifted significand through zSig0Ptr/zSig1Ptr and the matching
 * exponent through zExpPtr.  The shift amount is chosen so that the leading
 * one ends up 15 bits below the top of the high word.
 */
static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr,
                                       uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading one is in the high word: a short 128-bit left shift. */
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word is empty: all significant bits come from aSig1. */
    shift = countLeadingZeros64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Leading one sits too high: split aSig1 across both words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
1801
1802/*----------------------------------------------------------------------------
1803| Packs the sign `zSign', the exponent `zExp', and the significand formed
1804| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1805| floating-point value, returning the result. After being shifted into the
1806| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1807| added together to form the most significant 64 bits of the result. This
1808| means that any integer portion of `zSig0' will be added into the exponent.
1809| Since a properly normalized significand will have an integer portion equal
1810| to 1, the `zExp' input should be 1 less than the desired result exponent
1811| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1812| significand.
1813*----------------------------------------------------------------------------*/
1814
a49db98d 1815static inline float128
f4014512 1816 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
1817{
1818 float128 z;
1819
1820 z.low = zSig1;
bb98fe42 1821 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
1822 return z;
1823
1824}
1825
1826/*----------------------------------------------------------------------------
1827| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1828| and extended significand formed by the concatenation of `zSig0', `zSig1',
1829| and `zSig2', and returns the proper quadruple-precision floating-point value
1830| corresponding to the abstract input. Ordinarily, the abstract value is
1831| simply rounded and packed into the quadruple-precision format, with the
1832| inexact exception raised if the abstract input cannot be represented
1833| exactly. However, if the abstract value is too large, the overflow and
1834| inexact exceptions are raised and an infinity or maximal finite value is
1835| returned. If the abstract value is too small, the input value is rounded to
1836| a subnormal number, and the underflow and inexact exceptions are raised if
1837| the abstract input cannot be represented exactly as a subnormal quadruple-
1838| precision floating-point number.
1839| The input significand must be normalized or smaller. If the input
1840| significand is not normalized, `zExp' must be 0; in that case, the result
1841| returned is a subnormal number, and it must not require rounding. In the
1842| usual case that the input significand is normalized, `zExp' must be 1 less
1843| than the ``true'' floating-point exponent. The handling of underflow and
1844| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1845*----------------------------------------------------------------------------*/
1846
f4014512 1847static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
1848 uint64_t zSig0, uint64_t zSig1,
1849 uint64_t zSig2, float_status *status)
158142c2 1850{
8f506c70 1851 int8_t roundingMode;
158142c2
FB
1852 flag roundNearestEven, increment, isTiny;
1853
a2f2d288 1854 roundingMode = status->float_rounding_mode;
158142c2 1855 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1856 switch (roundingMode) {
1857 case float_round_nearest_even:
f9288a76 1858 case float_round_ties_away:
dc355b76
PM
1859 increment = ((int64_t)zSig2 < 0);
1860 break;
1861 case float_round_to_zero:
1862 increment = 0;
1863 break;
1864 case float_round_up:
1865 increment = !zSign && zSig2;
1866 break;
1867 case float_round_down:
1868 increment = zSign && zSig2;
1869 break;
9ee6f678
BR
1870 case float_round_to_odd:
1871 increment = !(zSig1 & 0x1) && zSig2;
1872 break;
dc355b76
PM
1873 default:
1874 abort();
158142c2 1875 }
bb98fe42 1876 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1877 if ( ( 0x7FFD < zExp )
1878 || ( ( zExp == 0x7FFD )
1879 && eq128(
1880 LIT64( 0x0001FFFFFFFFFFFF ),
1881 LIT64( 0xFFFFFFFFFFFFFFFF ),
1882 zSig0,
1883 zSig1
1884 )
1885 && increment
1886 )
1887 ) {
ff32e16e 1888 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
1889 if ( ( roundingMode == float_round_to_zero )
1890 || ( zSign && ( roundingMode == float_round_up ) )
1891 || ( ! zSign && ( roundingMode == float_round_down ) )
9ee6f678 1892 || (roundingMode == float_round_to_odd)
158142c2
FB
1893 ) {
1894 return
1895 packFloat128(
1896 zSign,
1897 0x7FFE,
1898 LIT64( 0x0000FFFFFFFFFFFF ),
1899 LIT64( 0xFFFFFFFFFFFFFFFF )
1900 );
1901 }
1902 return packFloat128( zSign, 0x7FFF, 0, 0 );
1903 }
1904 if ( zExp < 0 ) {
a2f2d288 1905 if (status->flush_to_zero) {
ff32e16e 1906 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
1907 return packFloat128(zSign, 0, 0, 0);
1908 }
158142c2 1909 isTiny =
a2f2d288
PM
1910 (status->float_detect_tininess
1911 == float_tininess_before_rounding)
158142c2
FB
1912 || ( zExp < -1 )
1913 || ! increment
1914 || lt128(
1915 zSig0,
1916 zSig1,
1917 LIT64( 0x0001FFFFFFFFFFFF ),
1918 LIT64( 0xFFFFFFFFFFFFFFFF )
1919 );
1920 shift128ExtraRightJamming(
1921 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1922 zExp = 0;
ff32e16e
PM
1923 if (isTiny && zSig2) {
1924 float_raise(float_flag_underflow, status);
1925 }
dc355b76
PM
1926 switch (roundingMode) {
1927 case float_round_nearest_even:
f9288a76 1928 case float_round_ties_away:
dc355b76
PM
1929 increment = ((int64_t)zSig2 < 0);
1930 break;
1931 case float_round_to_zero:
1932 increment = 0;
1933 break;
1934 case float_round_up:
1935 increment = !zSign && zSig2;
1936 break;
1937 case float_round_down:
1938 increment = zSign && zSig2;
1939 break;
9ee6f678
BR
1940 case float_round_to_odd:
1941 increment = !(zSig1 & 0x1) && zSig2;
1942 break;
dc355b76
PM
1943 default:
1944 abort();
158142c2
FB
1945 }
1946 }
1947 }
a2f2d288
PM
1948 if (zSig2) {
1949 status->float_exception_flags |= float_flag_inexact;
1950 }
158142c2
FB
1951 if ( increment ) {
1952 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1953 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1954 }
1955 else {
1956 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1957 }
1958 return packFloat128( zSign, zExp, zSig0, zSig1 );
1959
1960}
1961
1962/*----------------------------------------------------------------------------
1963| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1964| and significand formed by the concatenation of `zSig0' and `zSig1', and
1965| returns the proper quadruple-precision floating-point value corresponding
1966| to the abstract input. This routine is just like `roundAndPackFloat128'
1967| except that the input significand has fewer bits and does not have to be
1968| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1969| point exponent.
1970*----------------------------------------------------------------------------*/
1971
f4014512 1972static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
1973 uint64_t zSig0, uint64_t zSig1,
1974 float_status *status)
158142c2 1975{
8f506c70 1976 int8_t shiftCount;
bb98fe42 1977 uint64_t zSig2;
158142c2
FB
1978
1979 if ( zSig0 == 0 ) {
1980 zSig0 = zSig1;
1981 zSig1 = 0;
1982 zExp -= 64;
1983 }
1984 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1985 if ( 0 <= shiftCount ) {
1986 zSig2 = 0;
1987 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1988 }
1989 else {
1990 shift128ExtraRightJamming(
1991 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1992 }
1993 zExp -= shiftCount;
ff32e16e 1994 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
1995
1996}
1997
158142c2
FB
1998/*----------------------------------------------------------------------------
1999| Returns the result of converting the 32-bit two's complement integer `a'
2000| to the single-precision floating-point format. The conversion is performed
2001| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2002*----------------------------------------------------------------------------*/
2003
e5a41ffa 2004float32 int32_to_float32(int32_t a, float_status *status)
158142c2
FB
2005{
2006 flag zSign;
2007
f090c9d4 2008 if ( a == 0 ) return float32_zero;
bb98fe42 2009 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2 2010 zSign = ( a < 0 );
ff32e16e 2011 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
158142c2
FB
2012}
2013
2014/*----------------------------------------------------------------------------
2015| Returns the result of converting the 32-bit two's complement integer `a'
2016| to the double-precision floating-point format. The conversion is performed
2017| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2018*----------------------------------------------------------------------------*/
2019
e5a41ffa 2020float64 int32_to_float64(int32_t a, float_status *status)
158142c2
FB
2021{
2022 flag zSign;
3a87d009 2023 uint32_t absA;
8f506c70 2024 int8_t shiftCount;
bb98fe42 2025 uint64_t zSig;
158142c2 2026
f090c9d4 2027 if ( a == 0 ) return float64_zero;
158142c2
FB
2028 zSign = ( a < 0 );
2029 absA = zSign ? - a : a;
2030 shiftCount = countLeadingZeros32( absA ) + 21;
2031 zSig = absA;
2032 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
2033
2034}
2035
158142c2
FB
2036/*----------------------------------------------------------------------------
2037| Returns the result of converting the 32-bit two's complement integer `a'
2038| to the extended double-precision floating-point format. The conversion
2039| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2040| Arithmetic.
2041*----------------------------------------------------------------------------*/
2042
e5a41ffa 2043floatx80 int32_to_floatx80(int32_t a, float_status *status)
158142c2
FB
2044{
2045 flag zSign;
3a87d009 2046 uint32_t absA;
8f506c70 2047 int8_t shiftCount;
bb98fe42 2048 uint64_t zSig;
158142c2
FB
2049
2050 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2051 zSign = ( a < 0 );
2052 absA = zSign ? - a : a;
2053 shiftCount = countLeadingZeros32( absA ) + 32;
2054 zSig = absA;
2055 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
2056
2057}
2058
158142c2
FB
2059/*----------------------------------------------------------------------------
2060| Returns the result of converting the 32-bit two's complement integer `a' to
2061| the quadruple-precision floating-point format. The conversion is performed
2062| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2063*----------------------------------------------------------------------------*/
2064
e5a41ffa 2065float128 int32_to_float128(int32_t a, float_status *status)
158142c2
FB
2066{
2067 flag zSign;
3a87d009 2068 uint32_t absA;
8f506c70 2069 int8_t shiftCount;
bb98fe42 2070 uint64_t zSig0;
158142c2
FB
2071
2072 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2073 zSign = ( a < 0 );
2074 absA = zSign ? - a : a;
2075 shiftCount = countLeadingZeros32( absA ) + 17;
2076 zSig0 = absA;
2077 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
2078
2079}
2080
158142c2
FB
2081/*----------------------------------------------------------------------------
2082| Returns the result of converting the 64-bit two's complement integer `a'
2083| to the single-precision floating-point format. The conversion is performed
2084| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2085*----------------------------------------------------------------------------*/
2086
e5a41ffa 2087float32 int64_to_float32(int64_t a, float_status *status)
158142c2
FB
2088{
2089 flag zSign;
182f42fd 2090 uint64_t absA;
8f506c70 2091 int8_t shiftCount;
158142c2 2092
f090c9d4 2093 if ( a == 0 ) return float32_zero;
158142c2
FB
2094 zSign = ( a < 0 );
2095 absA = zSign ? - a : a;
2096 shiftCount = countLeadingZeros64( absA ) - 40;
2097 if ( 0 <= shiftCount ) {
2098 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
2099 }
2100 else {
2101 shiftCount += 7;
2102 if ( shiftCount < 0 ) {
2103 shift64RightJamming( absA, - shiftCount, &absA );
2104 }
2105 else {
2106 absA <<= shiftCount;
2107 }
ff32e16e 2108 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
158142c2
FB
2109 }
2110
2111}
2112
2113/*----------------------------------------------------------------------------
2114| Returns the result of converting the 64-bit two's complement integer `a'
2115| to the double-precision floating-point format. The conversion is performed
2116| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2117*----------------------------------------------------------------------------*/
2118
e5a41ffa 2119float64 int64_to_float64(int64_t a, float_status *status)
158142c2
FB
2120{
2121 flag zSign;
2122
f090c9d4 2123 if ( a == 0 ) return float64_zero;
bb98fe42 2124 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
2125 return packFloat64( 1, 0x43E, 0 );
2126 }
2127 zSign = ( a < 0 );
ff32e16e 2128 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
158142c2
FB
2129}
2130
158142c2
FB
2131/*----------------------------------------------------------------------------
2132| Returns the result of converting the 64-bit two's complement integer `a'
2133| to the extended double-precision floating-point format. The conversion
2134| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2135| Arithmetic.
2136*----------------------------------------------------------------------------*/
2137
e5a41ffa 2138floatx80 int64_to_floatx80(int64_t a, float_status *status)
158142c2
FB
2139{
2140 flag zSign;
182f42fd 2141 uint64_t absA;
8f506c70 2142 int8_t shiftCount;
158142c2
FB
2143
2144 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2145 zSign = ( a < 0 );
2146 absA = zSign ? - a : a;
2147 shiftCount = countLeadingZeros64( absA );
2148 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
2149
2150}
2151
158142c2
FB
2152/*----------------------------------------------------------------------------
2153| Returns the result of converting the 64-bit two's complement integer `a' to
2154| the quadruple-precision floating-point format. The conversion is performed
2155| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2156*----------------------------------------------------------------------------*/
2157
e5a41ffa 2158float128 int64_to_float128(int64_t a, float_status *status)
158142c2
FB
2159{
2160 flag zSign;
182f42fd 2161 uint64_t absA;
8f506c70 2162 int8_t shiftCount;
f4014512 2163 int32_t zExp;
bb98fe42 2164 uint64_t zSig0, zSig1;
158142c2
FB
2165
2166 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2167 zSign = ( a < 0 );
2168 absA = zSign ? - a : a;
2169 shiftCount = countLeadingZeros64( absA ) + 49;
2170 zExp = 0x406E - shiftCount;
2171 if ( 64 <= shiftCount ) {
2172 zSig1 = 0;
2173 zSig0 = absA;
2174 shiftCount -= 64;
2175 }
2176 else {
2177 zSig1 = absA;
2178 zSig0 = 0;
2179 }
2180 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2181 return packFloat128( zSign, zExp, zSig0, zSig1 );
2182
2183}
2184
6bb8e0f1
PM
2185/*----------------------------------------------------------------------------
2186| Returns the result of converting the 64-bit unsigned integer `a'
2187| to the single-precision floating-point format. The conversion is performed
2188| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2189*----------------------------------------------------------------------------*/
2190
e5a41ffa 2191float32 uint64_to_float32(uint64_t a, float_status *status)
6bb8e0f1
PM
2192{
2193 int shiftcount;
2194
2195 if (a == 0) {
2196 return float32_zero;
2197 }
2198
2199 /* Determine (left) shift needed to put first set bit into bit posn 23
2200 * (since packFloat32() expects the binary point between bits 23 and 22);
2201 * this is the fast case for smallish numbers.
2202 */
2203 shiftcount = countLeadingZeros64(a) - 40;
2204 if (shiftcount >= 0) {
2205 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
2206 }
2207 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
2208 * expects the binary point between bits 30 and 29, hence the + 7.
2209 */
2210 shiftcount += 7;
2211 if (shiftcount < 0) {
2212 shift64RightJamming(a, -shiftcount, &a);
2213 } else {
2214 a <<= shiftcount;
2215 }
2216
ff32e16e 2217 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
6bb8e0f1
PM
2218}
2219
2220/*----------------------------------------------------------------------------
2221| Returns the result of converting the 64-bit unsigned integer `a'
2222| to the double-precision floating-point format. The conversion is performed
2223| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2224*----------------------------------------------------------------------------*/
2225
e5a41ffa 2226float64 uint64_to_float64(uint64_t a, float_status *status)
6bb8e0f1
PM
2227{
2228 int exp = 0x43C;
2229 int shiftcount;
2230
2231 if (a == 0) {
2232 return float64_zero;
2233 }
2234
2235 shiftcount = countLeadingZeros64(a) - 1;
2236 if (shiftcount < 0) {
2237 shift64RightJamming(a, -shiftcount, &a);
2238 } else {
2239 a <<= shiftcount;
2240 }
ff32e16e 2241 return roundAndPackFloat64(0, exp - shiftcount, a, status);
6bb8e0f1
PM
2242}
2243
2244/*----------------------------------------------------------------------------
2245| Returns the result of converting the 64-bit unsigned integer `a'
2246| to the quadruple-precision floating-point format. The conversion is performed
2247| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2248*----------------------------------------------------------------------------*/
2249
e5a41ffa 2250float128 uint64_to_float128(uint64_t a, float_status *status)
1e397ead
RH
2251{
2252 if (a == 0) {
2253 return float128_zero;
2254 }
ff32e16e 2255 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1e397ead
RH
2256}
2257
158142c2
FB
2258/*----------------------------------------------------------------------------
2259| Returns the result of converting the single-precision floating-point value
2260| `a' to the 32-bit two's complement integer format. The conversion is
2261| performed according to the IEC/IEEE Standard for Binary Floating-Point
2262| Arithmetic---which means in particular that the conversion is rounded
2263| according to the current rounding mode. If `a' is a NaN, the largest
2264| positive integer is returned. Otherwise, if the conversion overflows, the
2265| largest integer with the same sign as `a' is returned.
2266*----------------------------------------------------------------------------*/
2267
f4014512 2268int32_t float32_to_int32(float32 a, float_status *status)
158142c2
FB
2269{
2270 flag aSign;
0c48262d 2271 int aExp;
07d792d2 2272 int shiftCount;
bb98fe42
AF
2273 uint32_t aSig;
2274 uint64_t aSig64;
158142c2 2275
ff32e16e 2276 a = float32_squash_input_denormal(a, status);
158142c2
FB
2277 aSig = extractFloat32Frac( a );
2278 aExp = extractFloat32Exp( a );
2279 aSign = extractFloat32Sign( a );
2280 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
2281 if ( aExp ) aSig |= 0x00800000;
2282 shiftCount = 0xAF - aExp;
2283 aSig64 = aSig;
2284 aSig64 <<= 32;
2285 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
ff32e16e 2286 return roundAndPackInt32(aSign, aSig64, status);
158142c2
FB
2287
2288}
2289
2290/*----------------------------------------------------------------------------
2291| Returns the result of converting the single-precision floating-point value
2292| `a' to the 32-bit two's complement integer format. The conversion is
2293| performed according to the IEC/IEEE Standard for Binary Floating-Point
2294| Arithmetic, except that the conversion is always rounded toward zero.
2295| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2296| the conversion overflows, the largest integer with the same sign as `a' is
2297| returned.
2298*----------------------------------------------------------------------------*/
2299
f4014512 2300int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
158142c2
FB
2301{
2302 flag aSign;
0c48262d 2303 int aExp;
07d792d2 2304 int shiftCount;
bb98fe42 2305 uint32_t aSig;
b3a6a2e0 2306 int32_t z;
ff32e16e 2307 a = float32_squash_input_denormal(a, status);
158142c2
FB
2308
2309 aSig = extractFloat32Frac( a );
2310 aExp = extractFloat32Exp( a );
2311 aSign = extractFloat32Sign( a );
2312 shiftCount = aExp - 0x9E;
2313 if ( 0 <= shiftCount ) {
f090c9d4 2314 if ( float32_val(a) != 0xCF000000 ) {
ff32e16e 2315 float_raise(float_flag_invalid, status);
158142c2
FB
2316 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
2317 }
bb98fe42 2318 return (int32_t) 0x80000000;
158142c2
FB
2319 }
2320 else if ( aExp <= 0x7E ) {
a2f2d288
PM
2321 if (aExp | aSig) {
2322 status->float_exception_flags |= float_flag_inexact;
2323 }
158142c2
FB
2324 return 0;
2325 }
2326 aSig = ( aSig | 0x00800000 )<<8;
2327 z = aSig>>( - shiftCount );
bb98fe42 2328 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
a2f2d288 2329 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
2330 }
2331 if ( aSign ) z = - z;
2332 return z;
2333
2334}
2335
cbcef455
PM
2336/*----------------------------------------------------------------------------
2337| Returns the result of converting the single-precision floating-point value
2338| `a' to the 16-bit two's complement integer format. The conversion is
2339| performed according to the IEC/IEEE Standard for Binary Floating-Point
2340| Arithmetic, except that the conversion is always rounded toward zero.
2341| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2342| the conversion overflows, the largest integer with the same sign as `a' is
2343| returned.
2344*----------------------------------------------------------------------------*/
2345
0bb721d7 2346int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
2347{
2348 flag aSign;
0c48262d 2349 int aExp;
07d792d2 2350 int shiftCount;
bb98fe42 2351 uint32_t aSig;
f4014512 2352 int32_t z;
cbcef455
PM
2353
2354 aSig = extractFloat32Frac( a );
2355 aExp = extractFloat32Exp( a );
2356 aSign = extractFloat32Sign( a );
2357 shiftCount = aExp - 0x8E;
2358 if ( 0 <= shiftCount ) {
2359 if ( float32_val(a) != 0xC7000000 ) {
ff32e16e 2360 float_raise(float_flag_invalid, status);
cbcef455
PM
2361 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2362 return 0x7FFF;
2363 }
2364 }
bb98fe42 2365 return (int32_t) 0xffff8000;
cbcef455
PM
2366 }
2367 else if ( aExp <= 0x7E ) {
2368 if ( aExp | aSig ) {
a2f2d288 2369 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
2370 }
2371 return 0;
2372 }
2373 shiftCount -= 0x10;
2374 aSig = ( aSig | 0x00800000 )<<8;
2375 z = aSig>>( - shiftCount );
bb98fe42 2376 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
a2f2d288 2377 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
2378 }
2379 if ( aSign ) {
2380 z = - z;
2381 }
2382 return z;
2383
2384}
2385
158142c2
FB
2386/*----------------------------------------------------------------------------
2387| Returns the result of converting the single-precision floating-point value
2388| `a' to the 64-bit two's complement integer format. The conversion is
2389| performed according to the IEC/IEEE Standard for Binary Floating-Point
2390| Arithmetic---which means in particular that the conversion is rounded
2391| according to the current rounding mode. If `a' is a NaN, the largest
2392| positive integer is returned. Otherwise, if the conversion overflows, the
2393| largest integer with the same sign as `a' is returned.
2394*----------------------------------------------------------------------------*/
2395
f42c2224 2396int64_t float32_to_int64(float32 a, float_status *status)
158142c2
FB
2397{
2398 flag aSign;
0c48262d 2399 int aExp;
07d792d2 2400 int shiftCount;
bb98fe42
AF
2401 uint32_t aSig;
2402 uint64_t aSig64, aSigExtra;
ff32e16e 2403 a = float32_squash_input_denormal(a, status);
158142c2
FB
2404
2405 aSig = extractFloat32Frac( a );
2406 aExp = extractFloat32Exp( a );
2407 aSign = extractFloat32Sign( a );
2408 shiftCount = 0xBE - aExp;
2409 if ( shiftCount < 0 ) {
ff32e16e 2410 float_raise(float_flag_invalid, status);
158142c2
FB
2411 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2412 return LIT64( 0x7FFFFFFFFFFFFFFF );
2413 }
bb98fe42 2414 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2415 }
2416 if ( aExp ) aSig |= 0x00800000;
2417 aSig64 = aSig;
2418 aSig64 <<= 40;
2419 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
ff32e16e 2420 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
158142c2
FB
2421
2422}
2423
2f18bbf9
TM
2424/*----------------------------------------------------------------------------
2425| Returns the result of converting the single-precision floating-point value
2426| `a' to the 64-bit unsigned integer format. The conversion is
2427| performed according to the IEC/IEEE Standard for Binary Floating-Point
2428| Arithmetic---which means in particular that the conversion is rounded
2429| according to the current rounding mode. If `a' is a NaN, the largest
2430| unsigned integer is returned. Otherwise, if the conversion overflows, the
2431| largest unsigned integer is returned. If the 'a' is negative, the result
2432| is rounded and zero is returned; values that do not round to zero will
2433| raise the inexact exception flag.
2434*----------------------------------------------------------------------------*/
2435
182f42fd 2436uint64_t float32_to_uint64(float32 a, float_status *status)
2f18bbf9
TM
2437{
2438 flag aSign;
0c48262d 2439 int aExp;
07d792d2 2440 int shiftCount;
2f18bbf9
TM
2441 uint32_t aSig;
2442 uint64_t aSig64, aSigExtra;
ff32e16e 2443 a = float32_squash_input_denormal(a, status);
2f18bbf9
TM
2444
2445 aSig = extractFloat32Frac(a);
2446 aExp = extractFloat32Exp(a);
2447 aSign = extractFloat32Sign(a);
2448 if ((aSign) && (aExp > 126)) {
ff32e16e 2449 float_raise(float_flag_invalid, status);
2f18bbf9
TM
2450 if (float32_is_any_nan(a)) {
2451 return LIT64(0xFFFFFFFFFFFFFFFF);
2452 } else {
2453 return 0;
2454 }
2455 }
2456 shiftCount = 0xBE - aExp;
2457 if (aExp) {
2458 aSig |= 0x00800000;
2459 }
2460 if (shiftCount < 0) {
ff32e16e 2461 float_raise(float_flag_invalid, status);
2f18bbf9
TM
2462 return LIT64(0xFFFFFFFFFFFFFFFF);
2463 }
2464
2465 aSig64 = aSig;
2466 aSig64 <<= 40;
2467 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
ff32e16e 2468 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
2f18bbf9
TM
2469}
2470
a13d4489
TM
2471/*----------------------------------------------------------------------------
2472| Returns the result of converting the single-precision floating-point value
2473| `a' to the 64-bit unsigned integer format. The conversion is
2474| performed according to the IEC/IEEE Standard for Binary Floating-Point
2475| Arithmetic, except that the conversion is always rounded toward zero. If
2476| `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
2477| conversion overflows, the largest unsigned integer is returned. If the
2478| 'a' is negative, the result is rounded and zero is returned; values that do
2479| not round to zero will raise the inexact flag.
2480*----------------------------------------------------------------------------*/
2481
182f42fd 2482uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
a13d4489 2483{
a2f2d288 2484 signed char current_rounding_mode = status->float_rounding_mode;
ff32e16e
PM
2485 set_float_rounding_mode(float_round_to_zero, status);
2486 int64_t v = float32_to_uint64(a, status);
2487 set_float_rounding_mode(current_rounding_mode, status);
a13d4489
TM
2488 return v;
2489}
2490
158142c2
FB
2491/*----------------------------------------------------------------------------
2492| Returns the result of converting the single-precision floating-point value
2493| `a' to the 64-bit two's complement integer format. The conversion is
2494| performed according to the IEC/IEEE Standard for Binary Floating-Point
2495| Arithmetic, except that the conversion is always rounded toward zero. If
2496| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
2497| conversion overflows, the largest integer with the same sign as `a' is
2498| returned.
2499*----------------------------------------------------------------------------*/
2500
f42c2224 2501int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
158142c2
FB
2502{
2503 flag aSign;
0c48262d 2504 int aExp;
07d792d2 2505 int shiftCount;
bb98fe42
AF
2506 uint32_t aSig;
2507 uint64_t aSig64;
f42c2224 2508 int64_t z;
ff32e16e 2509 a = float32_squash_input_denormal(a, status);
158142c2
FB
2510
2511 aSig = extractFloat32Frac( a );
2512 aExp = extractFloat32Exp( a );
2513 aSign = extractFloat32Sign( a );
2514 shiftCount = aExp - 0xBE;
2515 if ( 0 <= shiftCount ) {
f090c9d4 2516 if ( float32_val(a) != 0xDF000000 ) {
ff32e16e 2517 float_raise(float_flag_invalid, status);
158142c2
FB
2518 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2519 return LIT64( 0x7FFFFFFFFFFFFFFF );
2520 }
2521 }
bb98fe42 2522 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
2523 }
2524 else if ( aExp <= 0x7E ) {
a2f2d288
PM
2525 if (aExp | aSig) {
2526 status->float_exception_flags |= float_flag_inexact;
2527 }
158142c2
FB
2528 return 0;
2529 }
2530 aSig64 = aSig | 0x00800000;
2531 aSig64 <<= 40;
2532 z = aSig64>>( - shiftCount );
bb98fe42 2533 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
a2f2d288 2534 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
2535 }
2536 if ( aSign ) z = - z;
2537 return z;
2538
2539}
2540
2541/*----------------------------------------------------------------------------
2542| Returns the result of converting the single-precision floating-point value
2543| `a' to the double-precision floating-point format. The conversion is
2544| performed according to the IEC/IEEE Standard for Binary Floating-Point
2545| Arithmetic.
2546*----------------------------------------------------------------------------*/
2547
e5a41ffa 2548float64 float32_to_float64(float32 a, float_status *status)
158142c2
FB
2549{
2550 flag aSign;
0c48262d 2551 int aExp;
bb98fe42 2552 uint32_t aSig;
ff32e16e 2553 a = float32_squash_input_denormal(a, status);
158142c2
FB
2554
2555 aSig = extractFloat32Frac( a );
2556 aExp = extractFloat32Exp( a );
2557 aSign = extractFloat32Sign( a );
2558 if ( aExp == 0xFF ) {
ff32e16e
PM
2559 if (aSig) {
2560 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
2561 }
158142c2
FB
2562 return packFloat64( aSign, 0x7FF, 0 );
2563 }
2564 if ( aExp == 0 ) {
2565 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
2566 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2567 --aExp;
2568 }
bb98fe42 2569 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
2570
2571}
2572
158142c2
FB
2573/*----------------------------------------------------------------------------
2574| Returns the result of converting the single-precision floating-point value
2575| `a' to the extended double-precision floating-point format. The conversion
2576| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2577| Arithmetic.
2578*----------------------------------------------------------------------------*/
2579
e5a41ffa 2580floatx80 float32_to_floatx80(float32 a, float_status *status)
158142c2
FB
2581{
2582 flag aSign;
0c48262d 2583 int aExp;
bb98fe42 2584 uint32_t aSig;
158142c2 2585
ff32e16e 2586 a = float32_squash_input_denormal(a, status);
158142c2
FB
2587 aSig = extractFloat32Frac( a );
2588 aExp = extractFloat32Exp( a );
2589 aSign = extractFloat32Sign( a );
2590 if ( aExp == 0xFF ) {
ff32e16e
PM
2591 if (aSig) {
2592 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
2593 }
158142c2
FB
2594 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2595 }
2596 if ( aExp == 0 ) {
2597 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2598 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2599 }
2600 aSig |= 0x00800000;
bb98fe42 2601 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
2602
2603}
2604
158142c2
FB
2605/*----------------------------------------------------------------------------
2606| Returns the result of converting the single-precision floating-point value
2607| `a' to the double-precision floating-point format. The conversion is
2608| performed according to the IEC/IEEE Standard for Binary Floating-Point
2609| Arithmetic.
2610*----------------------------------------------------------------------------*/
2611
e5a41ffa 2612float128 float32_to_float128(float32 a, float_status *status)
158142c2
FB
2613{
2614 flag aSign;
0c48262d 2615 int aExp;
bb98fe42 2616 uint32_t aSig;
158142c2 2617
ff32e16e 2618 a = float32_squash_input_denormal(a, status);
158142c2
FB
2619 aSig = extractFloat32Frac( a );
2620 aExp = extractFloat32Exp( a );
2621 aSign = extractFloat32Sign( a );
2622 if ( aExp == 0xFF ) {
ff32e16e
PM
2623 if (aSig) {
2624 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
2625 }
158142c2
FB
2626 return packFloat128( aSign, 0x7FFF, 0, 0 );
2627 }
2628 if ( aExp == 0 ) {
2629 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2630 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2631 --aExp;
2632 }
bb98fe42 2633 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
2634
2635}
2636
158142c2
FB
2637/*----------------------------------------------------------------------------
2638| Rounds the single-precision floating-point value `a' to an integer, and
2639| returns the result as a single-precision floating-point value. The
2640| operation is performed according to the IEC/IEEE Standard for Binary
2641| Floating-Point Arithmetic.
2642*----------------------------------------------------------------------------*/
2643
e5a41ffa 2644float32 float32_round_to_int(float32 a, float_status *status)
158142c2
FB
2645{
2646 flag aSign;
0c48262d 2647 int aExp;
bb98fe42 2648 uint32_t lastBitMask, roundBitsMask;
bb98fe42 2649 uint32_t z;
ff32e16e 2650 a = float32_squash_input_denormal(a, status);
158142c2
FB
2651
2652 aExp = extractFloat32Exp( a );
2653 if ( 0x96 <= aExp ) {
2654 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
ff32e16e 2655 return propagateFloat32NaN(a, a, status);
158142c2
FB
2656 }
2657 return a;
2658 }
2659 if ( aExp <= 0x7E ) {
bb98fe42 2660 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
a2f2d288 2661 status->float_exception_flags |= float_flag_inexact;
158142c2 2662 aSign = extractFloat32Sign( a );
a2f2d288 2663 switch (status->float_rounding_mode) {
158142c2
FB
2664 case float_round_nearest_even:
2665 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
2666 return packFloat32( aSign, 0x7F, 0 );
2667 }
2668 break;
f9288a76
PM
2669 case float_round_ties_away:
2670 if (aExp == 0x7E) {
2671 return packFloat32(aSign, 0x7F, 0);
2672 }
2673 break;
158142c2 2674 case float_round_down:
f090c9d4 2675 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 2676 case float_round_up:
f090c9d4 2677 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
2678 }
2679 return packFloat32( aSign, 0, 0 );
2680 }
2681 lastBitMask = 1;
2682 lastBitMask <<= 0x96 - aExp;
2683 roundBitsMask = lastBitMask - 1;
f090c9d4 2684 z = float32_val(a);
a2f2d288 2685 switch (status->float_rounding_mode) {
dc355b76 2686 case float_round_nearest_even:
158142c2 2687 z += lastBitMask>>1;
dc355b76
PM
2688 if ((z & roundBitsMask) == 0) {
2689 z &= ~lastBitMask;
2690 }
2691 break;
f9288a76
PM
2692 case float_round_ties_away:
2693 z += lastBitMask >> 1;
2694 break;
dc355b76
PM
2695 case float_round_to_zero:
2696 break;
2697 case float_round_up:
2698 if (!extractFloat32Sign(make_float32(z))) {
2699 z += roundBitsMask;
2700 }
2701 break;
2702 case float_round_down:
2703 if (extractFloat32Sign(make_float32(z))) {
158142c2
FB
2704 z += roundBitsMask;
2705 }
dc355b76
PM
2706 break;
2707 default:
2708 abort();
158142c2
FB
2709 }
2710 z &= ~ roundBitsMask;
a2f2d288
PM
2711 if (z != float32_val(a)) {
2712 status->float_exception_flags |= float_flag_inexact;
2713 }
f090c9d4 2714 return make_float32(z);
158142c2
FB
2715
2716}
2717
158142c2
FB
2718/*----------------------------------------------------------------------------
2719| Returns the remainder of the single-precision floating-point value `a'
2720| with respect to the corresponding value `b'. The operation is performed
2721| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2722*----------------------------------------------------------------------------*/
2723
e5a41ffa 2724float32 float32_rem(float32 a, float32 b, float_status *status)
158142c2 2725{
ed086f3d 2726 flag aSign, zSign;
0c48262d 2727 int aExp, bExp, expDiff;
bb98fe42
AF
2728 uint32_t aSig, bSig;
2729 uint32_t q;
2730 uint64_t aSig64, bSig64, q64;
2731 uint32_t alternateASig;
2732 int32_t sigMean;
ff32e16e
PM
2733 a = float32_squash_input_denormal(a, status);
2734 b = float32_squash_input_denormal(b, status);
158142c2
FB
2735
2736 aSig = extractFloat32Frac( a );
2737 aExp = extractFloat32Exp( a );
2738 aSign = extractFloat32Sign( a );
2739 bSig = extractFloat32Frac( b );
2740 bExp = extractFloat32Exp( b );
158142c2
FB
2741 if ( aExp == 0xFF ) {
2742 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 2743 return propagateFloat32NaN(a, b, status);
158142c2 2744 }
ff32e16e 2745 float_raise(float_flag_invalid, status);
af39bc8c 2746 return float32_default_nan(status);
158142c2
FB
2747 }
2748 if ( bExp == 0xFF ) {
ff32e16e
PM
2749 if (bSig) {
2750 return propagateFloat32NaN(a, b, status);
2751 }
158142c2
FB
2752 return a;
2753 }
2754 if ( bExp == 0 ) {
2755 if ( bSig == 0 ) {
ff32e16e 2756 float_raise(float_flag_invalid, status);
af39bc8c 2757 return float32_default_nan(status);
158142c2
FB
2758 }
2759 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2760 }
2761 if ( aExp == 0 ) {
2762 if ( aSig == 0 ) return a;
2763 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2764 }
2765 expDiff = aExp - bExp;
2766 aSig |= 0x00800000;
2767 bSig |= 0x00800000;
2768 if ( expDiff < 32 ) {
2769 aSig <<= 8;
2770 bSig <<= 8;
2771 if ( expDiff < 0 ) {
2772 if ( expDiff < -1 ) return a;
2773 aSig >>= 1;
2774 }
2775 q = ( bSig <= aSig );
2776 if ( q ) aSig -= bSig;
2777 if ( 0 < expDiff ) {
bb98fe42 2778 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2779 q >>= 32 - expDiff;
2780 bSig >>= 2;
2781 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2782 }
2783 else {
2784 aSig >>= 2;
2785 bSig >>= 2;
2786 }
2787 }
2788 else {
2789 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2790 aSig64 = ( (uint64_t) aSig )<<40;
2791 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2792 expDiff -= 64;
2793 while ( 0 < expDiff ) {
2794 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2795 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2796 aSig64 = - ( ( bSig * q64 )<<38 );
2797 expDiff -= 62;
2798 }
2799 expDiff += 64;
2800 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2801 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2802 q = q64>>( 64 - expDiff );
2803 bSig <<= 6;
2804 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2805 }
2806 do {
2807 alternateASig = aSig;
2808 ++q;
2809 aSig -= bSig;
bb98fe42 2810 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2811 sigMean = aSig + alternateASig;
2812 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2813 aSig = alternateASig;
2814 }
bb98fe42 2815 zSign = ( (int32_t) aSig < 0 );
158142c2 2816 if ( zSign ) aSig = - aSig;
ff32e16e 2817 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
2818}
2819
369be8f6
PM
2820/*----------------------------------------------------------------------------
2821| Returns the result of multiplying the single-precision floating-point values
2822| `a' and `b' then adding 'c', with no intermediate rounding step after the
2823| multiplication. The operation is performed according to the IEC/IEEE
2824| Standard for Binary Floating-Point Arithmetic 754-2008.
2825| The flags argument allows the caller to select negation of the
2826| addend, the intermediate product, or the final result. (The difference
2827| between this and having the caller do a separate negation is that negating
2828| externally will flip the sign bit on NaNs.)
2829*----------------------------------------------------------------------------*/
2830
e5a41ffa
PM
2831float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2832 float_status *status)
369be8f6
PM
2833{
2834 flag aSign, bSign, cSign, zSign;
0c48262d 2835 int aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2836 uint32_t aSig, bSig, cSig;
2837 flag pInf, pZero, pSign;
2838 uint64_t pSig64, cSig64, zSig64;
2839 uint32_t pSig;
2840 int shiftcount;
2841 flag signflip, infzero;
2842
ff32e16e
PM
2843 a = float32_squash_input_denormal(a, status);
2844 b = float32_squash_input_denormal(b, status);
2845 c = float32_squash_input_denormal(c, status);
369be8f6
PM
2846 aSig = extractFloat32Frac(a);
2847 aExp = extractFloat32Exp(a);
2848 aSign = extractFloat32Sign(a);
2849 bSig = extractFloat32Frac(b);
2850 bExp = extractFloat32Exp(b);
2851 bSign = extractFloat32Sign(b);
2852 cSig = extractFloat32Frac(c);
2853 cExp = extractFloat32Exp(c);
2854 cSign = extractFloat32Sign(c);
2855
2856 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2857 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2858
2859 /* It is implementation-defined whether the cases of (0,inf,qnan)
2860 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2861 * they return if they do), so we have to hand this information
2862 * off to the target-specific pick-a-NaN routine.
2863 */
2864 if (((aExp == 0xff) && aSig) ||
2865 ((bExp == 0xff) && bSig) ||
2866 ((cExp == 0xff) && cSig)) {
ff32e16e 2867 return propagateFloat32MulAddNaN(a, b, c, infzero, status);
369be8f6
PM
2868 }
2869
2870 if (infzero) {
ff32e16e 2871 float_raise(float_flag_invalid, status);
af39bc8c 2872 return float32_default_nan(status);
369be8f6
PM
2873 }
2874
2875 if (flags & float_muladd_negate_c) {
2876 cSign ^= 1;
2877 }
2878
2879 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2880
2881 /* Work out the sign and type of the product */
2882 pSign = aSign ^ bSign;
2883 if (flags & float_muladd_negate_product) {
2884 pSign ^= 1;
2885 }
2886 pInf = (aExp == 0xff) || (bExp == 0xff);
2887 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2888
2889 if (cExp == 0xff) {
2890 if (pInf && (pSign ^ cSign)) {
2891 /* addition of opposite-signed infinities => InvalidOperation */
ff32e16e 2892 float_raise(float_flag_invalid, status);
af39bc8c 2893 return float32_default_nan(status);
369be8f6
PM
2894 }
2895 /* Otherwise generate an infinity of the same sign */
2896 return packFloat32(cSign ^ signflip, 0xff, 0);
2897 }
2898
2899 if (pInf) {
2900 return packFloat32(pSign ^ signflip, 0xff, 0);
2901 }
2902
2903 if (pZero) {
2904 if (cExp == 0) {
2905 if (cSig == 0) {
2906 /* Adding two exact zeroes */
2907 if (pSign == cSign) {
2908 zSign = pSign;
a2f2d288 2909 } else if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
2910 zSign = 1;
2911 } else {
2912 zSign = 0;
2913 }
2914 return packFloat32(zSign ^ signflip, 0, 0);
2915 }
2916 /* Exact zero plus a denorm */
a2f2d288 2917 if (status->flush_to_zero) {
ff32e16e 2918 float_raise(float_flag_output_denormal, status);
369be8f6
PM
2919 return packFloat32(cSign ^ signflip, 0, 0);
2920 }
2921 }
2922 /* Zero plus something non-zero : just return the something */
67d43538
PM
2923 if (flags & float_muladd_halve_result) {
2924 if (cExp == 0) {
2925 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2926 }
2927 /* Subtract one to halve, and one again because roundAndPackFloat32
2928 * wants one less than the true exponent.
2929 */
2930 cExp -= 2;
2931 cSig = (cSig | 0x00800000) << 7;
ff32e16e 2932 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
67d43538 2933 }
a6e7c184 2934 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2935 }
2936
2937 if (aExp == 0) {
2938 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2939 }
2940 if (bExp == 0) {
2941 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2942 }
2943
2944 /* Calculate the actual result a * b + c */
2945
2946 /* Multiply first; this is easy. */
2947 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2948 * because we want the true exponent, not the "one-less-than"
2949 * flavour that roundAndPackFloat32() takes.
2950 */
2951 pExp = aExp + bExp - 0x7e;
2952 aSig = (aSig | 0x00800000) << 7;
2953 bSig = (bSig | 0x00800000) << 8;
2954 pSig64 = (uint64_t)aSig * bSig;
2955 if ((int64_t)(pSig64 << 1) >= 0) {
2956 pSig64 <<= 1;
2957 pExp--;
2958 }
2959
2960 zSign = pSign ^ signflip;
2961
2962 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2963 * position 62.
2964 */
2965 if (cExp == 0) {
2966 if (!cSig) {
2967 /* Throw out the special case of c being an exact zero now */
2968 shift64RightJamming(pSig64, 32, &pSig64);
2969 pSig = pSig64;
67d43538
PM
2970 if (flags & float_muladd_halve_result) {
2971 pExp--;
2972 }
369be8f6 2973 return roundAndPackFloat32(zSign, pExp - 1,
ff32e16e 2974 pSig, status);
369be8f6
PM
2975 }
2976 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2977 }
2978
2979 cSig64 = (uint64_t)cSig << (62 - 23);
2980 cSig64 |= LIT64(0x4000000000000000);
2981 expDiff = pExp - cExp;
2982
2983 if (pSign == cSign) {
2984 /* Addition */
2985 if (expDiff > 0) {
2986 /* scale c to match p */
2987 shift64RightJamming(cSig64, expDiff, &cSig64);
2988 zExp = pExp;
2989 } else if (expDiff < 0) {
2990 /* scale p to match c */
2991 shift64RightJamming(pSig64, -expDiff, &pSig64);
2992 zExp = cExp;
2993 } else {
2994 /* no scaling needed */
2995 zExp = cExp;
2996 }
2997 /* Add significands and make sure explicit bit ends up in posn 62 */
2998 zSig64 = pSig64 + cSig64;
2999 if ((int64_t)zSig64 < 0) {
3000 shift64RightJamming(zSig64, 1, &zSig64);
3001 } else {
3002 zExp--;
3003 }
3004 } else {
3005 /* Subtraction */
3006 if (expDiff > 0) {
3007 shift64RightJamming(cSig64, expDiff, &cSig64);
3008 zSig64 = pSig64 - cSig64;
3009 zExp = pExp;
3010 } else if (expDiff < 0) {
3011 shift64RightJamming(pSig64, -expDiff, &pSig64);
3012 zSig64 = cSig64 - pSig64;
3013 zExp = cExp;
3014 zSign ^= 1;
3015 } else {
3016 zExp = pExp;
3017 if (cSig64 < pSig64) {
3018 zSig64 = pSig64 - cSig64;
3019 } else if (pSig64 < cSig64) {
3020 zSig64 = cSig64 - pSig64;
3021 zSign ^= 1;
3022 } else {
3023 /* Exact zero */
3024 zSign = signflip;
a2f2d288 3025 if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
3026 zSign ^= 1;
3027 }
3028 return packFloat32(zSign, 0, 0);
3029 }
3030 }
3031 --zExp;
3032 /* Normalize to put the explicit bit back into bit 62. */
3033 shiftcount = countLeadingZeros64(zSig64) - 1;
3034 zSig64 <<= shiftcount;
3035 zExp -= shiftcount;
3036 }
67d43538
PM
3037 if (flags & float_muladd_halve_result) {
3038 zExp--;
3039 }
3040
369be8f6 3041 shift64RightJamming(zSig64, 32, &zSig64);
ff32e16e 3042 return roundAndPackFloat32(zSign, zExp, zSig64, status);
369be8f6
PM
3043}
3044
3045
158142c2
FB
3046/*----------------------------------------------------------------------------
3047| Returns the square root of the single-precision floating-point value `a'.
3048| The operation is performed according to the IEC/IEEE Standard for Binary
3049| Floating-Point Arithmetic.
3050*----------------------------------------------------------------------------*/
3051
e5a41ffa 3052float32 float32_sqrt(float32 a, float_status *status)
158142c2
FB
3053{
3054 flag aSign;
0c48262d 3055 int aExp, zExp;
bb98fe42
AF
3056 uint32_t aSig, zSig;
3057 uint64_t rem, term;
ff32e16e 3058 a = float32_squash_input_denormal(a, status);
158142c2
FB
3059
3060 aSig = extractFloat32Frac( a );
3061 aExp = extractFloat32Exp( a );
3062 aSign = extractFloat32Sign( a );
3063 if ( aExp == 0xFF ) {
ff32e16e
PM
3064 if (aSig) {
3065 return propagateFloat32NaN(a, float32_zero, status);
3066 }
158142c2 3067 if ( ! aSign ) return a;
ff32e16e 3068 float_raise(float_flag_invalid, status);
af39bc8c 3069 return float32_default_nan(status);
158142c2
FB
3070 }
3071 if ( aSign ) {
3072 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 3073 float_raise(float_flag_invalid, status);
af39bc8c 3074 return float32_default_nan(status);
158142c2
FB
3075 }
3076 if ( aExp == 0 ) {
f090c9d4 3077 if ( aSig == 0 ) return float32_zero;
158142c2
FB
3078 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3079 }
3080 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3081 aSig = ( aSig | 0x00800000 )<<8;
3082 zSig = estimateSqrt32( aExp, aSig ) + 2;
3083 if ( ( zSig & 0x7F ) <= 5 ) {
3084 if ( zSig < 2 ) {
3085 zSig = 0x7FFFFFFF;
3086 goto roundAndPack;
3087 }
3088 aSig >>= aExp & 1;
bb98fe42
AF
3089 term = ( (uint64_t) zSig ) * zSig;
3090 rem = ( ( (uint64_t) aSig )<<32 ) - term;
3091 while ( (int64_t) rem < 0 ) {
158142c2 3092 --zSig;
bb98fe42 3093 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
3094 }
3095 zSig |= ( rem != 0 );
3096 }
3097 shift32RightJamming( zSig, 1, &zSig );
3098 roundAndPack:
ff32e16e 3099 return roundAndPackFloat32(0, zExp, zSig, status);
158142c2
FB
3100
3101}
3102
8229c991
AJ
3103/*----------------------------------------------------------------------------
3104| Returns the binary exponential of the single-precision floating-point value
3105| `a'. The operation is performed according to the IEC/IEEE Standard for
3106| Binary Floating-Point Arithmetic.
3107|
3108| Uses the following identities:
3109|
3110| 1. -------------------------------------------------------------------------
3111| x x*ln(2)
3112| 2 = e
3113|
3114| 2. -------------------------------------------------------------------------
3115| 2 3 4 5 n
3116| x x x x x x x
3117| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3118| 1! 2! 3! 4! 5! n!
3119*----------------------------------------------------------------------------*/
3120
3121static const float64 float32_exp2_coefficients[15] =
3122{
d5138cf4
PM
3123 const_float64( 0x3ff0000000000000ll ), /* 1 */
3124 const_float64( 0x3fe0000000000000ll ), /* 2 */
3125 const_float64( 0x3fc5555555555555ll ), /* 3 */
3126 const_float64( 0x3fa5555555555555ll ), /* 4 */
3127 const_float64( 0x3f81111111111111ll ), /* 5 */
3128 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3129 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3130 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3131 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3132 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3133 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3134 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3135 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3136 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3137 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
3138};
3139
e5a41ffa 3140float32 float32_exp2(float32 a, float_status *status)
8229c991
AJ
3141{
3142 flag aSign;
0c48262d 3143 int aExp;
bb98fe42 3144 uint32_t aSig;
8229c991
AJ
3145 float64 r, x, xn;
3146 int i;
ff32e16e 3147 a = float32_squash_input_denormal(a, status);
8229c991
AJ
3148
3149 aSig = extractFloat32Frac( a );
3150 aExp = extractFloat32Exp( a );
3151 aSign = extractFloat32Sign( a );
3152
3153 if ( aExp == 0xFF) {
ff32e16e
PM
3154 if (aSig) {
3155 return propagateFloat32NaN(a, float32_zero, status);
3156 }
8229c991
AJ
3157 return (aSign) ? float32_zero : a;
3158 }
3159 if (aExp == 0) {
3160 if (aSig == 0) return float32_one;
3161 }
3162
ff32e16e 3163 float_raise(float_flag_inexact, status);
8229c991
AJ
3164
3165 /* ******************************* */
3166 /* using float64 for approximation */
3167 /* ******************************* */
ff32e16e
PM
3168 x = float32_to_float64(a, status);
3169 x = float64_mul(x, float64_ln2, status);
8229c991
AJ
3170
3171 xn = x;
3172 r = float64_one;
3173 for (i = 0 ; i < 15 ; i++) {
3174 float64 f;
3175
ff32e16e
PM
3176 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3177 r = float64_add(r, f, status);
8229c991 3178
ff32e16e 3179 xn = float64_mul(xn, x, status);
8229c991
AJ
3180 }
3181
3182 return float64_to_float32(r, status);
3183}
3184
374dfc33
AJ
3185/*----------------------------------------------------------------------------
3186| Returns the binary log of the single-precision floating-point value `a'.
3187| The operation is performed according to the IEC/IEEE Standard for Binary
3188| Floating-Point Arithmetic.
3189*----------------------------------------------------------------------------*/
e5a41ffa 3190float32 float32_log2(float32 a, float_status *status)
374dfc33
AJ
3191{
3192 flag aSign, zSign;
0c48262d 3193 int aExp;
bb98fe42 3194 uint32_t aSig, zSig, i;
374dfc33 3195
ff32e16e 3196 a = float32_squash_input_denormal(a, status);
374dfc33
AJ
3197 aSig = extractFloat32Frac( a );
3198 aExp = extractFloat32Exp( a );
3199 aSign = extractFloat32Sign( a );
3200
3201 if ( aExp == 0 ) {
3202 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3203 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3204 }
3205 if ( aSign ) {
ff32e16e 3206 float_raise(float_flag_invalid, status);
af39bc8c 3207 return float32_default_nan(status);
374dfc33
AJ
3208 }
3209 if ( aExp == 0xFF ) {
ff32e16e
PM
3210 if (aSig) {
3211 return propagateFloat32NaN(a, float32_zero, status);
3212 }
374dfc33
AJ
3213 return a;
3214 }
3215
3216 aExp -= 0x7F;
3217 aSig |= 0x00800000;
3218 zSign = aExp < 0;
3219 zSig = aExp << 23;
3220
3221 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 3222 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
3223 if ( aSig & 0x01000000 ) {
3224 aSig >>= 1;
3225 zSig |= i;
3226 }
3227 }
3228
3229 if ( zSign )
3230 zSig = -zSig;
3231
ff32e16e 3232 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
374dfc33
AJ
3233}
3234
158142c2
FB
3235/*----------------------------------------------------------------------------
3236| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
3237| the corresponding value `b', and 0 otherwise. The invalid exception is
3238| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
3239| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3240*----------------------------------------------------------------------------*/
3241
e5a41ffa 3242int float32_eq(float32 a, float32 b, float_status *status)
158142c2 3243{
b689362d 3244 uint32_t av, bv;
ff32e16e
PM
3245 a = float32_squash_input_denormal(a, status);
3246 b = float32_squash_input_denormal(b, status);
158142c2
FB
3247
3248 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3249 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3250 ) {
ff32e16e 3251 float_raise(float_flag_invalid, status);
158142c2
FB
3252 return 0;
3253 }
b689362d
AJ
3254 av = float32_val(a);
3255 bv = float32_val(b);
3256 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
3257}
3258
3259/*----------------------------------------------------------------------------
3260| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
3261| or equal to the corresponding value `b', and 0 otherwise. The invalid
3262| exception is raised if either operand is a NaN. The comparison is performed
3263| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3264*----------------------------------------------------------------------------*/
3265
e5a41ffa 3266int float32_le(float32 a, float32 b, float_status *status)
158142c2
FB
3267{
3268 flag aSign, bSign;
bb98fe42 3269 uint32_t av, bv;
ff32e16e
PM
3270 a = float32_squash_input_denormal(a, status);
3271 b = float32_squash_input_denormal(b, status);
158142c2
FB
3272
3273 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3274 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3275 ) {
ff32e16e 3276 float_raise(float_flag_invalid, status);
158142c2
FB
3277 return 0;
3278 }
3279 aSign = extractFloat32Sign( a );
3280 bSign = extractFloat32Sign( b );
f090c9d4
PB
3281 av = float32_val(a);
3282 bv = float32_val(b);
bb98fe42 3283 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3284 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3285
3286}
3287
3288/*----------------------------------------------------------------------------
3289| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
3290| the corresponding value `b', and 0 otherwise. The invalid exception is
3291| raised if either operand is a NaN. The comparison is performed according
3292| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3293*----------------------------------------------------------------------------*/
3294
e5a41ffa 3295int float32_lt(float32 a, float32 b, float_status *status)
158142c2
FB
3296{
3297 flag aSign, bSign;
bb98fe42 3298 uint32_t av, bv;
ff32e16e
PM
3299 a = float32_squash_input_denormal(a, status);
3300 b = float32_squash_input_denormal(b, status);
158142c2
FB
3301
3302 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3303 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3304 ) {
ff32e16e 3305 float_raise(float_flag_invalid, status);
158142c2
FB
3306 return 0;
3307 }
3308 aSign = extractFloat32Sign( a );
3309 bSign = extractFloat32Sign( b );
f090c9d4
PB
3310 av = float32_val(a);
3311 bv = float32_val(b);
bb98fe42 3312 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3313 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3314
3315}
3316
67b7861d
AJ
3317/*----------------------------------------------------------------------------
3318| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
3319| be compared, and 0 otherwise. The invalid exception is raised if either
3320| operand is a NaN. The comparison is performed according to the IEC/IEEE
3321| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
3322*----------------------------------------------------------------------------*/
3323
e5a41ffa 3324int float32_unordered(float32 a, float32 b, float_status *status)
67b7861d 3325{
ff32e16e
PM
3326 a = float32_squash_input_denormal(a, status);
3327 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
3328
3329 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3330 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3331 ) {
ff32e16e 3332 float_raise(float_flag_invalid, status);
67b7861d
AJ
3333 return 1;
3334 }
3335 return 0;
3336}
b689362d 3337
158142c2
FB
3338/*----------------------------------------------------------------------------
3339| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
3340| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3341| exception. The comparison is performed according to the IEC/IEEE Standard
3342| for Binary Floating-Point Arithmetic.
158142c2
FB
3343*----------------------------------------------------------------------------*/
3344
e5a41ffa 3345int float32_eq_quiet(float32 a, float32 b, float_status *status)
158142c2 3346{
ff32e16e
PM
3347 a = float32_squash_input_denormal(a, status);
3348 b = float32_squash_input_denormal(b, status);
158142c2
FB
3349
3350 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3351 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3352 ) {
af39bc8c
AM
3353 if (float32_is_signaling_nan(a, status)
3354 || float32_is_signaling_nan(b, status)) {
ff32e16e 3355 float_raise(float_flag_invalid, status);
b689362d 3356 }
158142c2
FB
3357 return 0;
3358 }
b689362d
AJ
3359 return ( float32_val(a) == float32_val(b) ) ||
3360 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
3361}
3362
3363/*----------------------------------------------------------------------------
3364| Returns 1 if the single-precision floating-point value `a' is less than or
3365| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3366| cause an exception. Otherwise, the comparison is performed according to the
3367| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3368*----------------------------------------------------------------------------*/
3369
e5a41ffa 3370int float32_le_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
3371{
3372 flag aSign, bSign;
bb98fe42 3373 uint32_t av, bv;
ff32e16e
PM
3374 a = float32_squash_input_denormal(a, status);
3375 b = float32_squash_input_denormal(b, status);
158142c2
FB
3376
3377 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3378 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3379 ) {
af39bc8c
AM
3380 if (float32_is_signaling_nan(a, status)
3381 || float32_is_signaling_nan(b, status)) {
ff32e16e 3382 float_raise(float_flag_invalid, status);
158142c2
FB
3383 }
3384 return 0;
3385 }
3386 aSign = extractFloat32Sign( a );
3387 bSign = extractFloat32Sign( b );
f090c9d4
PB
3388 av = float32_val(a);
3389 bv = float32_val(b);
bb98fe42 3390 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3391 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3392
3393}
3394
3395/*----------------------------------------------------------------------------
3396| Returns 1 if the single-precision floating-point value `a' is less than
3397| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3398| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3399| Standard for Binary Floating-Point Arithmetic.
3400*----------------------------------------------------------------------------*/
3401
e5a41ffa 3402int float32_lt_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
3403{
3404 flag aSign, bSign;
bb98fe42 3405 uint32_t av, bv;
ff32e16e
PM
3406 a = float32_squash_input_denormal(a, status);
3407 b = float32_squash_input_denormal(b, status);
158142c2
FB
3408
3409 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3410 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3411 ) {
af39bc8c
AM
3412 if (float32_is_signaling_nan(a, status)
3413 || float32_is_signaling_nan(b, status)) {
ff32e16e 3414 float_raise(float_flag_invalid, status);
158142c2
FB
3415 }
3416 return 0;
3417 }
3418 aSign = extractFloat32Sign( a );
3419 bSign = extractFloat32Sign( b );
f090c9d4
PB
3420 av = float32_val(a);
3421 bv = float32_val(b);
bb98fe42 3422 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3423 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3424
3425}
3426
67b7861d
AJ
3427/*----------------------------------------------------------------------------
3428| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3429| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3430| comparison is performed according to the IEC/IEEE Standard for Binary
3431| Floating-Point Arithmetic.
3432*----------------------------------------------------------------------------*/
3433
e5a41ffa 3434int float32_unordered_quiet(float32 a, float32 b, float_status *status)
67b7861d 3435{
ff32e16e
PM
3436 a = float32_squash_input_denormal(a, status);
3437 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
3438
3439 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3440 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3441 ) {
af39bc8c
AM
3442 if (float32_is_signaling_nan(a, status)
3443 || float32_is_signaling_nan(b, status)) {
ff32e16e 3444 float_raise(float_flag_invalid, status);
67b7861d
AJ
3445 }
3446 return 1;
3447 }
3448 return 0;
3449}
3450
158142c2
FB
3451/*----------------------------------------------------------------------------
3452| Returns the result of converting the double-precision floating-point value
3453| `a' to the 32-bit two's complement integer format. The conversion is
3454| performed according to the IEC/IEEE Standard for Binary Floating-Point
3455| Arithmetic---which means in particular that the conversion is rounded
3456| according to the current rounding mode. If `a' is a NaN, the largest
3457| positive integer is returned. Otherwise, if the conversion overflows, the
3458| largest integer with the same sign as `a' is returned.
3459*----------------------------------------------------------------------------*/
3460
f4014512 3461int32_t float64_to_int32(float64 a, float_status *status)
158142c2
FB
3462{
3463 flag aSign;
0c48262d 3464 int aExp;
07d792d2 3465 int shiftCount;
bb98fe42 3466 uint64_t aSig;
ff32e16e 3467 a = float64_squash_input_denormal(a, status);
158142c2
FB
3468
3469 aSig = extractFloat64Frac( a );
3470 aExp = extractFloat64Exp( a );
3471 aSign = extractFloat64Sign( a );
3472 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3473 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3474 shiftCount = 0x42C - aExp;
3475 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 3476 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
3477
3478}
3479
3480/*----------------------------------------------------------------------------
3481| Returns the result of converting the double-precision floating-point value
3482| `a' to the 32-bit two's complement integer format. The conversion is
3483| performed according to the IEC/IEEE Standard for Binary Floating-Point
3484| Arithmetic, except that the conversion is always rounded toward zero.
3485| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3486| the conversion overflows, the largest integer with the same sign as `a' is
3487| returned.
3488*----------------------------------------------------------------------------*/
3489
f4014512 3490int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
158142c2
FB
3491{
3492 flag aSign;
0c48262d 3493 int aExp;
07d792d2 3494 int shiftCount;
bb98fe42 3495 uint64_t aSig, savedASig;
b3a6a2e0 3496 int32_t z;
ff32e16e 3497 a = float64_squash_input_denormal(a, status);
158142c2
FB
3498
3499 aSig = extractFloat64Frac( a );
3500 aExp = extractFloat64Exp( a );
3501 aSign = extractFloat64Sign( a );
3502 if ( 0x41E < aExp ) {
3503 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3504 goto invalid;
3505 }
3506 else if ( aExp < 0x3FF ) {
a2f2d288
PM
3507 if (aExp || aSig) {
3508 status->float_exception_flags |= float_flag_inexact;
3509 }
158142c2
FB
3510 return 0;
3511 }
3512 aSig |= LIT64( 0x0010000000000000 );
3513 shiftCount = 0x433 - aExp;
3514 savedASig = aSig;
3515 aSig >>= shiftCount;
3516 z = aSig;
3517 if ( aSign ) z = - z;
3518 if ( ( z < 0 ) ^ aSign ) {
3519 invalid:
ff32e16e 3520 float_raise(float_flag_invalid, status);
bb98fe42 3521 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
3522 }
3523 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 3524 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
3525 }
3526 return z;
3527
3528}
3529
cbcef455
PM
3530/*----------------------------------------------------------------------------
3531| Returns the result of converting the double-precision floating-point value
3532| `a' to the 16-bit two's complement integer format. The conversion is
3533| performed according to the IEC/IEEE Standard for Binary Floating-Point
3534| Arithmetic, except that the conversion is always rounded toward zero.
3535| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3536| the conversion overflows, the largest integer with the same sign as `a' is
3537| returned.
3538*----------------------------------------------------------------------------*/
3539
0bb721d7 3540int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
3541{
3542 flag aSign;
0c48262d 3543 int aExp;
07d792d2 3544 int shiftCount;
bb98fe42 3545 uint64_t aSig, savedASig;
f4014512 3546 int32_t z;
cbcef455
PM
3547
3548 aSig = extractFloat64Frac( a );
3549 aExp = extractFloat64Exp( a );
3550 aSign = extractFloat64Sign( a );
3551 if ( 0x40E < aExp ) {
3552 if ( ( aExp == 0x7FF ) && aSig ) {
3553 aSign = 0;
3554 }
3555 goto invalid;
3556 }
3557 else if ( aExp < 0x3FF ) {
3558 if ( aExp || aSig ) {
a2f2d288 3559 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
3560 }
3561 return 0;
3562 }
3563 aSig |= LIT64( 0x0010000000000000 );
3564 shiftCount = 0x433 - aExp;
3565 savedASig = aSig;
3566 aSig >>= shiftCount;
3567 z = aSig;
3568 if ( aSign ) {
3569 z = - z;
3570 }
3571 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3572 invalid:
ff32e16e 3573 float_raise(float_flag_invalid, status);
bb98fe42 3574 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
3575 }
3576 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 3577 status->float_exception_flags |= float_flag_inexact;
cbcef455
PM
3578 }
3579 return z;
3580}
3581
158142c2
FB
3582/*----------------------------------------------------------------------------
3583| Returns the result of converting the double-precision floating-point value
3584| `a' to the 64-bit two's complement integer format. The conversion is
3585| performed according to the IEC/IEEE Standard for Binary Floating-Point
3586| Arithmetic---which means in particular that the conversion is rounded
3587| according to the current rounding mode. If `a' is a NaN, the largest
3588| positive integer is returned. Otherwise, if the conversion overflows, the
3589| largest integer with the same sign as `a' is returned.
3590*----------------------------------------------------------------------------*/
3591
f42c2224 3592int64_t float64_to_int64(float64 a, float_status *status)
158142c2
FB
3593{
3594 flag aSign;
0c48262d 3595 int aExp;
07d792d2 3596 int shiftCount;
bb98fe42 3597 uint64_t aSig, aSigExtra;
ff32e16e 3598 a = float64_squash_input_denormal(a, status);
158142c2
FB
3599
3600 aSig = extractFloat64Frac( a );
3601 aExp = extractFloat64Exp( a );
3602 aSign = extractFloat64Sign( a );
3603 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3604 shiftCount = 0x433 - aExp;
3605 if ( shiftCount <= 0 ) {
3606 if ( 0x43E < aExp ) {
ff32e16e 3607 float_raise(float_flag_invalid, status);
158142c2
FB
3608 if ( ! aSign
3609 || ( ( aExp == 0x7FF )
3610 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3611 ) {
3612 return LIT64( 0x7FFFFFFFFFFFFFFF );
3613 }
bb98fe42 3614 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3615 }
3616 aSigExtra = 0;
3617 aSig <<= - shiftCount;
3618 }
3619 else {
3620 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3621 }
ff32e16e 3622 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
3623
3624}
3625
3626/*----------------------------------------------------------------------------
3627| Returns the result of converting the double-precision floating-point value
3628| `a' to the 64-bit two's complement integer format. The conversion is
3629| performed according to the IEC/IEEE Standard for Binary Floating-Point
3630| Arithmetic, except that the conversion is always rounded toward zero.
3631| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3632| the conversion overflows, the largest integer with the same sign as `a' is
3633| returned.
3634*----------------------------------------------------------------------------*/
3635
f42c2224 3636int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
158142c2
FB
3637{
3638 flag aSign;
0c48262d 3639 int aExp;
07d792d2 3640 int shiftCount;
bb98fe42 3641 uint64_t aSig;
f42c2224 3642 int64_t z;
ff32e16e 3643 a = float64_squash_input_denormal(a, status);
158142c2
FB
3644
3645 aSig = extractFloat64Frac( a );
3646 aExp = extractFloat64Exp( a );
3647 aSign = extractFloat64Sign( a );
3648 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3649 shiftCount = aExp - 0x433;
3650 if ( 0 <= shiftCount ) {
3651 if ( 0x43E <= aExp ) {
f090c9d4 3652 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
ff32e16e 3653 float_raise(float_flag_invalid, status);
158142c2
FB
3654 if ( ! aSign
3655 || ( ( aExp == 0x7FF )
3656 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3657 ) {
3658 return LIT64( 0x7FFFFFFFFFFFFFFF );
3659 }
3660 }
bb98fe42 3661 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3662 }
3663 z = aSig<<shiftCount;
3664 }
3665 else {
3666 if ( aExp < 0x3FE ) {
a2f2d288
PM
3667 if (aExp | aSig) {
3668 status->float_exception_flags |= float_flag_inexact;
3669 }
158142c2
FB
3670 return 0;
3671 }
3672 z = aSig>>( - shiftCount );
bb98fe42 3673 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 3674 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
3675 }
3676 }
3677 if ( aSign ) z = - z;
3678 return z;
3679
3680}
3681
3682/*----------------------------------------------------------------------------
3683| Returns the result of converting the double-precision floating-point value
3684| `a' to the single-precision floating-point format. The conversion is
3685| performed according to the IEC/IEEE Standard for Binary Floating-Point
3686| Arithmetic.
3687*----------------------------------------------------------------------------*/
3688
e5a41ffa 3689float32 float64_to_float32(float64 a, float_status *status)
158142c2
FB
3690{
3691 flag aSign;
0c48262d 3692 int aExp;
bb98fe42
AF
3693 uint64_t aSig;
3694 uint32_t zSig;
ff32e16e 3695 a = float64_squash_input_denormal(a, status);
158142c2
FB
3696
3697 aSig = extractFloat64Frac( a );
3698 aExp = extractFloat64Exp( a );
3699 aSign = extractFloat64Sign( a );
3700 if ( aExp == 0x7FF ) {
ff32e16e
PM
3701 if (aSig) {
3702 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3703 }
158142c2
FB
3704 return packFloat32( aSign, 0xFF, 0 );
3705 }
3706 shift64RightJamming( aSig, 22, &aSig );
3707 zSig = aSig;
3708 if ( aExp || zSig ) {
3709 zSig |= 0x40000000;
3710 aExp -= 0x381;
3711 }
ff32e16e 3712 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
3713
3714}
3715
60011498
PB
3716
3717/*----------------------------------------------------------------------------
3718| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3719| half-precision floating-point value, returning the result. After being
3720| shifted into the proper positions, the three fields are simply added
3721| together to form the result. This means that any integer portion of `zSig'
3722| will be added into the exponent. Since a properly normalized significand
3723| will have an integer portion equal to 1, the `zExp' input should be 1 less
3724| than the desired result exponent whenever `zSig' is a complete, normalized
3725| significand.
3726*----------------------------------------------------------------------------*/
0c48262d 3727static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
60011498 3728{
bb4d4bb3 3729 return make_float16(
bb98fe42 3730 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3731}
3732
c4a1c5e7
PM
3733/*----------------------------------------------------------------------------
3734| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3735| and significand `zSig', and returns the proper half-precision floating-
3736| point value corresponding to the abstract input. Ordinarily, the abstract
3737| value is simply rounded and packed into the half-precision format, with
3738| the inexact exception raised if the abstract input cannot be represented
3739| exactly. However, if the abstract value is too large, the overflow and
3740| inexact exceptions are raised and an infinity or maximal finite value is
3741| returned. If the abstract value is too small, the input value is rounded to
3742| a subnormal number, and the underflow and inexact exceptions are raised if
3743| the abstract input cannot be represented exactly as a subnormal half-
3744| precision floating-point number.
3745| The `ieee' flag indicates whether to use IEEE standard half precision, or
3746| ARM-style "alternative representation", which omits the NaN and Inf
3747| encodings in order to raise the maximum representable exponent by one.
3748| The input significand `zSig' has its binary point between bits 22
3749| and 23, which is 13 bits to the left of the usual location. This shifted
3750| significand must be normalized or smaller. If `zSig' is not normalized,
3751| `zExp' must be 0; in that case, the result returned is a subnormal number,
3752| and it must not require rounding. In the usual case that `zSig' is
3753| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3754| Note the slightly odd position of the binary point in zSig compared with the
3755| other roundAndPackFloat functions. This should probably be fixed if we
3756| need to implement more float16 routines than just conversion.
3757| The handling of underflow and overflow follows the IEC/IEEE Standard for
3758| Binary Floating-Point Arithmetic.
3759*----------------------------------------------------------------------------*/
3760
0c48262d 3761static float16 roundAndPackFloat16(flag zSign, int zExp,
e5a41ffa
PM
3762 uint32_t zSig, flag ieee,
3763 float_status *status)
c4a1c5e7
PM
3764{
3765 int maxexp = ieee ? 29 : 30;
3766 uint32_t mask;
3767 uint32_t increment;
c4a1c5e7
PM
3768 bool rounding_bumps_exp;
3769 bool is_tiny = false;
3770
3771 /* Calculate the mask of bits of the mantissa which are not
3772 * representable in half-precision and will be lost.
3773 */
3774 if (zExp < 1) {
3775 /* Will be denormal in halfprec */
3776 mask = 0x00ffffff;
3777 if (zExp >= -11) {
3778 mask >>= 11 + zExp;
3779 }
3780 } else {
3781 /* Normal number in halfprec */
3782 mask = 0x00001fff;
3783 }
3784
a2f2d288 3785 switch (status->float_rounding_mode) {
c4a1c5e7
PM
3786 case float_round_nearest_even:
3787 increment = (mask + 1) >> 1;
3788 if ((zSig & mask) == increment) {
3789 increment = zSig & (increment << 1);
3790 }
3791 break;
f9288a76
PM
3792 case float_round_ties_away:
3793 increment = (mask + 1) >> 1;
3794 break;
c4a1c5e7
PM
3795 case float_round_up:
3796 increment = zSign ? 0 : mask;
3797 break;
3798 case float_round_down:
3799 increment = zSign ? mask : 0;
3800 break;
3801 default: /* round_to_zero */
3802 increment = 0;
3803 break;
3804 }
3805
3806 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3807
3808 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3809 if (ieee) {
ff32e16e 3810 float_raise(float_flag_overflow | float_flag_inexact, status);
c4a1c5e7
PM
3811 return packFloat16(zSign, 0x1f, 0);
3812 } else {
ff32e16e 3813 float_raise(float_flag_invalid, status);
c4a1c5e7
PM
3814 return packFloat16(zSign, 0x1f, 0x3ff);
3815 }
3816 }
3817
3818 if (zExp < 0) {
3819 /* Note that flush-to-zero does not affect half-precision results */
3820 is_tiny =
a2f2d288 3821 (status->float_detect_tininess == float_tininess_before_rounding)
c4a1c5e7
PM
3822 || (zExp < -1)
3823 || (!rounding_bumps_exp);
3824 }
3825 if (zSig & mask) {
ff32e16e 3826 float_raise(float_flag_inexact, status);
c4a1c5e7 3827 if (is_tiny) {
ff32e16e 3828 float_raise(float_flag_underflow, status);
c4a1c5e7
PM
3829 }
3830 }
3831
3832 zSig += increment;
3833 if (rounding_bumps_exp) {
3834 zSig >>= 1;
3835 zExp++;
3836 }
3837
3838 if (zExp < -10) {
3839 return packFloat16(zSign, 0, 0);
3840 }
3841 if (zExp < 0) {
3842 zSig >>= -zExp;
3843 zExp = 0;
3844 }
3845 return packFloat16(zSign, zExp, zSig >> 13);
3846}
3847
210cbd49
AB
3848/*----------------------------------------------------------------------------
3849| If `a' is denormal and we are in flush-to-zero mode then set the
3850| input-denormal exception and return zero. Otherwise just return the value.
3851*----------------------------------------------------------------------------*/
3852float16 float16_squash_input_denormal(float16 a, float_status *status)
3853{
3854 if (status->flush_inputs_to_zero) {
3855 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3856 float_raise(float_flag_input_denormal, status);
3857 return make_float16(float16_val(a) & 0x8000);
3858 }
3859 }
3860 return a;
3861}
3862
/* Normalizes the subnormal half-precision significand `aSig': it is shifted
 * left by (leading-zero count - 21), which places its top set bit at bit 10.
 * The shifted significand is stored through `zSigPtr' and the matching
 * exponent (1 - shift) through `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3870
60011498
PB
3871/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3872 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3 3873
e5a41ffa 3874float32 float16_to_float32(float16 a, flag ieee, float_status *status)
60011498
PB
3875{
3876 flag aSign;
0c48262d 3877 int aExp;
bb98fe42 3878 uint32_t aSig;
60011498 3879
bb4d4bb3
PM
3880 aSign = extractFloat16Sign(a);
3881 aExp = extractFloat16Exp(a);
3882 aSig = extractFloat16Frac(a);
60011498
PB
3883
3884 if (aExp == 0x1f && ieee) {
3885 if (aSig) {
ff32e16e 3886 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
60011498 3887 }
4be8eeac 3888 return packFloat32(aSign, 0xff, 0);
60011498
PB
3889 }
3890 if (aExp == 0) {
60011498
PB
3891 if (aSig == 0) {
3892 return packFloat32(aSign, 0, 0);
3893 }
3894
c4a1c5e7
PM
3895 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3896 aExp--;
60011498
PB
3897 }
3898 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3899}
3900
e5a41ffa 3901float16 float32_to_float16(float32 a, flag ieee, float_status *status)
60011498
PB
3902{
3903 flag aSign;
0c48262d 3904 int aExp;
bb98fe42 3905 uint32_t aSig;
38970efa 3906
ff32e16e 3907 a = float32_squash_input_denormal(a, status);
60011498
PB
3908
3909 aSig = extractFloat32Frac( a );
3910 aExp = extractFloat32Exp( a );
3911 aSign = extractFloat32Sign( a );
3912 if ( aExp == 0xFF ) {
3913 if (aSig) {
600e30d2 3914 /* Input is a NaN */
600e30d2 3915 if (!ieee) {
ff32e16e 3916 float_raise(float_flag_invalid, status);
600e30d2
PM
3917 return packFloat16(aSign, 0, 0);
3918 }
38970efa 3919 return commonNaNToFloat16(
ff32e16e 3920 float32ToCommonNaN(a, status), status);
60011498 3921 }
600e30d2
PM
3922 /* Infinity */
3923 if (!ieee) {
ff32e16e 3924 float_raise(float_flag_invalid, status);
600e30d2
PM
3925 return packFloat16(aSign, 0x1f, 0x3ff);
3926 }
3927 return packFloat16(aSign, 0x1f, 0);
60011498 3928 }
600e30d2 3929 if (aExp == 0 && aSig == 0) {
60011498
PB
3930 return packFloat16(aSign, 0, 0);
3931 }
38970efa
PM
3932 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3933 * even if the input is denormal; however this is harmless because
3934 * the largest possible single-precision denormal is still smaller
3935 * than the smallest representable half-precision denormal, and so we
3936 * will end up ignoring aSig and returning via the "always return zero"
3937 * codepath.
3938 */
60011498 3939 aSig |= 0x00800000;
c4a1c5e7 3940 aExp -= 0x71;
60011498 3941
ff32e16e 3942 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
60011498
PB
3943}
3944
e5a41ffa 3945float64 float16_to_float64(float16 a, flag ieee, float_status *status)
14c9a07e
PM
3946{
3947 flag aSign;
0c48262d 3948 int aExp;
14c9a07e
PM
3949 uint32_t aSig;
3950
3951 aSign = extractFloat16Sign(a);
3952 aExp = extractFloat16Exp(a);
3953 aSig = extractFloat16Frac(a);
3954
3955 if (aExp == 0x1f && ieee) {
3956 if (aSig) {
3957 return commonNaNToFloat64(
ff32e16e 3958 float16ToCommonNaN(a, status), status);
14c9a07e
PM
3959 }
3960 return packFloat64(aSign, 0x7ff, 0);
3961 }
3962 if (aExp == 0) {
3963 if (aSig == 0) {
3964 return packFloat64(aSign, 0, 0);
3965 }
3966
3967 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3968 aExp--;
3969 }
3970 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3971}
3972
e5a41ffa 3973float16 float64_to_float16(float64 a, flag ieee, float_status *status)
14c9a07e
PM
3974{
3975 flag aSign;
0c48262d 3976 int aExp;
14c9a07e
PM
3977 uint64_t aSig;
3978 uint32_t zSig;
3979
ff32e16e 3980 a = float64_squash_input_denormal(a, status);
14c9a07e
PM
3981
3982 aSig = extractFloat64Frac(a);
3983 aExp = extractFloat64Exp(a);
3984 aSign = extractFloat64Sign(a);
3985 if (aExp == 0x7FF) {
3986 if (aSig) {
3987 /* Input is a NaN */
3988 if (!ieee) {
ff32e16e 3989 float_raise(float_flag_invalid, status);
14c9a07e
PM
3990 return packFloat16(aSign, 0, 0);
3991 }
3992 return commonNaNToFloat16(
ff32e16e 3993 float64ToCommonNaN(a, status), status);
14c9a07e
PM
3994 }
3995 /* Infinity */
3996 if (!ieee) {
ff32e16e 3997 float_raise(float_flag_invalid, status);
14c9a07e
PM
3998 return packFloat16(aSign, 0x1f, 0x3ff);
3999 }
4000 return packFloat16(aSign, 0x1f, 0);
4001 }
4002 shift64RightJamming(aSig, 29, &aSig);
4003 zSig = aSig;
4004 if (aExp == 0 && zSig == 0) {
4005 return packFloat16(aSign, 0, 0);
4006 }
4007 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4008 * even if the input is denormal; however this is harmless because
4009 * the largest possible single-precision denormal is still smaller
4010 * than the smallest representable half-precision denormal, and so we
4011 * will end up ignoring aSig and returning via the "always return zero"
4012 * codepath.
4013 */
4014 zSig |= 0x00800000;
4015 aExp -= 0x3F1;
4016
ff32e16e 4017 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
14c9a07e
PM
4018}
4019
158142c2
FB
4020/*----------------------------------------------------------------------------
4021| Returns the result of converting the double-precision floating-point value
4022| `a' to the extended double-precision floating-point format. The conversion
4023| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4024| Arithmetic.
4025*----------------------------------------------------------------------------*/
4026
e5a41ffa 4027floatx80 float64_to_floatx80(float64 a, float_status *status)
158142c2
FB
4028{
4029 flag aSign;
0c48262d 4030 int aExp;
bb98fe42 4031 uint64_t aSig;
158142c2 4032
ff32e16e 4033 a = float64_squash_input_denormal(a, status);
158142c2
FB
4034 aSig = extractFloat64Frac( a );
4035 aExp = extractFloat64Exp( a );
4036 aSign = extractFloat64Sign( a );
4037 if ( aExp == 0x7FF ) {
ff32e16e
PM
4038 if (aSig) {
4039 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4040 }
158142c2
FB
4041 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4042 }
4043 if ( aExp == 0 ) {
4044 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4045 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4046 }
4047 return
4048 packFloatx80(
4049 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4050
4051}
4052
158142c2
FB
4053/*----------------------------------------------------------------------------
4054| Returns the result of converting the double-precision floating-point value
4055| `a' to the quadruple-precision floating-point format. The conversion is
4056| performed according to the IEC/IEEE Standard for Binary Floating-Point
4057| Arithmetic.
4058*----------------------------------------------------------------------------*/
4059
e5a41ffa 4060float128 float64_to_float128(float64 a, float_status *status)
158142c2
FB
4061{
4062 flag aSign;
0c48262d 4063 int aExp;
bb98fe42 4064 uint64_t aSig, zSig0, zSig1;
158142c2 4065
ff32e16e 4066 a = float64_squash_input_denormal(a, status);
158142c2
FB
4067 aSig = extractFloat64Frac( a );
4068 aExp = extractFloat64Exp( a );
4069 aSign = extractFloat64Sign( a );
4070 if ( aExp == 0x7FF ) {
ff32e16e
PM
4071 if (aSig) {
4072 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4073 }
158142c2
FB
4074 return packFloat128( aSign, 0x7FFF, 0, 0 );
4075 }
4076 if ( aExp == 0 ) {
4077 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4078 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4079 --aExp;
4080 }
4081 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4082 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4083
4084}
4085
158142c2
FB
4086/*----------------------------------------------------------------------------
4087| Rounds the double-precision floating-point value `a' to an integer, and
4088| returns the result as a double-precision floating-point value. The
4089| operation is performed according to the IEC/IEEE Standard for Binary
4090| Floating-Point Arithmetic.
4091*----------------------------------------------------------------------------*/
4092
e5a41ffa 4093float64 float64_round_to_int(float64 a, float_status *status)
158142c2
FB
4094{
4095 flag aSign;
0c48262d 4096 int aExp;
bb98fe42 4097 uint64_t lastBitMask, roundBitsMask;
bb98fe42 4098 uint64_t z;
ff32e16e 4099 a = float64_squash_input_denormal(a, status);
158142c2
FB
4100
4101 aExp = extractFloat64Exp( a );
4102 if ( 0x433 <= aExp ) {
4103 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
ff32e16e 4104 return propagateFloat64NaN(a, a, status);
158142c2
FB
4105 }
4106 return a;
4107 }
4108 if ( aExp < 0x3FF ) {
bb98fe42 4109 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
a2f2d288 4110 status->float_exception_flags |= float_flag_inexact;
158142c2 4111 aSign = extractFloat64Sign( a );
a2f2d288 4112 switch (status->float_rounding_mode) {
158142c2
FB
4113 case float_round_nearest_even:
4114 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
4115 return packFloat64( aSign, 0x3FF, 0 );
4116 }
4117 break;
f9288a76
PM
4118 case float_round_ties_away:
4119 if (aExp == 0x3FE) {
4120 return packFloat64(aSign, 0x3ff, 0);
4121 }
4122 break;
158142c2 4123 case float_round_down:
f090c9d4 4124 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 4125 case float_round_up:
f090c9d4
PB
4126 return make_float64(
4127 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
4128 }
4129 return packFloat64( aSign, 0, 0 );
4130 }
4131 lastBitMask = 1;
4132 lastBitMask <<= 0x433 - aExp;
4133 roundBitsMask = lastBitMask - 1;
f090c9d4 4134 z = float64_val(a);
a2f2d288 4135 switch (status->float_rounding_mode) {
dc355b76
PM
4136 case float_round_nearest_even:
4137 z += lastBitMask >> 1;
4138 if ((z & roundBitsMask) == 0) {
4139 z &= ~lastBitMask;
4140 }
4141 break;
f9288a76
PM
4142 case float_round_ties_away:
4143 z += lastBitMask >> 1;
4144 break;
dc355b76
PM
4145 case float_round_to_zero:
4146 break;
4147 case float_round_up:
4148 if (!extractFloat64Sign(make_float64(z))) {
4149 z += roundBitsMask;
4150 }
4151 break;
4152 case float_round_down:
4153 if (extractFloat64Sign(make_float64(z))) {
158142c2
FB
4154 z += roundBitsMask;
4155 }
dc355b76
PM
4156 break;
4157 default:
4158 abort();
158142c2
FB
4159 }
4160 z &= ~ roundBitsMask;
a2f2d288
PM
4161 if (z != float64_val(a)) {
4162 status->float_exception_flags |= float_flag_inexact;
4163 }
f090c9d4 4164 return make_float64(z);
158142c2
FB
4165
4166}
4167
e5a41ffa 4168float64 float64_trunc_to_int(float64 a, float_status *status)
e6e5906b
PB
4169{
4170 int oldmode;
4171 float64 res;
a2f2d288
PM
4172 oldmode = status->float_rounding_mode;
4173 status->float_rounding_mode = float_round_to_zero;
ff32e16e 4174 res = float64_round_to_int(a, status);
a2f2d288 4175 status->float_rounding_mode = oldmode;
e6e5906b
PB
4176 return res;
4177}
4178
158142c2
FB
4179
4180/*----------------------------------------------------------------------------
4181| Returns the remainder of the double-precision floating-point value `a'
4182| with respect to the corresponding value `b'. The operation is performed
4183| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4184*----------------------------------------------------------------------------*/
4185
e5a41ffa 4186float64 float64_rem(float64 a, float64 b, float_status *status)
158142c2 4187{
ed086f3d 4188 flag aSign, zSign;
0c48262d 4189 int aExp, bExp, expDiff;
bb98fe42
AF
4190 uint64_t aSig, bSig;
4191 uint64_t q, alternateASig;
4192 int64_t sigMean;
158142c2 4193
ff32e16e
PM
4194 a = float64_squash_input_denormal(a, status);
4195 b = float64_squash_input_denormal(b, status);
158142c2
FB
4196 aSig = extractFloat64Frac( a );
4197 aExp = extractFloat64Exp( a );
4198 aSign = extractFloat64Sign( a );
4199 bSig = extractFloat64Frac( b );
4200 bExp = extractFloat64Exp( b );
158142c2
FB
4201 if ( aExp == 0x7FF ) {
4202 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 4203 return propagateFloat64NaN(a, b, status);
158142c2 4204 }
ff32e16e 4205 float_raise(float_flag_invalid, status);
af39bc8c 4206 return float64_default_nan(status);
158142c2
FB
4207 }
4208 if ( bExp == 0x7FF ) {
ff32e16e
PM
4209 if (bSig) {
4210 return propagateFloat64NaN(a, b, status);
4211 }
158142c2
FB
4212 return a;
4213 }
4214 if ( bExp == 0 ) {
4215 if ( bSig == 0 ) {
ff32e16e 4216 float_raise(float_flag_invalid, status);
af39bc8c 4217 return float64_default_nan(status);
158142c2
FB
4218 }
4219 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4220 }
4221 if ( aExp == 0 ) {
4222 if ( aSig == 0 ) return a;
4223 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4224 }
4225 expDiff = aExp - bExp;
4226 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4227 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4228 if ( expDiff < 0 ) {
4229 if ( expDiff < -1 ) return a;
4230 aSig >>= 1;
4231 }
4232 q = ( bSig <= aSig );
4233 if ( q ) aSig -= bSig;
4234 expDiff -= 64;
4235 while ( 0 < expDiff ) {
4236 q = estimateDiv128To64( aSig, 0, bSig );
4237 q = ( 2 < q ) ? q - 2 : 0;
4238 aSig = - ( ( bSig>>2 ) * q );
4239 expDiff -= 62;
4240 }
4241 expDiff += 64;
4242 if ( 0 < expDiff ) {
4243 q = estimateDiv128To64( aSig, 0, bSig );
4244 q = ( 2 < q ) ? q - 2 : 0;
4245 q >>= 64 - expDiff;
4246 bSig >>= 2;
4247 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4248 }
4249 else {
4250 aSig >>= 2;
4251 bSig >>= 2;
4252 }
4253 do {
4254 alternateASig = aSig;
4255 ++q;
4256 aSig -= bSig;
bb98fe42 4257 } while ( 0 <= (int64_t) aSig );
158142c2
FB
4258 sigMean = aSig + alternateASig;
4259 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4260 aSig = alternateASig;
4261 }
bb98fe42 4262 zSign = ( (int64_t) aSig < 0 );
158142c2 4263 if ( zSign ) aSig = - aSig;
ff32e16e 4264 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
4265
4266}
4267
369be8f6
PM
4268/*----------------------------------------------------------------------------
4269| Returns the result of multiplying the double-precision floating-point values
4270| `a' and `b' then adding 'c', with no intermediate rounding step after the
4271| multiplication. The operation is performed according to the IEC/IEEE
4272| Standard for Binary Floating-Point Arithmetic 754-2008.
4273| The flags argument allows the caller to select negation of the
4274| addend, the intermediate product, or the final result. (The difference
4275| between this and having the caller do a separate negation is that negating
4276| externally will flip the sign bit on NaNs.)
4277*----------------------------------------------------------------------------*/
4278
e5a41ffa
PM
4279float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4280 float_status *status)
369be8f6
PM
4281{
4282 flag aSign, bSign, cSign, zSign;
0c48262d 4283 int aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
4284 uint64_t aSig, bSig, cSig;
4285 flag pInf, pZero, pSign;
4286 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4287 int shiftcount;
4288 flag signflip, infzero;
4289
ff32e16e
PM
4290 a = float64_squash_input_denormal(a, status);
4291 b = float64_squash_input_denormal(b, status);
4292 c = float64_squash_input_denormal(c, status);
369be8f6
PM
4293 aSig = extractFloat64Frac(a);
4294 aExp = extractFloat64Exp(a);
4295 aSign = extractFloat64Sign(a);
4296 bSig = extractFloat64Frac(b);
4297 bExp = extractFloat64Exp(b);
4298 bSign = extractFloat64Sign(b);
4299 cSig = extractFloat64Frac(c);
4300 cExp = extractFloat64Exp(c);
4301 cSign = extractFloat64Sign(c);
4302
4303 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4304 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4305
4306 /* It is implementation-defined whether the cases of (0,inf,qnan)
4307 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4308 * they return if they do), so we have to hand this information
4309 * off to the target-specific pick-a-NaN routine.
4310 */
4311 if (((aExp == 0x7ff) && aSig) ||
4312 ((bExp == 0x7ff) && bSig) ||
4313 ((cExp == 0x7ff) && cSig)) {
ff32e16e 4314 return propagateFloat64MulAddNaN(a, b, c, infzero, status);
369be8f6
PM
4315 }
4316
4317 if (infzero) {
ff32e16e 4318 float_raise(float_flag_invalid, status);
af39bc8c 4319 return float64_default_nan(status);
369be8f6
PM
4320 }
4321
4322 if (flags & float_muladd_negate_c) {
4323 cSign ^= 1;
4324 }
4325
4326 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4327
4328 /* Work out the sign and type of the product */
4329 pSign = aSign ^ bSign;
4330 if (flags & float_muladd_negate_product) {
4331 pSign ^= 1;
4332 }
4333 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4334 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4335
4336 if (cExp == 0x7ff) {
4337 if (pInf && (pSign ^ cSign)) {
4338 /* addition of opposite-signed infinities => InvalidOperation */
ff32e16e 4339 float_raise(float_flag_invalid, status);
af39bc8c 4340 return float64_default_nan(status);
369be8f6
PM
4341 }
4342 /* Otherwise generate an infinity of the same sign */
4343 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4344 }
4345
4346 if (pInf) {
4347 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4348 }
4349
4350 if (pZero) {
4351 if (cExp == 0) {
4352 if (cSig == 0) {
4353 /* Adding two exact zeroes */
4354 if (pSign == cSign) {
4355 zSign = pSign;
a2f2d288 4356 } else if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
4357 zSign = 1;
4358 } else {
4359 zSign = 0;
4360 }
4361 return packFloat64(zSign ^ signflip, 0, 0);
4362 }
4363 /* Exact zero plus a denorm */
a2f2d288 4364 if (status->flush_to_zero) {
ff32e16e 4365 float_raise(float_flag_output_denormal, status);
369be8f6
PM
4366 return packFloat64(cSign ^ signflip, 0, 0);
4367 }
4368 }
4369 /* Zero plus something non-zero : just return the something */
67d43538
PM
4370 if (flags & float_muladd_halve_result) {
4371 if (cExp == 0) {
4372 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4373 }
4374 /* Subtract one to halve, and one again because roundAndPackFloat64
4375 * wants one less than the true exponent.
4376 */
4377 cExp -= 2;
4378 cSig = (cSig | 0x0010000000000000ULL) << 10;
ff32e16e 4379 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
67d43538 4380 }
a6e7c184 4381 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
4382 }
4383
4384 if (aExp == 0) {
4385 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4386 }
4387 if (bExp == 0) {
4388 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4389 }
4390
4391 /* Calculate the actual result a * b + c */
4392
4393 /* Multiply first; this is easy. */
4394 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4395 * because we want the true exponent, not the "one-less-than"
4396 * flavour that roundAndPackFloat64() takes.
4397 */
4398 pExp = aExp + bExp - 0x3fe;
4399 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4400 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4401 mul64To128(aSig, bSig, &pSig0, &pSig1);
4402 if ((int64_t)(pSig0 << 1) >= 0) {
4403 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4404 pExp--;
4405 }
4406
4407 zSign = pSign ^ signflip;
4408
4409 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4410 * bit in position 126.
4411 */
4412 if (cExp == 0) {
4413 if (!cSig) {
4414 /* Throw out the special case of c being an exact zero now */
4415 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
67d43538
PM
4416 if (flags & float_muladd_halve_result) {
4417 pExp--;
4418 }
369be8f6 4419 return roundAndPackFloat64(zSign, pExp - 1,
ff32e16e 4420 pSig1, status);
369be8f6
PM
4421 }
4422 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4423 }
4424
4425 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4426 * significand of the addend, with the explicit bit in position 126.
4427 */
4428 cSig0 = cSig << (126 - 64 - 52);
4429 cSig1 = 0;
4430 cSig0 |= LIT64(0x4000000000000000);
4431 expDiff = pExp - cExp;
4432
4433 if (pSign == cSign) {
4434 /* Addition */
4435 if (expDiff > 0) {
4436 /* scale c to match p */
4437 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4438 zExp = pExp;
4439 } else if (expDiff < 0) {
4440 /* scale p to match c */
4441 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4442 zExp = cExp;
4443 } else {
4444 /* no scaling needed */
4445 zExp = cExp;
4446 }
4447 /* Add significands and make sure explicit bit ends up in posn 126 */
4448 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4449 if ((int64_t)zSig0 < 0) {
4450 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4451 } else {
4452 zExp--;
4453 }
4454 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
67d43538
PM
4455 if (flags & float_muladd_halve_result) {
4456 zExp--;
4457 }
ff32e16e 4458 return roundAndPackFloat64(zSign, zExp, zSig1, status);
369be8f6
PM
4459 } else {
4460 /* Subtraction */
4461 if (expDiff > 0) {
4462 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4463 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4464 zExp = pExp;
4465 } else if (expDiff < 0) {
4466 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4467 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4468 zExp = cExp;
4469 zSign ^= 1;
4470 } else {
4471 zExp = pExp;
4472 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4473 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4474 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4475 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4476 zSign ^= 1;
4477 } else {
4478 /* Exact zero */
4479 zSign = signflip;
a2f2d288 4480 if (status->float_rounding_mode == float_round_down) {
369be8f6
PM
4481 zSign ^= 1;
4482 }
4483 return packFloat64(zSign, 0, 0);
4484 }
4485 }
4486 --zExp;
4487 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4488 * starting with the significand in a pair of uint64_t.
4489 */
4490 if (zSig0) {
4491 shiftcount = countLeadingZeros64(zSig0) - 1;
4492 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4493 if (zSig1) {
4494 zSig0 |= 1;
4495 }
4496 zExp -= shiftcount;
4497 } else {
e3d142d0
PM
4498 shiftcount = countLeadingZeros64(zSig1);
4499 if (shiftcount == 0) {
4500 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4501 zExp -= 63;
4502 } else {
4503 shiftcount--;
4504 zSig0 = zSig1 << shiftcount;
4505 zExp -= (shiftcount + 64);
4506 }
369be8f6 4507 }
67d43538
PM
4508 if (flags & float_muladd_halve_result) {
4509 zExp--;
4510 }
ff32e16e 4511 return roundAndPackFloat64(zSign, zExp, zSig0, status);
369be8f6
PM
4512 }
4513}
4514
158142c2
FB
4515/*----------------------------------------------------------------------------
4516| Returns the square root of the double-precision floating-point value `a'.
4517| The operation is performed according to the IEC/IEEE Standard for Binary
4518| Floating-Point Arithmetic.
4519*----------------------------------------------------------------------------*/
4520
e5a41ffa 4521float64 float64_sqrt(float64 a, float_status *status)
158142c2
FB
4522{
4523 flag aSign;
0c48262d 4524 int aExp, zExp;
bb98fe42
AF
4525 uint64_t aSig, zSig, doubleZSig;
4526 uint64_t rem0, rem1, term0, term1;
ff32e16e 4527 a = float64_squash_input_denormal(a, status);
158142c2
FB
4528
4529 aSig = extractFloat64Frac( a );
4530 aExp = extractFloat64Exp( a );
4531 aSign = extractFloat64Sign( a );
4532 if ( aExp == 0x7FF ) {
ff32e16e
PM
4533 if (aSig) {
4534 return propagateFloat64NaN(a, a, status);
4535 }
158142c2 4536 if ( ! aSign ) return a;
ff32e16e 4537 float_raise(float_flag_invalid, status);
af39bc8c 4538 return float64_default_nan(status);
158142c2
FB
4539 }
4540 if ( aSign ) {
4541 if ( ( aExp | aSig ) == 0 ) return a;
ff32e16e 4542 float_raise(float_flag_invalid, status);
af39bc8c 4543 return float64_default_nan(status);
158142c2
FB
4544 }
4545 if ( aExp == 0 ) {
f090c9d4 4546 if ( aSig == 0 ) return float64_zero;
158142c2
FB
4547 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4548 }
4549 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4550 aSig |= LIT64( 0x0010000000000000 );
4551 zSig = estimateSqrt32( aExp, aSig>>21 );
4552 aSig <<= 9 - ( aExp & 1 );
4553 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4554 if ( ( zSig & 0x1FF ) <= 5 ) {
4555 doubleZSig = zSig<<1;
4556 mul64To128( zSig, zSig, &term0, &term1 );
4557 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4558 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4559 --zSig;
4560 doubleZSig -= 2;
4561 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4562 }
4563 zSig |= ( ( rem0 | rem1 ) != 0 );
4564 }
ff32e16e 4565 return roundAndPackFloat64(0, zExp, zSig, status);
158142c2
FB
4566
4567}
4568
374dfc33
AJ
4569/*----------------------------------------------------------------------------
4570| Returns the binary log of the double-precision floating-point value `a'.
4571| The operation is performed according to the IEC/IEEE Standard for Binary
4572| Floating-Point Arithmetic.
4573*----------------------------------------------------------------------------*/
e5a41ffa 4574float64 float64_log2(float64 a, float_status *status)
374dfc33
AJ
4575{
4576 flag aSign, zSign;
0c48262d 4577 int aExp;
bb98fe42 4578 uint64_t aSig, aSig0, aSig1, zSig, i;
ff32e16e 4579 a = float64_squash_input_denormal(a, status);
374dfc33
AJ
4580
4581 aSig = extractFloat64Frac( a );
4582 aExp = extractFloat64Exp( a );
4583 aSign = extractFloat64Sign( a );
4584
4585 if ( aExp == 0 ) {
4586 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4587 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4588 }
4589 if ( aSign ) {
ff32e16e 4590 float_raise(float_flag_invalid, status);
af39bc8c 4591 return float64_default_nan(status);
374dfc33
AJ
4592 }
4593 if ( aExp == 0x7FF ) {
ff32e16e
PM
4594 if (aSig) {
4595 return propagateFloat64NaN(a, float64_zero, status);
4596 }
374dfc33
AJ
4597 return a;
4598 }
4599
4600 aExp -= 0x3FF;
4601 aSig |= LIT64( 0x0010000000000000 );
4602 zSign = aExp < 0;
bb98fe42 4603 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4604 for (i = 1LL << 51; i > 0; i >>= 1) {
4605 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4606 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4607 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4608 aSig >>= 1;
4609 zSig |= i;
4610 }
4611 }
4612
4613 if ( zSign )
4614 zSig = -zSig;
ff32e16e 4615 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
374dfc33
AJ
4616}
4617
158142c2
FB
4618/*----------------------------------------------------------------------------
4619| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4620| corresponding value `b', and 0 otherwise. The invalid exception is raised
4621| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4622| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4623*----------------------------------------------------------------------------*/
4624
e5a41ffa 4625int float64_eq(float64 a, float64 b, float_status *status)
158142c2 4626{
bb98fe42 4627 uint64_t av, bv;
ff32e16e
PM
4628 a = float64_squash_input_denormal(a, status);
4629 b = float64_squash_input_denormal(b, status);
158142c2
FB
4630
4631 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4632 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4633 ) {
ff32e16e 4634 float_raise(float_flag_invalid, status);
158142c2
FB
4635 return 0;
4636 }
f090c9d4 4637 av = float64_val(a);
a1b91bb4 4638 bv = float64_val(b);
bb98fe42 4639 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4640
4641}
4642
4643/*----------------------------------------------------------------------------
4644| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4645| equal to the corresponding value `b', and 0 otherwise. The invalid
4646| exception is raised if either operand is a NaN. The comparison is performed
4647| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4648*----------------------------------------------------------------------------*/
4649
e5a41ffa 4650int float64_le(float64 a, float64 b, float_status *status)
158142c2
FB
4651{
4652 flag aSign, bSign;
bb98fe42 4653 uint64_t av, bv;
ff32e16e
PM
4654 a = float64_squash_input_denormal(a, status);
4655 b = float64_squash_input_denormal(b, status);
158142c2
FB
4656
4657 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4658 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4659 ) {
ff32e16e 4660 float_raise(float_flag_invalid, status);
158142c2
FB
4661 return 0;
4662 }
4663 aSign = extractFloat64Sign( a );
4664 bSign = extractFloat64Sign( b );
f090c9d4 4665 av = float64_val(a);
a1b91bb4 4666 bv = float64_val(b);
bb98fe42 4667 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4668 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4669
4670}
4671
4672/*----------------------------------------------------------------------------
4673| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4674| the corresponding value `b', and 0 otherwise. The invalid exception is
4675| raised if either operand is a NaN. The comparison is performed according
4676| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4677*----------------------------------------------------------------------------*/
4678
e5a41ffa 4679int float64_lt(float64 a, float64 b, float_status *status)
158142c2
FB
4680{
4681 flag aSign, bSign;
bb98fe42 4682 uint64_t av, bv;
158142c2 4683
ff32e16e
PM
4684 a = float64_squash_input_denormal(a, status);
4685 b = float64_squash_input_denormal(b, status);
158142c2
FB
4686 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4687 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4688 ) {
ff32e16e 4689 float_raise(float_flag_invalid, status);
158142c2
FB
4690 return 0;
4691 }
4692 aSign = extractFloat64Sign( a );
4693 bSign = extractFloat64Sign( b );
f090c9d4 4694 av = float64_val(a);
a1b91bb4 4695 bv = float64_val(b);
bb98fe42 4696 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4697 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4698
4699}
4700
67b7861d
AJ
4701/*----------------------------------------------------------------------------
4702| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4703| be compared, and 0 otherwise. The invalid exception is raised if either
4704| operand is a NaN. The comparison is performed according to the IEC/IEEE
4705| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4706*----------------------------------------------------------------------------*/
4707
e5a41ffa 4708int float64_unordered(float64 a, float64 b, float_status *status)
67b7861d 4709{
ff32e16e
PM
4710 a = float64_squash_input_denormal(a, status);
4711 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4712
4713 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4714 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4715 ) {
ff32e16e 4716 float_raise(float_flag_invalid, status);
67b7861d
AJ
4717 return 1;
4718 }
4719 return 0;
4720}
4721
158142c2
FB
4722/*----------------------------------------------------------------------------
4723| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4724| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4725| exception. The comparison is performed according to the IEC/IEEE Standard
4726| for Binary Floating-Point Arithmetic.
158142c2
FB
4727*----------------------------------------------------------------------------*/
4728
e5a41ffa 4729int float64_eq_quiet(float64 a, float64 b, float_status *status)
158142c2 4730{
bb98fe42 4731 uint64_t av, bv;
ff32e16e
PM
4732 a = float64_squash_input_denormal(a, status);
4733 b = float64_squash_input_denormal(b, status);
158142c2
FB
4734
4735 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4736 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4737 ) {
af39bc8c
AM
4738 if (float64_is_signaling_nan(a, status)
4739 || float64_is_signaling_nan(b, status)) {
ff32e16e 4740 float_raise(float_flag_invalid, status);
b689362d 4741 }
158142c2
FB
4742 return 0;
4743 }
f090c9d4 4744 av = float64_val(a);
a1b91bb4 4745 bv = float64_val(b);
bb98fe42 4746 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4747
4748}
4749
4750/*----------------------------------------------------------------------------
4751| Returns 1 if the double-precision floating-point value `a' is less than or
4752| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4753| cause an exception. Otherwise, the comparison is performed according to the
4754| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4755*----------------------------------------------------------------------------*/
4756
e5a41ffa 4757int float64_le_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4758{
4759 flag aSign, bSign;
bb98fe42 4760 uint64_t av, bv;
ff32e16e
PM
4761 a = float64_squash_input_denormal(a, status);
4762 b = float64_squash_input_denormal(b, status);
158142c2
FB
4763
4764 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4765 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4766 ) {
af39bc8c
AM
4767 if (float64_is_signaling_nan(a, status)
4768 || float64_is_signaling_nan(b, status)) {
ff32e16e 4769 float_raise(float_flag_invalid, status);
158142c2
FB
4770 }
4771 return 0;
4772 }
4773 aSign = extractFloat64Sign( a );
4774 bSign = extractFloat64Sign( b );
f090c9d4 4775 av = float64_val(a);
a1b91bb4 4776 bv = float64_val(b);
bb98fe42 4777 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4778 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4779
4780}
4781
4782/*----------------------------------------------------------------------------
4783| Returns 1 if the double-precision floating-point value `a' is less than
4784| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4785| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4786| Standard for Binary Floating-Point Arithmetic.
4787*----------------------------------------------------------------------------*/
4788
e5a41ffa 4789int float64_lt_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4790{
4791 flag aSign, bSign;
bb98fe42 4792 uint64_t av, bv;
ff32e16e
PM
4793 a = float64_squash_input_denormal(a, status);
4794 b = float64_squash_input_denormal(b, status);
158142c2
FB
4795
4796 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4797 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4798 ) {
af39bc8c
AM
4799 if (float64_is_signaling_nan(a, status)
4800 || float64_is_signaling_nan(b, status)) {
ff32e16e 4801 float_raise(float_flag_invalid, status);
158142c2
FB
4802 }
4803 return 0;
4804 }
4805 aSign = extractFloat64Sign( a );
4806 bSign = extractFloat64Sign( b );
f090c9d4 4807 av = float64_val(a);
a1b91bb4 4808 bv = float64_val(b);
bb98fe42 4809 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4810 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4811
4812}
4813
67b7861d
AJ
4814/*----------------------------------------------------------------------------
4815| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4816| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4817| comparison is performed according to the IEC/IEEE Standard for Binary
4818| Floating-Point Arithmetic.
4819*----------------------------------------------------------------------------*/
4820
e5a41ffa 4821int float64_unordered_quiet(float64 a, float64 b, float_status *status)
67b7861d 4822{
ff32e16e
PM
4823 a = float64_squash_input_denormal(a, status);
4824 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4825
4826 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4827 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4828 ) {
af39bc8c
AM
4829 if (float64_is_signaling_nan(a, status)
4830 || float64_is_signaling_nan(b, status)) {
ff32e16e 4831 float_raise(float_flag_invalid, status);
67b7861d
AJ
4832 }
4833 return 1;
4834 }
4835 return 0;
4836}
4837
158142c2
FB
4838/*----------------------------------------------------------------------------
4839| Returns the result of converting the extended double-precision floating-
4840| point value `a' to the 32-bit two's complement integer format. The
4841| conversion is performed according to the IEC/IEEE Standard for Binary
4842| Floating-Point Arithmetic---which means in particular that the conversion
4843| is rounded according to the current rounding mode. If `a' is a NaN, the
4844| largest positive integer is returned. Otherwise, if the conversion
4845| overflows, the largest integer with the same sign as `a' is returned.
4846*----------------------------------------------------------------------------*/
4847
f4014512 4848int32_t floatx80_to_int32(floatx80 a, float_status *status)
158142c2
FB
4849{
4850 flag aSign;
f4014512 4851 int32_t aExp, shiftCount;
bb98fe42 4852 uint64_t aSig;
158142c2 4853
d1eb8f2a
AD
4854 if (floatx80_invalid_encoding(a)) {
4855 float_raise(float_flag_invalid, status);
4856 return 1 << 31;
4857 }
158142c2
FB
4858 aSig = extractFloatx80Frac( a );
4859 aExp = extractFloatx80Exp( a );
4860 aSign = extractFloatx80Sign( a );
bb98fe42 4861 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4862 shiftCount = 0x4037 - aExp;
4863 if ( shiftCount <= 0 ) shiftCount = 1;
4864 shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 4865 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
4866
4867}
4868
4869/*----------------------------------------------------------------------------
4870| Returns the result of converting the extended double-precision floating-
4871| point value `a' to the 32-bit two's complement integer format. The
4872| conversion is performed according to the IEC/IEEE Standard for Binary
4873| Floating-Point Arithmetic, except that the conversion is always rounded
4874| toward zero. If `a' is a NaN, the largest positive integer is returned.
4875| Otherwise, if the conversion overflows, the largest integer with the same
4876| sign as `a' is returned.
4877*----------------------------------------------------------------------------*/
4878
f4014512 4879int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4880{
4881 flag aSign;
f4014512 4882 int32_t aExp, shiftCount;
bb98fe42 4883 uint64_t aSig, savedASig;
b3a6a2e0 4884 int32_t z;
158142c2 4885
d1eb8f2a
AD
4886 if (floatx80_invalid_encoding(a)) {
4887 float_raise(float_flag_invalid, status);
4888 return 1 << 31;
4889 }
158142c2
FB
4890 aSig = extractFloatx80Frac( a );
4891 aExp = extractFloatx80Exp( a );
4892 aSign = extractFloatx80Sign( a );
4893 if ( 0x401E < aExp ) {
bb98fe42 4894 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4895 goto invalid;
4896 }
4897 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4898 if (aExp || aSig) {
4899 status->float_exception_flags |= float_flag_inexact;
4900 }
158142c2
FB
4901 return 0;
4902 }
4903 shiftCount = 0x403E - aExp;
4904 savedASig = aSig;
4905 aSig >>= shiftCount;
4906 z = aSig;
4907 if ( aSign ) z = - z;
4908 if ( ( z < 0 ) ^ aSign ) {
4909 invalid:
ff32e16e 4910 float_raise(float_flag_invalid, status);
bb98fe42 4911 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4912 }
4913 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 4914 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4915 }
4916 return z;
4917
4918}
4919
4920/*----------------------------------------------------------------------------
4921| Returns the result of converting the extended double-precision floating-
4922| point value `a' to the 64-bit two's complement integer format. The
4923| conversion is performed according to the IEC/IEEE Standard for Binary
4924| Floating-Point Arithmetic---which means in particular that the conversion
4925| is rounded according to the current rounding mode. If `a' is a NaN,
4926| the largest positive integer is returned. Otherwise, if the conversion
4927| overflows, the largest integer with the same sign as `a' is returned.
4928*----------------------------------------------------------------------------*/
4929
f42c2224 4930int64_t floatx80_to_int64(floatx80 a, float_status *status)
158142c2
FB
4931{
4932 flag aSign;
f4014512 4933 int32_t aExp, shiftCount;
bb98fe42 4934 uint64_t aSig, aSigExtra;
158142c2 4935
d1eb8f2a
AD
4936 if (floatx80_invalid_encoding(a)) {
4937 float_raise(float_flag_invalid, status);
4938 return 1ULL << 63;
4939 }
158142c2
FB
4940 aSig = extractFloatx80Frac( a );
4941 aExp = extractFloatx80Exp( a );
4942 aSign = extractFloatx80Sign( a );
4943 shiftCount = 0x403E - aExp;
4944 if ( shiftCount <= 0 ) {
4945 if ( shiftCount ) {
ff32e16e 4946 float_raise(float_flag_invalid, status);
158142c2
FB
4947 if ( ! aSign
4948 || ( ( aExp == 0x7FFF )
4949 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4950 ) {
4951 return LIT64( 0x7FFFFFFFFFFFFFFF );
4952 }
bb98fe42 4953 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4954 }
4955 aSigExtra = 0;
4956 }
4957 else {
4958 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4959 }
ff32e16e 4960 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
4961
4962}
4963
4964/*----------------------------------------------------------------------------
4965| Returns the result of converting the extended double-precision floating-
4966| point value `a' to the 64-bit two's complement integer format. The
4967| conversion is performed according to the IEC/IEEE Standard for Binary
4968| Floating-Point Arithmetic, except that the conversion is always rounded
4969| toward zero. If `a' is a NaN, the largest positive integer is returned.
4970| Otherwise, if the conversion overflows, the largest integer with the same
4971| sign as `a' is returned.
4972*----------------------------------------------------------------------------*/
4973
f42c2224 4974int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4975{
4976 flag aSign;
f4014512 4977 int32_t aExp, shiftCount;
bb98fe42 4978 uint64_t aSig;
f42c2224 4979 int64_t z;
158142c2 4980
d1eb8f2a
AD
4981 if (floatx80_invalid_encoding(a)) {
4982 float_raise(float_flag_invalid, status);
4983 return 1ULL << 63;
4984 }
158142c2
FB
4985 aSig = extractFloatx80Frac( a );
4986 aExp = extractFloatx80Exp( a );
4987 aSign = extractFloatx80Sign( a );
4988 shiftCount = aExp - 0x403E;
4989 if ( 0 <= shiftCount ) {
4990 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4991 if ( ( a.high != 0xC03E ) || aSig ) {
ff32e16e 4992 float_raise(float_flag_invalid, status);
158142c2
FB
4993 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4994 return LIT64( 0x7FFFFFFFFFFFFFFF );
4995 }
4996 }
bb98fe42 4997 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4998 }
4999 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
5000 if (aExp | aSig) {
5001 status->float_exception_flags |= float_flag_inexact;
5002 }
158142c2
FB
5003 return 0;
5004 }
5005 z = aSig>>( - shiftCount );
bb98fe42 5006 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 5007 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5008 }
5009 if ( aSign ) z = - z;
5010 return z;
5011
5012}
5013
5014/*----------------------------------------------------------------------------
5015| Returns the result of converting the extended double-precision floating-
5016| point value `a' to the single-precision floating-point format. The
5017| conversion is performed according to the IEC/IEEE Standard for Binary
5018| Floating-Point Arithmetic.
5019*----------------------------------------------------------------------------*/
5020
e5a41ffa 5021float32 floatx80_to_float32(floatx80 a, float_status *status)
158142c2
FB
5022{
5023 flag aSign;
f4014512 5024 int32_t aExp;
bb98fe42 5025 uint64_t aSig;
158142c2 5026
d1eb8f2a
AD
5027 if (floatx80_invalid_encoding(a)) {
5028 float_raise(float_flag_invalid, status);
5029 return float32_default_nan(status);
5030 }
158142c2
FB
5031 aSig = extractFloatx80Frac( a );
5032 aExp = extractFloatx80Exp( a );
5033 aSign = extractFloatx80Sign( a );
5034 if ( aExp == 0x7FFF ) {
bb98fe42 5035 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 5036 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
158142c2
FB
5037 }
5038 return packFloat32( aSign, 0xFF, 0 );
5039 }
5040 shift64RightJamming( aSig, 33, &aSig );
5041 if ( aExp || aSig ) aExp -= 0x3F81;
ff32e16e 5042 return roundAndPackFloat32(aSign, aExp, aSig, status);
158142c2
FB
5043
5044}
5045
5046/*----------------------------------------------------------------------------
5047| Returns the result of converting the extended double-precision floating-
5048| point value `a' to the double-precision floating-point format. The
5049| conversion is performed according to the IEC/IEEE Standard for Binary
5050| Floating-Point Arithmetic.
5051*----------------------------------------------------------------------------*/
5052
e5a41ffa 5053float64 floatx80_to_float64(floatx80 a, float_status *status)
158142c2
FB
5054{
5055 flag aSign;
f4014512 5056 int32_t aExp;
bb98fe42 5057 uint64_t aSig, zSig;
158142c2 5058
d1eb8f2a
AD
5059 if (floatx80_invalid_encoding(a)) {
5060 float_raise(float_flag_invalid, status);
5061 return float64_default_nan(status);
5062 }
158142c2
FB
5063 aSig = extractFloatx80Frac( a );
5064 aExp = extractFloatx80Exp( a );
5065 aSign = extractFloatx80Sign( a );
5066 if ( aExp == 0x7FFF ) {
bb98fe42 5067 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 5068 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
158142c2
FB
5069 }
5070 return packFloat64( aSign, 0x7FF, 0 );
5071 }
5072 shift64RightJamming( aSig, 1, &zSig );
5073 if ( aExp || aSig ) aExp -= 0x3C01;
ff32e16e 5074 return roundAndPackFloat64(aSign, aExp, zSig, status);
158142c2
FB
5075
5076}
5077
158142c2
FB
5078/*----------------------------------------------------------------------------
5079| Returns the result of converting the extended double-precision floating-
5080| point value `a' to the quadruple-precision floating-point format. The
5081| conversion is performed according to the IEC/IEEE Standard for Binary
5082| Floating-Point Arithmetic.
5083*----------------------------------------------------------------------------*/
5084
e5a41ffa 5085float128 floatx80_to_float128(floatx80 a, float_status *status)
158142c2
FB
5086{
5087 flag aSign;
0c48262d 5088 int aExp;
bb98fe42 5089 uint64_t aSig, zSig0, zSig1;
158142c2 5090
d1eb8f2a
AD
5091 if (floatx80_invalid_encoding(a)) {
5092 float_raise(float_flag_invalid, status);
5093 return float128_default_nan(status);
5094 }
158142c2
FB
5095 aSig = extractFloatx80Frac( a );
5096 aExp = extractFloatx80Exp( a );
5097 aSign = extractFloatx80Sign( a );
bb98fe42 5098 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
ff32e16e 5099 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
158142c2
FB
5100 }
5101 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5102 return packFloat128( aSign, aExp, zSig0, zSig1 );
5103
5104}
5105
0f721292
LV
5106/*----------------------------------------------------------------------------
5107| Rounds the extended double-precision floating-point value `a'
5108| to the precision provided by floatx80_rounding_precision and returns the
5109| result as an extended double-precision floating-point value.
5110| The operation is performed according to the IEC/IEEE Standard for Binary
5111| Floating-Point Arithmetic.
5112*----------------------------------------------------------------------------*/
5113
5114floatx80 floatx80_round(floatx80 a, float_status *status)
5115{
5116 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5117 extractFloatx80Sign(a),
5118 extractFloatx80Exp(a),
5119 extractFloatx80Frac(a), 0, status);
5120}
5121
158142c2
FB
5122/*----------------------------------------------------------------------------
5123| Rounds the extended double-precision floating-point value `a' to an integer,
5124| and returns the result as an extended quadruple-precision floating-point
5125| value. The operation is performed according to the IEC/IEEE Standard for
5126| Binary Floating-Point Arithmetic.
5127*----------------------------------------------------------------------------*/
5128
e5a41ffa 5129floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
158142c2
FB
5130{
5131 flag aSign;
f4014512 5132 int32_t aExp;
bb98fe42 5133 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5134 floatx80 z;
5135
d1eb8f2a
AD
5136 if (floatx80_invalid_encoding(a)) {
5137 float_raise(float_flag_invalid, status);
5138 return floatx80_default_nan(status);
5139 }
158142c2
FB
5140 aExp = extractFloatx80Exp( a );
5141 if ( 0x403E <= aExp ) {
bb98fe42 5142 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
ff32e16e 5143 return propagateFloatx80NaN(a, a, status);
158142c2
FB
5144 }
5145 return a;
5146 }
5147 if ( aExp < 0x3FFF ) {
5148 if ( ( aExp == 0 )
bb98fe42 5149 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
5150 return a;
5151 }
a2f2d288 5152 status->float_exception_flags |= float_flag_inexact;
158142c2 5153 aSign = extractFloatx80Sign( a );
a2f2d288 5154 switch (status->float_rounding_mode) {
158142c2 5155 case float_round_nearest_even:
bb98fe42 5156 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
5157 ) {
5158 return
5159 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5160 }
5161 break;
f9288a76
PM
5162 case float_round_ties_away:
5163 if (aExp == 0x3FFE) {
5164 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5165 }
5166 break;
158142c2
FB
5167 case float_round_down:
5168 return
5169 aSign ?
5170 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5171 : packFloatx80( 0, 0, 0 );
5172 case float_round_up:
5173 return
5174 aSign ? packFloatx80( 1, 0, 0 )
5175 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5176 }
5177 return packFloatx80( aSign, 0, 0 );
5178 }
5179 lastBitMask = 1;
5180 lastBitMask <<= 0x403E - aExp;
5181 roundBitsMask = lastBitMask - 1;
5182 z = a;
a2f2d288 5183 switch (status->float_rounding_mode) {
dc355b76 5184 case float_round_nearest_even:
158142c2 5185 z.low += lastBitMask>>1;
dc355b76
PM
5186 if ((z.low & roundBitsMask) == 0) {
5187 z.low &= ~lastBitMask;
5188 }
5189 break;
f9288a76
PM
5190 case float_round_ties_away:
5191 z.low += lastBitMask >> 1;
5192 break;
dc355b76
PM
5193 case float_round_to_zero:
5194 break;
5195 case float_round_up:
5196 if (!extractFloatx80Sign(z)) {
5197 z.low += roundBitsMask;
5198 }
5199 break;
5200 case float_round_down:
5201 if (extractFloatx80Sign(z)) {
158142c2
FB
5202 z.low += roundBitsMask;
5203 }
dc355b76
PM
5204 break;
5205 default:
5206 abort();
158142c2
FB
5207 }
5208 z.low &= ~ roundBitsMask;
5209 if ( z.low == 0 ) {
5210 ++z.high;
5211 z.low = LIT64( 0x8000000000000000 );
5212 }
a2f2d288
PM
5213 if (z.low != a.low) {
5214 status->float_exception_flags |= float_flag_inexact;
5215 }
158142c2
FB
5216 return z;
5217
5218}
5219
5220/*----------------------------------------------------------------------------
5221| Returns the result of adding the absolute values of the extended double-
5222| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5223| negated before being returned. `zSign' is ignored if the result is a NaN.
5224| The addition is performed according to the IEC/IEEE Standard for Binary
5225| Floating-Point Arithmetic.
5226*----------------------------------------------------------------------------*/
5227
e5a41ffa
PM
5228static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5229 float_status *status)
158142c2 5230{
f4014512 5231 int32_t aExp, bExp, zExp;
bb98fe42 5232 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 5233 int32_t expDiff;
158142c2
FB
5234
5235 aSig = extractFloatx80Frac( a );
5236 aExp = extractFloatx80Exp( a );
5237 bSig = extractFloatx80Frac( b );
5238 bExp = extractFloatx80Exp( b );
5239 expDiff = aExp - bExp;
5240 if ( 0 < expDiff ) {
5241 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5242 if ((uint64_t)(aSig << 1)) {
5243 return propagateFloatx80NaN(a, b, status);
5244 }
158142c2
FB
5245 return a;
5246 }
5247 if ( bExp == 0 ) --expDiff;
5248 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5249 zExp = aExp;
5250 }
5251 else if ( expDiff < 0 ) {
5252 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5253 if ((uint64_t)(bSig << 1)) {
5254 return propagateFloatx80NaN(a, b, status);
5255 }
158142c2
FB
5256 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5257 }
5258 if ( aExp == 0 ) ++expDiff;
5259 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5260 zExp = bExp;
5261 }
5262 else {
5263 if ( aExp == 0x7FFF ) {
bb98fe42 5264 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5265 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5266 }
5267 return a;
5268 }
5269 zSig1 = 0;
5270 zSig0 = aSig + bSig;
5271 if ( aExp == 0 ) {
5272 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5273 goto roundAndPack;
5274 }
5275 zExp = aExp;
5276 goto shiftRight1;
5277 }
5278 zSig0 = aSig + bSig;
bb98fe42 5279 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
5280 shiftRight1:
5281 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5282 zSig0 |= LIT64( 0x8000000000000000 );
5283 ++zExp;
5284 roundAndPack:
a2f2d288 5285 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5286 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5287}
5288
5289/*----------------------------------------------------------------------------
5290| Returns the result of subtracting the absolute values of the extended
5291| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5292| difference is negated before being returned. `zSign' is ignored if the
5293| result is a NaN. The subtraction is performed according to the IEC/IEEE
5294| Standard for Binary Floating-Point Arithmetic.
5295*----------------------------------------------------------------------------*/
5296
e5a41ffa
PM
5297static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5298 float_status *status)
158142c2 5299{
f4014512 5300 int32_t aExp, bExp, zExp;
bb98fe42 5301 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 5302 int32_t expDiff;
158142c2
FB
5303
5304 aSig = extractFloatx80Frac( a );
5305 aExp = extractFloatx80Exp( a );
5306 bSig = extractFloatx80Frac( b );
5307 bExp = extractFloatx80Exp( b );
5308 expDiff = aExp - bExp;
5309 if ( 0 < expDiff ) goto aExpBigger;
5310 if ( expDiff < 0 ) goto bExpBigger;
5311 if ( aExp == 0x7FFF ) {
bb98fe42 5312 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 5313 return propagateFloatx80NaN(a, b, status);
158142c2 5314 }
ff32e16e 5315 float_raise(float_flag_invalid, status);
af39bc8c 5316 return floatx80_default_nan(status);
158142c2
FB
5317 }
5318 if ( aExp == 0 ) {
5319 aExp = 1;
5320 bExp = 1;
5321 }
5322 zSig1 = 0;
5323 if ( bSig < aSig ) goto aBigger;
5324 if ( aSig < bSig ) goto bBigger;
a2f2d288 5325 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
5326 bExpBigger:
5327 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5328 if ((uint64_t)(bSig << 1)) {
5329 return propagateFloatx80NaN(a, b, status);
5330 }
158142c2
FB
5331 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5332 }
5333 if ( aExp == 0 ) ++expDiff;
5334 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5335 bBigger:
5336 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5337 zExp = bExp;
5338 zSign ^= 1;
5339 goto normalizeRoundAndPack;
5340 aExpBigger:
5341 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5342 if ((uint64_t)(aSig << 1)) {
5343 return propagateFloatx80NaN(a, b, status);
5344 }
158142c2
FB
5345 return a;
5346 }
5347 if ( bExp == 0 ) --expDiff;
5348 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5349 aBigger:
5350 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5351 zExp = aExp;
5352 normalizeRoundAndPack:
a2f2d288 5353 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5354 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5355}
5356
5357/*----------------------------------------------------------------------------
5358| Returns the result of adding the extended double-precision floating-point
5359| values `a' and `b'. The operation is performed according to the IEC/IEEE
5360| Standard for Binary Floating-Point Arithmetic.
5361*----------------------------------------------------------------------------*/
5362
e5a41ffa 5363floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5364{
5365 flag aSign, bSign;
5366
d1eb8f2a
AD
5367 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5368 float_raise(float_flag_invalid, status);
5369 return floatx80_default_nan(status);
5370 }
158142c2
FB
5371 aSign = extractFloatx80Sign( a );
5372 bSign = extractFloatx80Sign( b );
5373 if ( aSign == bSign ) {
ff32e16e 5374 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5375 }
5376 else {
ff32e16e 5377 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5378 }
5379
5380}
5381
5382/*----------------------------------------------------------------------------
5383| Returns the result of subtracting the extended double-precision floating-
5384| point values `a' and `b'. The operation is performed according to the
5385| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5386*----------------------------------------------------------------------------*/
5387
e5a41ffa 5388floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5389{
5390 flag aSign, bSign;
5391
d1eb8f2a
AD
5392 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5393 float_raise(float_flag_invalid, status);
5394 return floatx80_default_nan(status);
5395 }
158142c2
FB
5396 aSign = extractFloatx80Sign( a );
5397 bSign = extractFloatx80Sign( b );
5398 if ( aSign == bSign ) {
ff32e16e 5399 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5400 }
5401 else {
ff32e16e 5402 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5403 }
5404
5405}
5406
5407/*----------------------------------------------------------------------------
5408| Returns the result of multiplying the extended double-precision floating-
5409| point values `a' and `b'. The operation is performed according to the
5410| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5411*----------------------------------------------------------------------------*/
5412
e5a41ffa 5413floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5414{
5415 flag aSign, bSign, zSign;
f4014512 5416 int32_t aExp, bExp, zExp;
bb98fe42 5417 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 5418
d1eb8f2a
AD
5419 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5420 float_raise(float_flag_invalid, status);
5421 return floatx80_default_nan(status);
5422 }
158142c2
FB
5423 aSig = extractFloatx80Frac( a );
5424 aExp = extractFloatx80Exp( a );
5425 aSign = extractFloatx80Sign( a );
5426 bSig = extractFloatx80Frac( b );
5427 bExp = extractFloatx80Exp( b );
5428 bSign = extractFloatx80Sign( b );
5429 zSign = aSign ^ bSign;
5430 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5431 if ( (uint64_t) ( aSig<<1 )
5432 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5433 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5434 }
5435 if ( ( bExp | bSig ) == 0 ) goto invalid;
5436 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5437 }
5438 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5439 if ((uint64_t)(bSig << 1)) {
5440 return propagateFloatx80NaN(a, b, status);
5441 }
158142c2
FB
5442 if ( ( aExp | aSig ) == 0 ) {
5443 invalid:
ff32e16e 5444 float_raise(float_flag_invalid, status);
af39bc8c 5445 return floatx80_default_nan(status);
158142c2
FB
5446 }
5447 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5448 }
5449 if ( aExp == 0 ) {
5450 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5451 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5452 }
5453 if ( bExp == 0 ) {
5454 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5455 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5456 }
5457 zExp = aExp + bExp - 0x3FFE;
5458 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 5459 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
5460 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5461 --zExp;
5462 }
a2f2d288 5463 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5464 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5465}
5466
5467/*----------------------------------------------------------------------------
5468| Returns the result of dividing the extended double-precision floating-point
5469| value `a' by the corresponding value `b'. The operation is performed
5470| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5471*----------------------------------------------------------------------------*/
5472
e5a41ffa 5473floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5474{
5475 flag aSign, bSign, zSign;
f4014512 5476 int32_t aExp, bExp, zExp;
bb98fe42
AF
5477 uint64_t aSig, bSig, zSig0, zSig1;
5478 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2 5479
d1eb8f2a
AD
5480 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5481 float_raise(float_flag_invalid, status);
5482 return floatx80_default_nan(status);
5483 }
158142c2
FB
5484 aSig = extractFloatx80Frac( a );
5485 aExp = extractFloatx80Exp( a );
5486 aSign = extractFloatx80Sign( a );
5487 bSig = extractFloatx80Frac( b );
5488 bExp = extractFloatx80Exp( b );
5489 bSign = extractFloatx80Sign( b );
5490 zSign = aSign ^ bSign;
5491 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5492 if ((uint64_t)(aSig << 1)) {
5493 return propagateFloatx80NaN(a, b, status);
5494 }
158142c2 5495 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5496 if ((uint64_t)(bSig << 1)) {
5497 return propagateFloatx80NaN(a, b, status);
5498 }
158142c2
FB
5499 goto invalid;
5500 }
5501 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5502 }
5503 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5504 if ((uint64_t)(bSig << 1)) {
5505 return propagateFloatx80NaN(a, b, status);
5506 }
158142c2
FB
5507 return packFloatx80( zSign, 0, 0 );
5508 }
5509 if ( bExp == 0 ) {
5510 if ( bSig == 0 ) {
5511 if ( ( aExp | aSig ) == 0 ) {
5512 invalid:
ff32e16e 5513 float_raise(float_flag_invalid, status);
af39bc8c 5514 return floatx80_default_nan(status);
158142c2 5515 }
ff32e16e 5516 float_raise(float_flag_divbyzero, status);
158142c2
FB
5517 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5518 }
5519 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5520 }
5521 if ( aExp == 0 ) {
5522 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5523 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5524 }
5525 zExp = aExp - bExp + 0x3FFE;
5526 rem1 = 0;
5527 if ( bSig <= aSig ) {
5528 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5529 ++zExp;
5530 }
5531 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5532 mul64To128( bSig, zSig0, &term0, &term1 );
5533 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 5534 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5535 --zSig0;
5536 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5537 }
5538 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 5539 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
5540 mul64To128( bSig, zSig1, &term1, &term2 );
5541 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 5542 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5543 --zSig1;
5544 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5545 }
5546 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5547 }
a2f2d288 5548 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5549 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5550}
5551
5552/*----------------------------------------------------------------------------
5553| Returns the remainder of the extended double-precision floating-point value
5554| `a' with respect to the corresponding value `b'. The operation is performed
5555| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5556*----------------------------------------------------------------------------*/
5557
e5a41ffa 5558floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
158142c2 5559{
ed086f3d 5560 flag aSign, zSign;
f4014512 5561 int32_t aExp, bExp, expDiff;
bb98fe42
AF
5562 uint64_t aSig0, aSig1, bSig;
5563 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2 5564
d1eb8f2a
AD
5565 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5566 float_raise(float_flag_invalid, status);
5567 return floatx80_default_nan(status);
5568 }
158142c2
FB
5569 aSig0 = extractFloatx80Frac( a );
5570 aExp = extractFloatx80Exp( a );
5571 aSign = extractFloatx80Sign( a );
5572 bSig = extractFloatx80Frac( b );
5573 bExp = extractFloatx80Exp( b );
158142c2 5574 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5575 if ( (uint64_t) ( aSig0<<1 )
5576 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5577 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5578 }
5579 goto invalid;
5580 }
5581 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5582 if ((uint64_t)(bSig << 1)) {
5583 return propagateFloatx80NaN(a, b, status);
5584 }
158142c2
FB
5585 return a;
5586 }
5587 if ( bExp == 0 ) {
5588 if ( bSig == 0 ) {
5589 invalid:
ff32e16e 5590 float_raise(float_flag_invalid, status);
af39bc8c 5591 return floatx80_default_nan(status);
158142c2
FB
5592 }
5593 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5594 }
5595 if ( aExp == 0 ) {
bb98fe42 5596 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5597 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5598 }
5599 bSig |= LIT64( 0x8000000000000000 );
5600 zSign = aSign;
5601 expDiff = aExp - bExp;
5602 aSig1 = 0;
5603 if ( expDiff < 0 ) {
5604 if ( expDiff < -1 ) return a;
5605 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5606 expDiff = 0;
5607 }
5608 q = ( bSig <= aSig0 );
5609 if ( q ) aSig0 -= bSig;
5610 expDiff -= 64;
5611 while ( 0 < expDiff ) {
5612 q = estimateDiv128To64( aSig0, aSig1, bSig );
5613 q = ( 2 < q ) ? q - 2 : 0;
5614 mul64To128( bSig, q, &term0, &term1 );
5615 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5616 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5617 expDiff -= 62;
5618 }
5619 expDiff += 64;
5620 if ( 0 < expDiff ) {
5621 q = estimateDiv128To64( aSig0, aSig1, bSig );
5622 q = ( 2 < q ) ? q - 2 : 0;
5623 q >>= 64 - expDiff;
5624 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5625 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5626 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5627 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5628 ++q;
5629 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5630 }
5631 }
5632 else {
5633 term1 = 0;
5634 term0 = bSig;
5635 }
5636 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5637 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5638 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5639 && ( q & 1 ) )
5640 ) {
5641 aSig0 = alternateASig0;
5642 aSig1 = alternateASig1;
5643 zSign = ! zSign;
5644 }
5645 return
5646 normalizeRoundAndPackFloatx80(
ff32e16e 5647 80, zSign, bExp + expDiff, aSig0, aSig1, status);
158142c2
FB
5648
5649}
5650
5651/*----------------------------------------------------------------------------
5652| Returns the square root of the extended double-precision floating-point
5653| value `a'. The operation is performed according to the IEC/IEEE Standard
5654| for Binary Floating-Point Arithmetic.
5655*----------------------------------------------------------------------------*/
5656
e5a41ffa 5657floatx80 floatx80_sqrt(floatx80 a, float_status *status)
158142c2
FB
5658{
5659 flag aSign;
f4014512 5660 int32_t aExp, zExp;
bb98fe42
AF
5661 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5662 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2 5663
d1eb8f2a
AD
5664 if (floatx80_invalid_encoding(a)) {
5665 float_raise(float_flag_invalid, status);
5666 return floatx80_default_nan(status);
5667 }
158142c2
FB
5668 aSig0 = extractFloatx80Frac( a );
5669 aExp = extractFloatx80Exp( a );
5670 aSign = extractFloatx80Sign( a );
5671 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5672 if ((uint64_t)(aSig0 << 1)) {
5673 return propagateFloatx80NaN(a, a, status);
5674 }
158142c2
FB
5675 if ( ! aSign ) return a;
5676 goto invalid;
5677 }
5678 if ( aSign ) {
5679 if ( ( aExp | aSig0 ) == 0 ) return a;
5680 invalid:
ff32e16e 5681 float_raise(float_flag_invalid, status);
af39bc8c 5682 return floatx80_default_nan(status);
158142c2
FB
5683 }
5684 if ( aExp == 0 ) {
5685 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5686 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5687 }
5688 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5689 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5690 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5691 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5692 doubleZSig0 = zSig0<<1;
5693 mul64To128( zSig0, zSig0, &term0, &term1 );
5694 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5695 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5696 --zSig0;
5697 doubleZSig0 -= 2;
5698 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5699 }
5700 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5701 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5702 if ( zSig1 == 0 ) zSig1 = 1;
5703 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5704 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5705 mul64To128( zSig1, zSig1, &term2, &term3 );
5706 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5707 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5708 --zSig1;
5709 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5710 term3 |= 1;
5711 term2 |= doubleZSig0;
5712 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5713 }
5714 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5715 }
5716 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5717 zSig0 |= doubleZSig0;
a2f2d288
PM
5718 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5719 0, zExp, zSig0, zSig1, status);
158142c2
FB
5720}
5721
5722/*----------------------------------------------------------------------------
b689362d
AJ
5723| Returns 1 if the extended double-precision floating-point value `a' is equal
5724| to the corresponding value `b', and 0 otherwise. The invalid exception is
5725| raised if either operand is a NaN. Otherwise, the comparison is performed
5726| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5727*----------------------------------------------------------------------------*/
5728
e5a41ffa 5729int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5730{
5731
d1eb8f2a
AD
5732 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5733 || (extractFloatx80Exp(a) == 0x7FFF
5734 && (uint64_t) (extractFloatx80Frac(a) << 1))
5735 || (extractFloatx80Exp(b) == 0x7FFF
5736 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5737 ) {
ff32e16e 5738 float_raise(float_flag_invalid, status);
158142c2
FB
5739 return 0;
5740 }
5741 return
5742 ( a.low == b.low )
5743 && ( ( a.high == b.high )
5744 || ( ( a.low == 0 )
bb98fe42 5745 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5746 );
5747
5748}
5749
5750/*----------------------------------------------------------------------------
5751| Returns 1 if the extended double-precision floating-point value `a' is
5752| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5753| invalid exception is raised if either operand is a NaN. The comparison is
5754| performed according to the IEC/IEEE Standard for Binary Floating-Point
5755| Arithmetic.
158142c2
FB
5756*----------------------------------------------------------------------------*/
5757
e5a41ffa 5758int floatx80_le(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5759{
5760 flag aSign, bSign;
5761
d1eb8f2a
AD
5762 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5763 || (extractFloatx80Exp(a) == 0x7FFF
5764 && (uint64_t) (extractFloatx80Frac(a) << 1))
5765 || (extractFloatx80Exp(b) == 0x7FFF
5766 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5767 ) {
ff32e16e 5768 float_raise(float_flag_invalid, status);
158142c2
FB
5769 return 0;
5770 }
5771 aSign = extractFloatx80Sign( a );
5772 bSign = extractFloatx80Sign( b );
5773 if ( aSign != bSign ) {
5774 return
5775 aSign
bb98fe42 5776 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5777 == 0 );
5778 }
5779 return
5780 aSign ? le128( b.high, b.low, a.high, a.low )
5781 : le128( a.high, a.low, b.high, b.low );
5782
5783}
5784
5785/*----------------------------------------------------------------------------
5786| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5787| less than the corresponding value `b', and 0 otherwise. The invalid
5788| exception is raised if either operand is a NaN. The comparison is performed
5789| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5790*----------------------------------------------------------------------------*/
5791
e5a41ffa 5792int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5793{
5794 flag aSign, bSign;
5795
d1eb8f2a
AD
5796 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5797 || (extractFloatx80Exp(a) == 0x7FFF
5798 && (uint64_t) (extractFloatx80Frac(a) << 1))
5799 || (extractFloatx80Exp(b) == 0x7FFF
5800 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5801 ) {
ff32e16e 5802 float_raise(float_flag_invalid, status);
158142c2
FB
5803 return 0;
5804 }
5805 aSign = extractFloatx80Sign( a );
5806 bSign = extractFloatx80Sign( b );
5807 if ( aSign != bSign ) {
5808 return
5809 aSign
bb98fe42 5810 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5811 != 0 );
5812 }
5813 return
5814 aSign ? lt128( b.high, b.low, a.high, a.low )
5815 : lt128( a.high, a.low, b.high, b.low );
5816
5817}
5818
67b7861d
AJ
5819/*----------------------------------------------------------------------------
5820| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5821| cannot be compared, and 0 otherwise. The invalid exception is raised if
5822| either operand is a NaN. The comparison is performed according to the
5823| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d 5824*----------------------------------------------------------------------------*/
e5a41ffa 5825int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
67b7861d 5826{
d1eb8f2a
AD
5827 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5828 || (extractFloatx80Exp(a) == 0x7FFF
5829 && (uint64_t) (extractFloatx80Frac(a) << 1))
5830 || (extractFloatx80Exp(b) == 0x7FFF
5831 && (uint64_t) (extractFloatx80Frac(b) << 1))
67b7861d 5832 ) {
ff32e16e 5833 float_raise(float_flag_invalid, status);
67b7861d
AJ
5834 return 1;
5835 }
5836 return 0;
5837}
5838
158142c2 5839/*----------------------------------------------------------------------------
b689362d 5840| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5841| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5842| cause an exception. The comparison is performed according to the IEC/IEEE
5843| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5844*----------------------------------------------------------------------------*/
5845
e5a41ffa 5846int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5847{
5848
d1eb8f2a
AD
5849 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5850 float_raise(float_flag_invalid, status);
5851 return 0;
5852 }
158142c2 5853 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5854 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5855 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5856 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5857 ) {
af39bc8c
AM
5858 if (floatx80_is_signaling_nan(a, status)
5859 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5860 float_raise(float_flag_invalid, status);
b689362d 5861 }
158142c2
FB
5862 return 0;
5863 }
5864 return
5865 ( a.low == b.low )
5866 && ( ( a.high == b.high )
5867 || ( ( a.low == 0 )
bb98fe42 5868 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5869 );
5870
5871}
5872
5873/*----------------------------------------------------------------------------
5874| Returns 1 if the extended double-precision floating-point value `a' is less
5875| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5876| do not cause an exception. Otherwise, the comparison is performed according
5877| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5878*----------------------------------------------------------------------------*/
5879
e5a41ffa 5880int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5881{
5882 flag aSign, bSign;
5883
d1eb8f2a
AD
5884 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5885 float_raise(float_flag_invalid, status);
5886 return 0;
5887 }
158142c2 5888 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5889 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5890 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5891 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5892 ) {
af39bc8c
AM
5893 if (floatx80_is_signaling_nan(a, status)
5894 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5895 float_raise(float_flag_invalid, status);
158142c2
FB
5896 }
5897 return 0;
5898 }
5899 aSign = extractFloatx80Sign( a );
5900 bSign = extractFloatx80Sign( b );
5901 if ( aSign != bSign ) {
5902 return
5903 aSign
bb98fe42 5904 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5905 == 0 );
5906 }
5907 return
5908 aSign ? le128( b.high, b.low, a.high, a.low )
5909 : le128( a.high, a.low, b.high, b.low );
5910
5911}
5912
5913/*----------------------------------------------------------------------------
5914| Returns 1 if the extended double-precision floating-point value `a' is less
5915| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5916| an exception. Otherwise, the comparison is performed according to the
5917| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5918*----------------------------------------------------------------------------*/
5919
e5a41ffa 5920int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5921{
5922 flag aSign, bSign;
5923
d1eb8f2a
AD
5924 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5925 float_raise(float_flag_invalid, status);
5926 return 0;
5927 }
158142c2 5928 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5929 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5930 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5931 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5932 ) {
af39bc8c
AM
5933 if (floatx80_is_signaling_nan(a, status)
5934 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5935 float_raise(float_flag_invalid, status);
158142c2
FB
5936 }
5937 return 0;
5938 }
5939 aSign = extractFloatx80Sign( a );
5940 bSign = extractFloatx80Sign( b );
5941 if ( aSign != bSign ) {
5942 return
5943 aSign
bb98fe42 5944 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5945 != 0 );
5946 }
5947 return
5948 aSign ? lt128( b.high, b.low, a.high, a.low )
5949 : lt128( a.high, a.low, b.high, b.low );
5950
5951}
5952
67b7861d
AJ
5953/*----------------------------------------------------------------------------
5954| Returns 1 if the extended double-precision floating-point values `a' and `b'
5955| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5956| The comparison is performed according to the IEC/IEEE Standard for Binary
5957| Floating-Point Arithmetic.
5958*----------------------------------------------------------------------------*/
e5a41ffa 5959int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
67b7861d 5960{
d1eb8f2a
AD
5961 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5962 float_raise(float_flag_invalid, status);
5963 return 1;
5964 }
67b7861d
AJ
5965 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5966 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5967 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5968 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5969 ) {
af39bc8c
AM
5970 if (floatx80_is_signaling_nan(a, status)
5971 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5972 float_raise(float_flag_invalid, status);
67b7861d
AJ
5973 }
5974 return 1;
5975 }
5976 return 0;
5977}
5978
158142c2
FB
5979/*----------------------------------------------------------------------------
5980| Returns the result of converting the quadruple-precision floating-point
5981| value `a' to the 32-bit two's complement integer format. The conversion
5982| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5983| Arithmetic---which means in particular that the conversion is rounded
5984| according to the current rounding mode. If `a' is a NaN, the largest
5985| positive integer is returned. Otherwise, if the conversion overflows, the
5986| largest integer with the same sign as `a' is returned.
5987*----------------------------------------------------------------------------*/
5988
f4014512 5989int32_t float128_to_int32(float128 a, float_status *status)
158142c2
FB
5990{
5991 flag aSign;
f4014512 5992 int32_t aExp, shiftCount;
bb98fe42 5993 uint64_t aSig0, aSig1;
158142c2
FB
5994
5995 aSig1 = extractFloat128Frac1( a );
5996 aSig0 = extractFloat128Frac0( a );
5997 aExp = extractFloat128Exp( a );
5998 aSign = extractFloat128Sign( a );
5999 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6000 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6001 aSig0 |= ( aSig1 != 0 );
6002 shiftCount = 0x4028 - aExp;
6003 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
ff32e16e 6004 return roundAndPackInt32(aSign, aSig0, status);
158142c2
FB
6005
6006}
6007
6008/*----------------------------------------------------------------------------
6009| Returns the result of converting the quadruple-precision floating-point
6010| value `a' to the 32-bit two's complement integer format. The conversion
6011| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6012| Arithmetic, except that the conversion is always rounded toward zero. If
6013| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6014| conversion overflows, the largest integer with the same sign as `a' is
6015| returned.
6016*----------------------------------------------------------------------------*/
6017
f4014512 6018int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
158142c2
FB
6019{
6020 flag aSign;
f4014512 6021 int32_t aExp, shiftCount;
bb98fe42 6022 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 6023 int32_t z;
158142c2
FB
6024
6025 aSig1 = extractFloat128Frac1( a );
6026 aSig0 = extractFloat128Frac0( a );
6027 aExp = extractFloat128Exp( a );
6028 aSign = extractFloat128Sign( a );
6029 aSig0 |= ( aSig1 != 0 );
6030 if ( 0x401E < aExp ) {
6031 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6032 goto invalid;
6033 }
6034 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
6035 if (aExp || aSig0) {
6036 status->float_exception_flags |= float_flag_inexact;
6037 }
158142c2
FB
6038 return 0;
6039 }
6040 aSig0 |= LIT64( 0x0001000000000000 );
6041 shiftCount = 0x402F - aExp;
6042 savedASig = aSig0;
6043 aSig0 >>= shiftCount;
6044 z = aSig0;
6045 if ( aSign ) z = - z;
6046 if ( ( z < 0 ) ^ aSign ) {
6047 invalid:
ff32e16e 6048 float_raise(float_flag_invalid, status);
bb98fe42 6049 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
6050 }
6051 if ( ( aSig0<<shiftCount ) != savedASig ) {
a2f2d288 6052 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6053 }
6054 return z;
6055
6056}
6057
6058/*----------------------------------------------------------------------------
6059| Returns the result of converting the quadruple-precision floating-point
6060| value `a' to the 64-bit two's complement integer format. The conversion
6061| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6062| Arithmetic---which means in particular that the conversion is rounded
6063| according to the current rounding mode. If `a' is a NaN, the largest
6064| positive integer is returned. Otherwise, if the conversion overflows, the
6065| largest integer with the same sign as `a' is returned.
6066*----------------------------------------------------------------------------*/
6067
f42c2224 6068int64_t float128_to_int64(float128 a, float_status *status)
158142c2
FB
6069{
6070 flag aSign;
f4014512 6071 int32_t aExp, shiftCount;
bb98fe42 6072 uint64_t aSig0, aSig1;
158142c2
FB
6073
6074 aSig1 = extractFloat128Frac1( a );
6075 aSig0 = extractFloat128Frac0( a );
6076 aExp = extractFloat128Exp( a );
6077 aSign = extractFloat128Sign( a );
6078 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6079 shiftCount = 0x402F - aExp;
6080 if ( shiftCount <= 0 ) {
6081 if ( 0x403E < aExp ) {
ff32e16e 6082 float_raise(float_flag_invalid, status);
158142c2
FB
6083 if ( ! aSign
6084 || ( ( aExp == 0x7FFF )
6085 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6086 )
6087 ) {
6088 return LIT64( 0x7FFFFFFFFFFFFFFF );
6089 }
bb98fe42 6090 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
6091 }
6092 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6093 }
6094 else {
6095 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6096 }
ff32e16e 6097 return roundAndPackInt64(aSign, aSig0, aSig1, status);
158142c2
FB
6098
6099}
6100
6101/*----------------------------------------------------------------------------
6102| Returns the result of converting the quadruple-precision floating-point
6103| value `a' to the 64-bit two's complement integer format. The conversion
6104| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6105| Arithmetic, except that the conversion is always rounded toward zero.
6106| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6107| the conversion overflows, the largest integer with the same sign as `a' is
6108| returned.
6109*----------------------------------------------------------------------------*/
6110
f42c2224 6111int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
158142c2
FB
6112{
6113 flag aSign;
f4014512 6114 int32_t aExp, shiftCount;
bb98fe42 6115 uint64_t aSig0, aSig1;
f42c2224 6116 int64_t z;
158142c2
FB
6117
6118 aSig1 = extractFloat128Frac1( a );
6119 aSig0 = extractFloat128Frac0( a );
6120 aExp = extractFloat128Exp( a );
6121 aSign = extractFloat128Sign( a );
6122 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6123 shiftCount = aExp - 0x402F;
6124 if ( 0 < shiftCount ) {
6125 if ( 0x403E <= aExp ) {
6126 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6127 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
6128 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
a2f2d288
PM
6129 if (aSig1) {
6130 status->float_exception_flags |= float_flag_inexact;
6131 }
158142c2
FB
6132 }
6133 else {
ff32e16e 6134 float_raise(float_flag_invalid, status);
158142c2
FB
6135 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6136 return LIT64( 0x7FFFFFFFFFFFFFFF );
6137 }
6138 }
bb98fe42 6139 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
6140 }
6141 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 6142 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
a2f2d288 6143 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6144 }
6145 }
6146 else {
6147 if ( aExp < 0x3FFF ) {
6148 if ( aExp | aSig0 | aSig1 ) {
a2f2d288 6149 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6150 }
6151 return 0;
6152 }
6153 z = aSig0>>( - shiftCount );
6154 if ( aSig1
bb98fe42 6155 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
a2f2d288 6156 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6157 }
6158 }
6159 if ( aSign ) z = - z;
6160 return z;
6161
6162}
6163
2e6d8568
BR
6164/*----------------------------------------------------------------------------
6165| Returns the result of converting the quadruple-precision floating-point value
6166| `a' to the 64-bit unsigned integer format. The conversion is
6167| performed according to the IEC/IEEE Standard for Binary Floating-Point
6168| Arithmetic---which means in particular that the conversion is rounded
6169| according to the current rounding mode. If `a' is a NaN, the largest
6170| positive integer is returned. If the conversion overflows, the
6171| largest unsigned integer is returned. If 'a' is negative, the value is
6172| rounded and zero is returned; negative values that do not round to zero
6173| will raise the inexact exception.
6174*----------------------------------------------------------------------------*/
6175
6176uint64_t float128_to_uint64(float128 a, float_status *status)
6177{
6178 flag aSign;
6179 int aExp;
6180 int shiftCount;
6181 uint64_t aSig0, aSig1;
6182
6183 aSig0 = extractFloat128Frac0(a);
6184 aSig1 = extractFloat128Frac1(a);
6185 aExp = extractFloat128Exp(a);
6186 aSign = extractFloat128Sign(a);
6187 if (aSign && (aExp > 0x3FFE)) {
6188 float_raise(float_flag_invalid, status);
6189 if (float128_is_any_nan(a)) {
6190 return LIT64(0xFFFFFFFFFFFFFFFF);
6191 } else {
6192 return 0;
6193 }
6194 }
6195 if (aExp) {
6196 aSig0 |= LIT64(0x0001000000000000);
6197 }
6198 shiftCount = 0x402F - aExp;
6199 if (shiftCount <= 0) {
6200 if (0x403E < aExp) {
6201 float_raise(float_flag_invalid, status);
6202 return LIT64(0xFFFFFFFFFFFFFFFF);
6203 }
6204 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6205 } else {
6206 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6207 }
6208 return roundAndPackUint64(aSign, aSig0, aSig1, status);
6209}
6210
6211uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6212{
6213 uint64_t v;
6214 signed char current_rounding_mode = status->float_rounding_mode;
6215
6216 set_float_rounding_mode(float_round_to_zero, status);
6217 v = float128_to_uint64(a, status);
6218 set_float_rounding_mode(current_rounding_mode, status);
6219
6220 return v;
6221}
6222
158142c2
FB
6223/*----------------------------------------------------------------------------
6224| Returns the result of converting the quadruple-precision floating-point
fd425037
BR
6225| value `a' to the 32-bit unsigned integer format. The conversion
6226| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6227| Arithmetic except that the conversion is always rounded toward zero.
6228| If `a' is a NaN, the largest positive integer is returned. Otherwise,
6229| if the conversion overflows, the largest unsigned integer is returned.
6230| If 'a' is negative, the value is rounded and zero is returned; negative
6231| values that do not round to zero will raise the inexact exception.
6232*----------------------------------------------------------------------------*/
6233
6234uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6235{
6236 uint64_t v;
6237 uint32_t res;
6238 int old_exc_flags = get_float_exception_flags(status);
6239
6240 v = float128_to_uint64_round_to_zero(a, status);
6241 if (v > 0xffffffff) {
6242 res = 0xffffffff;
6243 } else {
6244 return v;
6245 }
6246 set_float_exception_flags(old_exc_flags, status);
6247 float_raise(float_flag_invalid, status);
6248 return res;
6249}
6250
6251/*----------------------------------------------------------------------------
6252| Returns the result of converting the quadruple-precision floating-point
158142c2
FB
6253| value `a' to the single-precision floating-point format. The conversion
6254| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6255| Arithmetic.
6256*----------------------------------------------------------------------------*/
6257
e5a41ffa 6258float32 float128_to_float32(float128 a, float_status *status)
158142c2
FB
6259{
6260 flag aSign;
f4014512 6261 int32_t aExp;
bb98fe42
AF
6262 uint64_t aSig0, aSig1;
6263 uint32_t zSig;
158142c2
FB
6264
6265 aSig1 = extractFloat128Frac1( a );
6266 aSig0 = extractFloat128Frac0( a );
6267 aExp = extractFloat128Exp( a );
6268 aSign = extractFloat128Sign( a );
6269 if ( aExp == 0x7FFF ) {
6270 if ( aSig0 | aSig1 ) {
ff32e16e 6271 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
158142c2
FB
6272 }
6273 return packFloat32( aSign, 0xFF, 0 );
6274 }
6275 aSig0 |= ( aSig1 != 0 );
6276 shift64RightJamming( aSig0, 18, &aSig0 );
6277 zSig = aSig0;
6278 if ( aExp || zSig ) {
6279 zSig |= 0x40000000;
6280 aExp -= 0x3F81;
6281 }
ff32e16e 6282 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
6283
6284}
6285
6286/*----------------------------------------------------------------------------
6287| Returns the result of converting the quadruple-precision floating-point
6288| value `a' to the double-precision floating-point format. The conversion
6289| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6290| Arithmetic.
6291*----------------------------------------------------------------------------*/
6292
e5a41ffa 6293float64 float128_to_float64(float128 a, float_status *status)
158142c2
FB
6294{
6295 flag aSign;
f4014512 6296 int32_t aExp;
bb98fe42 6297 uint64_t aSig0, aSig1;
158142c2
FB
6298
6299 aSig1 = extractFloat128Frac1( a );
6300 aSig0 = extractFloat128Frac0( a );
6301 aExp = extractFloat128Exp( a );
6302 aSign = extractFloat128Sign( a );
6303 if ( aExp == 0x7FFF ) {
6304 if ( aSig0 | aSig1 ) {
ff32e16e 6305 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
158142c2
FB
6306 }
6307 return packFloat64( aSign, 0x7FF, 0 );
6308 }
6309 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6310 aSig0 |= ( aSig1 != 0 );
6311 if ( aExp || aSig0 ) {
6312 aSig0 |= LIT64( 0x4000000000000000 );
6313 aExp -= 0x3C01;
6314 }
ff32e16e 6315 return roundAndPackFloat64(aSign, aExp, aSig0, status);
158142c2
FB
6316
6317}
6318
158142c2
FB
6319/*----------------------------------------------------------------------------
6320| Returns the result of converting the quadruple-precision floating-point
6321| value `a' to the extended double-precision floating-point format. The
6322| conversion is performed according to the IEC/IEEE Standard for Binary
6323| Floating-Point Arithmetic.
6324*----------------------------------------------------------------------------*/
6325
e5a41ffa 6326floatx80 float128_to_floatx80(float128 a, float_status *status)
158142c2
FB
6327{
6328 flag aSign;
f4014512 6329 int32_t aExp;
bb98fe42 6330 uint64_t aSig0, aSig1;
158142c2
FB
6331
6332 aSig1 = extractFloat128Frac1( a );
6333 aSig0 = extractFloat128Frac0( a );
6334 aExp = extractFloat128Exp( a );
6335 aSign = extractFloat128Sign( a );
6336 if ( aExp == 0x7FFF ) {
6337 if ( aSig0 | aSig1 ) {
ff32e16e 6338 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
158142c2
FB
6339 }
6340 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6341 }
6342 if ( aExp == 0 ) {
6343 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6344 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6345 }
6346 else {
6347 aSig0 |= LIT64( 0x0001000000000000 );
6348 }
6349 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
ff32e16e 6350 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
158142c2
FB
6351
6352}
6353
158142c2
FB
6354/*----------------------------------------------------------------------------
6355| Rounds the quadruple-precision floating-point value `a' to an integer, and
6356| returns the result as a quadruple-precision floating-point value. The
6357| operation is performed according to the IEC/IEEE Standard for Binary
6358| Floating-Point Arithmetic.
6359*----------------------------------------------------------------------------*/
6360
e5a41ffa 6361float128 float128_round_to_int(float128 a, float_status *status)
158142c2
FB
6362{
6363 flag aSign;
f4014512 6364 int32_t aExp;
bb98fe42 6365 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
6366 float128 z;
6367
6368 aExp = extractFloat128Exp( a );
6369 if ( 0x402F <= aExp ) {
6370 if ( 0x406F <= aExp ) {
6371 if ( ( aExp == 0x7FFF )
6372 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6373 ) {
ff32e16e 6374 return propagateFloat128NaN(a, a, status);
158142c2
FB
6375 }
6376 return a;
6377 }
6378 lastBitMask = 1;
6379 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6380 roundBitsMask = lastBitMask - 1;
6381 z = a;
a2f2d288 6382 switch (status->float_rounding_mode) {
dc355b76 6383 case float_round_nearest_even:
158142c2
FB
6384 if ( lastBitMask ) {
6385 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6386 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6387 }
6388 else {
bb98fe42 6389 if ( (int64_t) z.low < 0 ) {
158142c2 6390 ++z.high;
bb98fe42 6391 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
6392 }
6393 }
dc355b76 6394 break;
f9288a76
PM
6395 case float_round_ties_away:
6396 if (lastBitMask) {
6397 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6398 } else {
6399 if ((int64_t) z.low < 0) {
6400 ++z.high;
6401 }
6402 }
6403 break;
dc355b76
PM
6404 case float_round_to_zero:
6405 break;
6406 case float_round_up:
6407 if (!extractFloat128Sign(z)) {
6408 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6409 }
6410 break;
6411 case float_round_down:
6412 if (extractFloat128Sign(z)) {
6413 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 6414 }
dc355b76
PM
6415 break;
6416 default:
6417 abort();
158142c2
FB
6418 }
6419 z.low &= ~ roundBitsMask;
6420 }
6421 else {
6422 if ( aExp < 0x3FFF ) {
bb98fe42 6423 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
a2f2d288 6424 status->float_exception_flags |= float_flag_inexact;
158142c2 6425 aSign = extractFloat128Sign( a );
a2f2d288 6426 switch (status->float_rounding_mode) {
158142c2
FB
6427 case float_round_nearest_even:
6428 if ( ( aExp == 0x3FFE )
6429 && ( extractFloat128Frac0( a )
6430 | extractFloat128Frac1( a ) )
6431 ) {
6432 return packFloat128( aSign, 0x3FFF, 0, 0 );
6433 }
6434 break;
f9288a76
PM
6435 case float_round_ties_away:
6436 if (aExp == 0x3FFE) {
6437 return packFloat128(aSign, 0x3FFF, 0, 0);
6438 }
6439 break;
158142c2
FB
6440 case float_round_down:
6441 return
6442 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6443 : packFloat128( 0, 0, 0, 0 );
6444 case float_round_up:
6445 return
6446 aSign ? packFloat128( 1, 0, 0, 0 )
6447 : packFloat128( 0, 0x3FFF, 0, 0 );
6448 }
6449 return packFloat128( aSign, 0, 0, 0 );
6450 }
6451 lastBitMask = 1;
6452 lastBitMask <<= 0x402F - aExp;
6453 roundBitsMask = lastBitMask - 1;
6454 z.low = 0;
6455 z.high = a.high;
a2f2d288 6456 switch (status->float_rounding_mode) {
dc355b76 6457 case float_round_nearest_even:
158142c2
FB
6458 z.high += lastBitMask>>1;
6459 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6460 z.high &= ~ lastBitMask;
6461 }
dc355b76 6462 break;
f9288a76
PM
6463 case float_round_ties_away:
6464 z.high += lastBitMask>>1;
6465 break;
dc355b76
PM
6466 case float_round_to_zero:
6467 break;
6468 case float_round_up:
6469 if (!extractFloat128Sign(z)) {
158142c2
FB
6470 z.high |= ( a.low != 0 );
6471 z.high += roundBitsMask;
6472 }
dc355b76
PM
6473 break;
6474 case float_round_down:
6475 if (extractFloat128Sign(z)) {
6476 z.high |= (a.low != 0);
6477 z.high += roundBitsMask;
6478 }
6479 break;
6480 default:
6481 abort();
158142c2
FB
6482 }
6483 z.high &= ~ roundBitsMask;
6484 }
6485 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
a2f2d288 6486 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6487 }
6488 return z;
6489
6490}
6491
6492/*----------------------------------------------------------------------------
6493| Returns the result of adding the absolute values of the quadruple-precision
6494| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6495| before being returned. `zSign' is ignored if the result is a NaN.
6496| The addition is performed according to the IEC/IEEE Standard for Binary
6497| Floating-Point Arithmetic.
6498*----------------------------------------------------------------------------*/
6499
e5a41ffa
PM
6500static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6501 float_status *status)
158142c2 6502{
f4014512 6503 int32_t aExp, bExp, zExp;
bb98fe42 6504 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
f4014512 6505 int32_t expDiff;
158142c2
FB
6506
6507 aSig1 = extractFloat128Frac1( a );
6508 aSig0 = extractFloat128Frac0( a );
6509 aExp = extractFloat128Exp( a );
6510 bSig1 = extractFloat128Frac1( b );
6511 bSig0 = extractFloat128Frac0( b );
6512 bExp = extractFloat128Exp( b );
6513 expDiff = aExp - bExp;
6514 if ( 0 < expDiff ) {
6515 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6516 if (aSig0 | aSig1) {
6517 return propagateFloat128NaN(a, b, status);
6518 }
158142c2
FB
6519 return a;
6520 }
6521 if ( bExp == 0 ) {
6522 --expDiff;
6523 }
6524 else {
6525 bSig0 |= LIT64( 0x0001000000000000 );
6526 }
6527 shift128ExtraRightJamming(
6528 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6529 zExp = aExp;
6530 }
6531 else if ( expDiff < 0 ) {
6532 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6533 if (bSig0 | bSig1) {
6534 return propagateFloat128NaN(a, b, status);
6535 }
158142c2
FB
6536 return packFloat128( zSign, 0x7FFF, 0, 0 );
6537 }
6538 if ( aExp == 0 ) {
6539 ++expDiff;
6540 }
6541 else {
6542 aSig0 |= LIT64( 0x0001000000000000 );
6543 }
6544 shift128ExtraRightJamming(
6545 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6546 zExp = bExp;
6547 }
6548 else {
6549 if ( aExp == 0x7FFF ) {
6550 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6551 return propagateFloat128NaN(a, b, status);
158142c2
FB
6552 }
6553 return a;
6554 }
6555 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 6556 if ( aExp == 0 ) {
a2f2d288 6557 if (status->flush_to_zero) {
e6afc87f 6558 if (zSig0 | zSig1) {
ff32e16e 6559 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
6560 }
6561 return packFloat128(zSign, 0, 0, 0);
6562 }
fe76d976
PB
6563 return packFloat128( zSign, 0, zSig0, zSig1 );
6564 }
158142c2
FB
6565 zSig2 = 0;
6566 zSig0 |= LIT64( 0x0002000000000000 );
6567 zExp = aExp;
6568 goto shiftRight1;
6569 }
6570 aSig0 |= LIT64( 0x0001000000000000 );
6571 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6572 --zExp;
6573 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6574 ++zExp;
6575 shiftRight1:
6576 shift128ExtraRightJamming(
6577 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6578 roundAndPack:
ff32e16e 6579 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6580
6581}
6582
6583/*----------------------------------------------------------------------------
6584| Returns the result of subtracting the absolute values of the quadruple-
6585| precision floating-point values `a' and `b'. If `zSign' is 1, the
6586| difference is negated before being returned. `zSign' is ignored if the
6587| result is a NaN. The subtraction is performed according to the IEC/IEEE
6588| Standard for Binary Floating-Point Arithmetic.
6589*----------------------------------------------------------------------------*/
6590
e5a41ffa
PM
6591static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6592 float_status *status)
158142c2 6593{
f4014512 6594 int32_t aExp, bExp, zExp;
bb98fe42 6595 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
f4014512 6596 int32_t expDiff;
158142c2
FB
6597
6598 aSig1 = extractFloat128Frac1( a );
6599 aSig0 = extractFloat128Frac0( a );
6600 aExp = extractFloat128Exp( a );
6601 bSig1 = extractFloat128Frac1( b );
6602 bSig0 = extractFloat128Frac0( b );
6603 bExp = extractFloat128Exp( b );
6604 expDiff = aExp - bExp;
6605 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6606 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6607 if ( 0 < expDiff ) goto aExpBigger;
6608 if ( expDiff < 0 ) goto bExpBigger;
6609 if ( aExp == 0x7FFF ) {
6610 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6611 return propagateFloat128NaN(a, b, status);
158142c2 6612 }
ff32e16e 6613 float_raise(float_flag_invalid, status);
af39bc8c 6614 return float128_default_nan(status);
158142c2
FB
6615 }
6616 if ( aExp == 0 ) {
6617 aExp = 1;
6618 bExp = 1;
6619 }
6620 if ( bSig0 < aSig0 ) goto aBigger;
6621 if ( aSig0 < bSig0 ) goto bBigger;
6622 if ( bSig1 < aSig1 ) goto aBigger;
6623 if ( aSig1 < bSig1 ) goto bBigger;
a2f2d288
PM
6624 return packFloat128(status->float_rounding_mode == float_round_down,
6625 0, 0, 0);
158142c2
FB
6626 bExpBigger:
6627 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6628 if (bSig0 | bSig1) {
6629 return propagateFloat128NaN(a, b, status);
6630 }
158142c2
FB
6631 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6632 }
6633 if ( aExp == 0 ) {
6634 ++expDiff;
6635 }
6636 else {
6637 aSig0 |= LIT64( 0x4000000000000000 );
6638 }
6639 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6640 bSig0 |= LIT64( 0x4000000000000000 );
6641 bBigger:
6642 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6643 zExp = bExp;
6644 zSign ^= 1;
6645 goto normalizeRoundAndPack;
6646 aExpBigger:
6647 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6648 if (aSig0 | aSig1) {
6649 return propagateFloat128NaN(a, b, status);
6650 }
158142c2
FB
6651 return a;
6652 }
6653 if ( bExp == 0 ) {
6654 --expDiff;
6655 }
6656 else {
6657 bSig0 |= LIT64( 0x4000000000000000 );
6658 }
6659 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6660 aSig0 |= LIT64( 0x4000000000000000 );
6661 aBigger:
6662 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6663 zExp = aExp;
6664 normalizeRoundAndPack:
6665 --zExp;
ff32e16e
PM
6666 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6667 status);
158142c2
FB
6668
6669}
6670
6671/*----------------------------------------------------------------------------
6672| Returns the result of adding the quadruple-precision floating-point values
6673| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6674| for Binary Floating-Point Arithmetic.
6675*----------------------------------------------------------------------------*/
6676
e5a41ffa 6677float128 float128_add(float128 a, float128 b, float_status *status)
158142c2
FB
6678{
6679 flag aSign, bSign;
6680
6681 aSign = extractFloat128Sign( a );
6682 bSign = extractFloat128Sign( b );
6683 if ( aSign == bSign ) {
ff32e16e 6684 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6685 }
6686 else {
ff32e16e 6687 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6688 }
6689
6690}
6691
6692/*----------------------------------------------------------------------------
6693| Returns the result of subtracting the quadruple-precision floating-point
6694| values `a' and `b'. The operation is performed according to the IEC/IEEE
6695| Standard for Binary Floating-Point Arithmetic.
6696*----------------------------------------------------------------------------*/
6697
e5a41ffa 6698float128 float128_sub(float128 a, float128 b, float_status *status)
158142c2
FB
6699{
6700 flag aSign, bSign;
6701
6702 aSign = extractFloat128Sign( a );
6703 bSign = extractFloat128Sign( b );
6704 if ( aSign == bSign ) {
ff32e16e 6705 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6706 }
6707 else {
ff32e16e 6708 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6709 }
6710
6711}
6712
6713/*----------------------------------------------------------------------------
6714| Returns the result of multiplying the quadruple-precision floating-point
6715| values `a' and `b'. The operation is performed according to the IEC/IEEE
6716| Standard for Binary Floating-Point Arithmetic.
6717*----------------------------------------------------------------------------*/
6718
e5a41ffa 6719float128 float128_mul(float128 a, float128 b, float_status *status)
158142c2
FB
6720{
6721 flag aSign, bSign, zSign;
f4014512 6722 int32_t aExp, bExp, zExp;
bb98fe42 6723 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6724
6725 aSig1 = extractFloat128Frac1( a );
6726 aSig0 = extractFloat128Frac0( a );
6727 aExp = extractFloat128Exp( a );
6728 aSign = extractFloat128Sign( a );
6729 bSig1 = extractFloat128Frac1( b );
6730 bSig0 = extractFloat128Frac0( b );
6731 bExp = extractFloat128Exp( b );
6732 bSign = extractFloat128Sign( b );
6733 zSign = aSign ^ bSign;
6734 if ( aExp == 0x7FFF ) {
6735 if ( ( aSig0 | aSig1 )
6736 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6737 return propagateFloat128NaN(a, b, status);
158142c2
FB
6738 }
6739 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6740 return packFloat128( zSign, 0x7FFF, 0, 0 );
6741 }
6742 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6743 if (bSig0 | bSig1) {
6744 return propagateFloat128NaN(a, b, status);
6745 }
158142c2
FB
6746 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6747 invalid:
ff32e16e 6748 float_raise(float_flag_invalid, status);
af39bc8c 6749 return float128_default_nan(status);
158142c2
FB
6750 }
6751 return packFloat128( zSign, 0x7FFF, 0, 0 );
6752 }
6753 if ( aExp == 0 ) {
6754 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6755 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6756 }
6757 if ( bExp == 0 ) {
6758 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6759 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6760 }
6761 zExp = aExp + bExp - 0x4000;
6762 aSig0 |= LIT64( 0x0001000000000000 );
6763 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6764 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6765 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6766 zSig2 |= ( zSig3 != 0 );
6767 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6768 shift128ExtraRightJamming(
6769 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6770 ++zExp;
6771 }
ff32e16e 6772 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6773
6774}
6775
6776/*----------------------------------------------------------------------------
6777| Returns the result of dividing the quadruple-precision floating-point value
6778| `a' by the corresponding value `b'. The operation is performed according to
6779| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6780*----------------------------------------------------------------------------*/
6781
e5a41ffa 6782float128 float128_div(float128 a, float128 b, float_status *status)
158142c2
FB
6783{
6784 flag aSign, bSign, zSign;
f4014512 6785 int32_t aExp, bExp, zExp;
bb98fe42
AF
6786 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6787 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6788
6789 aSig1 = extractFloat128Frac1( a );
6790 aSig0 = extractFloat128Frac0( a );
6791 aExp = extractFloat128Exp( a );
6792 aSign = extractFloat128Sign( a );
6793 bSig1 = extractFloat128Frac1( b );
6794 bSig0 = extractFloat128Frac0( b );
6795 bExp = extractFloat128Exp( b );
6796 bSign = extractFloat128Sign( b );
6797 zSign = aSign ^ bSign;
6798 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6799 if (aSig0 | aSig1) {
6800 return propagateFloat128NaN(a, b, status);
6801 }
158142c2 6802 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6803 if (bSig0 | bSig1) {
6804 return propagateFloat128NaN(a, b, status);
6805 }
158142c2
FB
6806 goto invalid;
6807 }
6808 return packFloat128( zSign, 0x7FFF, 0, 0 );
6809 }
6810 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6811 if (bSig0 | bSig1) {
6812 return propagateFloat128NaN(a, b, status);
6813 }
158142c2
FB
6814 return packFloat128( zSign, 0, 0, 0 );
6815 }
6816 if ( bExp == 0 ) {
6817 if ( ( bSig0 | bSig1 ) == 0 ) {
6818 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6819 invalid:
ff32e16e 6820 float_raise(float_flag_invalid, status);
af39bc8c 6821 return float128_default_nan(status);
158142c2 6822 }
ff32e16e 6823 float_raise(float_flag_divbyzero, status);
158142c2
FB
6824 return packFloat128( zSign, 0x7FFF, 0, 0 );
6825 }
6826 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6827 }
6828 if ( aExp == 0 ) {
6829 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6830 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6831 }
6832 zExp = aExp - bExp + 0x3FFD;
6833 shortShift128Left(
6834 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6835 shortShift128Left(
6836 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6837 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6838 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6839 ++zExp;
6840 }
6841 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6842 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6843 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6844 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6845 --zSig0;
6846 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6847 }
6848 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6849 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6850 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6851 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6852 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6853 --zSig1;
6854 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6855 }
6856 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6857 }
6858 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
ff32e16e 6859 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6860
6861}
6862
6863/*----------------------------------------------------------------------------
6864| Returns the remainder of the quadruple-precision floating-point value `a'
6865| with respect to the corresponding value `b'. The operation is performed
6866| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6867*----------------------------------------------------------------------------*/
6868
e5a41ffa 6869float128 float128_rem(float128 a, float128 b, float_status *status)
158142c2 6870{
ed086f3d 6871 flag aSign, zSign;
f4014512 6872 int32_t aExp, bExp, expDiff;
bb98fe42
AF
6873 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6874 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6875 int64_t sigMean0;
158142c2
FB
6876
6877 aSig1 = extractFloat128Frac1( a );
6878 aSig0 = extractFloat128Frac0( a );
6879 aExp = extractFloat128Exp( a );
6880 aSign = extractFloat128Sign( a );
6881 bSig1 = extractFloat128Frac1( b );
6882 bSig0 = extractFloat128Frac0( b );
6883 bExp = extractFloat128Exp( b );
158142c2
FB
6884 if ( aExp == 0x7FFF ) {
6885 if ( ( aSig0 | aSig1 )
6886 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6887 return propagateFloat128NaN(a, b, status);
158142c2
FB
6888 }
6889 goto invalid;
6890 }
6891 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6892 if (bSig0 | bSig1) {
6893 return propagateFloat128NaN(a, b, status);
6894 }
158142c2
FB
6895 return a;
6896 }
6897 if ( bExp == 0 ) {
6898 if ( ( bSig0 | bSig1 ) == 0 ) {
6899 invalid:
ff32e16e 6900 float_raise(float_flag_invalid, status);
af39bc8c 6901 return float128_default_nan(status);
158142c2
FB
6902 }
6903 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6904 }
6905 if ( aExp == 0 ) {
6906 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6907 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6908 }
6909 expDiff = aExp - bExp;
6910 if ( expDiff < -1 ) return a;
6911 shortShift128Left(
6912 aSig0 | LIT64( 0x0001000000000000 ),
6913 aSig1,
6914 15 - ( expDiff < 0 ),
6915 &aSig0,
6916 &aSig1
6917 );
6918 shortShift128Left(
6919 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6920 q = le128( bSig0, bSig1, aSig0, aSig1 );
6921 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6922 expDiff -= 64;
6923 while ( 0 < expDiff ) {
6924 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6925 q = ( 4 < q ) ? q - 4 : 0;
6926 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6927 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6928 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6929 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6930 expDiff -= 61;
6931 }
6932 if ( -64 < expDiff ) {
6933 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6934 q = ( 4 < q ) ? q - 4 : 0;
6935 q >>= - expDiff;
6936 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6937 expDiff += 52;
6938 if ( expDiff < 0 ) {
6939 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6940 }
6941 else {
6942 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6943 }
6944 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6945 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6946 }
6947 else {
6948 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6949 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6950 }
6951 do {
6952 alternateASig0 = aSig0;
6953 alternateASig1 = aSig1;
6954 ++q;
6955 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6956 } while ( 0 <= (int64_t) aSig0 );
158142c2 6957 add128(
bb98fe42 6958 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6959 if ( ( sigMean0 < 0 )
6960 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6961 aSig0 = alternateASig0;
6962 aSig1 = alternateASig1;
6963 }
bb98fe42 6964 zSign = ( (int64_t) aSig0 < 0 );
158142c2 6965 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
ff32e16e
PM
6966 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6967 status);
158142c2
FB
6968}
6969
6970/*----------------------------------------------------------------------------
6971| Returns the square root of the quadruple-precision floating-point value `a'.
6972| The operation is performed according to the IEC/IEEE Standard for Binary
6973| Floating-Point Arithmetic.
6974*----------------------------------------------------------------------------*/
6975
e5a41ffa 6976float128 float128_sqrt(float128 a, float_status *status)
158142c2
FB
6977{
6978 flag aSign;
f4014512 6979 int32_t aExp, zExp;
bb98fe42
AF
6980 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6981 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6982
6983 aSig1 = extractFloat128Frac1( a );
6984 aSig0 = extractFloat128Frac0( a );
6985 aExp = extractFloat128Exp( a );
6986 aSign = extractFloat128Sign( a );
6987 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6988 if (aSig0 | aSig1) {
6989 return propagateFloat128NaN(a, a, status);
6990 }
158142c2
FB
6991 if ( ! aSign ) return a;
6992 goto invalid;
6993 }
6994 if ( aSign ) {
6995 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6996 invalid:
ff32e16e 6997 float_raise(float_flag_invalid, status);
af39bc8c 6998 return float128_default_nan(status);
158142c2
FB
6999 }
7000 if ( aExp == 0 ) {
7001 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7002 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7003 }
7004 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7005 aSig0 |= LIT64( 0x0001000000000000 );
7006 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7007 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7008 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7009 doubleZSig0 = zSig0<<1;
7010 mul64To128( zSig0, zSig0, &term0, &term1 );
7011 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 7012 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
7013 --zSig0;
7014 doubleZSig0 -= 2;
7015 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7016 }
7017 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7018 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7019 if ( zSig1 == 0 ) zSig1 = 1;
7020 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7021 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7022 mul64To128( zSig1, zSig1, &term2, &term3 );
7023 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 7024 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
7025 --zSig1;
7026 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7027 term3 |= 1;
7028 term2 |= doubleZSig0;
7029 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7030 }
7031 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7032 }
7033 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
ff32e16e 7034 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
7035
7036}
7037
7038/*----------------------------------------------------------------------------
7039| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
7040| the corresponding value `b', and 0 otherwise. The invalid exception is
7041| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
7042| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7043*----------------------------------------------------------------------------*/
7044
e5a41ffa 7045int float128_eq(float128 a, float128 b, float_status *status)
158142c2
FB
7046{
7047
7048 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7049 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7050 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7051 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7052 ) {
ff32e16e 7053 float_raise(float_flag_invalid, status);
158142c2
FB
7054 return 0;
7055 }
7056 return
7057 ( a.low == b.low )
7058 && ( ( a.high == b.high )
7059 || ( ( a.low == 0 )
bb98fe42 7060 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
7061 );
7062
7063}
7064
7065/*----------------------------------------------------------------------------
7066| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
7067| or equal to the corresponding value `b', and 0 otherwise. The invalid
7068| exception is raised if either operand is a NaN. The comparison is performed
7069| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
7070*----------------------------------------------------------------------------*/
7071
e5a41ffa 7072int float128_le(float128 a, float128 b, float_status *status)
158142c2
FB
7073{
7074 flag aSign, bSign;
7075
7076 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7077 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7078 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7079 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7080 ) {
ff32e16e 7081 float_raise(float_flag_invalid, status);
158142c2
FB
7082 return 0;
7083 }
7084 aSign = extractFloat128Sign( a );
7085 bSign = extractFloat128Sign( b );
7086 if ( aSign != bSign ) {
7087 return
7088 aSign
bb98fe42 7089 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7090 == 0 );
7091 }
7092 return
7093 aSign ? le128( b.high, b.low, a.high, a.low )
7094 : le128( a.high, a.low, b.high, b.low );
7095
7096}
7097
7098/*----------------------------------------------------------------------------
7099| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
7100| the corresponding value `b', and 0 otherwise. The invalid exception is
7101| raised if either operand is a NaN. The comparison is performed according
7102| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
7103*----------------------------------------------------------------------------*/
7104
e5a41ffa 7105int float128_lt(float128 a, float128 b, float_status *status)
158142c2
FB
7106{
7107 flag aSign, bSign;
7108
7109 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7110 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7111 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7112 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7113 ) {
ff32e16e 7114 float_raise(float_flag_invalid, status);
158142c2
FB
7115 return 0;
7116 }
7117 aSign = extractFloat128Sign( a );
7118 bSign = extractFloat128Sign( b );
7119 if ( aSign != bSign ) {
7120 return
7121 aSign
bb98fe42 7122 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7123 != 0 );
7124 }
7125 return
7126 aSign ? lt128( b.high, b.low, a.high, a.low )
7127 : lt128( a.high, a.low, b.high, b.low );
7128
7129}
7130
67b7861d
AJ
7131/*----------------------------------------------------------------------------
7132| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
7133| be compared, and 0 otherwise. The invalid exception is raised if either
7134| operand is a NaN. The comparison is performed according to the IEC/IEEE
7135| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
7136*----------------------------------------------------------------------------*/
7137
e5a41ffa 7138int float128_unordered(float128 a, float128 b, float_status *status)
67b7861d
AJ
7139{
7140 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7141 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7142 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7143 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7144 ) {
ff32e16e 7145 float_raise(float_flag_invalid, status);
67b7861d
AJ
7146 return 1;
7147 }
7148 return 0;
7149}
7150
158142c2
FB
7151/*----------------------------------------------------------------------------
7152| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
7153| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7154| exception. The comparison is performed according to the IEC/IEEE Standard
7155| for Binary Floating-Point Arithmetic.
158142c2
FB
7156*----------------------------------------------------------------------------*/
7157
e5a41ffa 7158int float128_eq_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
7159{
7160
7161 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7162 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7163 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7164 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7165 ) {
af39bc8c
AM
7166 if (float128_is_signaling_nan(a, status)
7167 || float128_is_signaling_nan(b, status)) {
ff32e16e 7168 float_raise(float_flag_invalid, status);
b689362d 7169 }
158142c2
FB
7170 return 0;
7171 }
7172 return
7173 ( a.low == b.low )
7174 && ( ( a.high == b.high )
7175 || ( ( a.low == 0 )
bb98fe42 7176 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
7177 );
7178
7179}
7180
7181/*----------------------------------------------------------------------------
7182| Returns 1 if the quadruple-precision floating-point value `a' is less than
7183| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7184| cause an exception. Otherwise, the comparison is performed according to the
7185| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7186*----------------------------------------------------------------------------*/
7187
e5a41ffa 7188int float128_le_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
7189{
7190 flag aSign, bSign;
7191
7192 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7193 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7194 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7195 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7196 ) {
af39bc8c
AM
7197 if (float128_is_signaling_nan(a, status)
7198 || float128_is_signaling_nan(b, status)) {
ff32e16e 7199 float_raise(float_flag_invalid, status);
158142c2
FB
7200 }
7201 return 0;
7202 }
7203 aSign = extractFloat128Sign( a );
7204 bSign = extractFloat128Sign( b );
7205 if ( aSign != bSign ) {
7206 return
7207 aSign
bb98fe42 7208 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7209 == 0 );
7210 }
7211 return
7212 aSign ? le128( b.high, b.low, a.high, a.low )
7213 : le128( a.high, a.low, b.high, b.low );
7214
7215}
7216
7217/*----------------------------------------------------------------------------
7218| Returns 1 if the quadruple-precision floating-point value `a' is less than
7219| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7220| exception. Otherwise, the comparison is performed according to the IEC/IEEE
7221| Standard for Binary Floating-Point Arithmetic.
7222*----------------------------------------------------------------------------*/
7223
e5a41ffa 7224int float128_lt_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
7225{
7226 flag aSign, bSign;
7227
7228 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7229 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7230 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7231 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7232 ) {
af39bc8c
AM
7233 if (float128_is_signaling_nan(a, status)
7234 || float128_is_signaling_nan(b, status)) {
ff32e16e 7235 float_raise(float_flag_invalid, status);
158142c2
FB
7236 }
7237 return 0;
7238 }
7239 aSign = extractFloat128Sign( a );
7240 bSign = extractFloat128Sign( b );
7241 if ( aSign != bSign ) {
7242 return
7243 aSign
bb98fe42 7244 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
7245 != 0 );
7246 }
7247 return
7248 aSign ? lt128( b.high, b.low, a.high, a.low )
7249 : lt128( a.high, a.low, b.high, b.low );
7250
7251}
7252
67b7861d
AJ
7253/*----------------------------------------------------------------------------
7254| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7255| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7256| comparison is performed according to the IEC/IEEE Standard for Binary
7257| Floating-Point Arithmetic.
7258*----------------------------------------------------------------------------*/
7259
e5a41ffa 7260int float128_unordered_quiet(float128 a, float128 b, float_status *status)
67b7861d
AJ
7261{
7262 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7263 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7264 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7265 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7266 ) {
af39bc8c
AM
7267 if (float128_is_signaling_nan(a, status)
7268 || float128_is_signaling_nan(b, status)) {
ff32e16e 7269 float_raise(float_flag_invalid, status);
67b7861d
AJ
7270 }
7271 return 1;
7272 }
7273 return 0;
7274}
7275
1d6bda35 7276/* misc functions */
e5a41ffa 7277float32 uint32_to_float32(uint32_t a, float_status *status)
1d6bda35 7278{
ff32e16e 7279 return int64_to_float32(a, status);
1d6bda35
FB
7280}
7281
e5a41ffa 7282float64 uint32_to_float64(uint32_t a, float_status *status)
1d6bda35 7283{
ff32e16e 7284 return int64_to_float64(a, status);
1d6bda35
FB
7285}
7286
3a87d009 7287uint32_t float32_to_uint32(float32 a, float_status *status)
1d6bda35
FB
7288{
7289 int64_t v;
3a87d009 7290 uint32_t res;
34e1c27b 7291 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7292
ff32e16e 7293 v = float32_to_int64(a, status);
1d6bda35
FB
7294 if (v < 0) {
7295 res = 0;
1d6bda35
FB
7296 } else if (v > 0xffffffff) {
7297 res = 0xffffffff;
1d6bda35 7298 } else {
34e1c27b 7299 return v;
1d6bda35 7300 }
34e1c27b 7301 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7302 float_raise(float_flag_invalid, status);
1d6bda35
FB
7303 return res;
7304}
7305
3a87d009 7306uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
1d6bda35
FB
7307{
7308 int64_t v;
3a87d009 7309 uint32_t res;
34e1c27b 7310 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7311
ff32e16e 7312 v = float32_to_int64_round_to_zero(a, status);
1d6bda35
FB
7313 if (v < 0) {
7314 res = 0;
1d6bda35
FB
7315 } else if (v > 0xffffffff) {
7316 res = 0xffffffff;
1d6bda35 7317 } else {
34e1c27b 7318 return v;
1d6bda35 7319 }
34e1c27b 7320 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7321 float_raise(float_flag_invalid, status);
1d6bda35
FB
7322 return res;
7323}
7324
0bb721d7 7325int16_t float32_to_int16(float32 a, float_status *status)
f581bf54
WN
7326{
7327 int32_t v;
0bb721d7 7328 int16_t res;
f581bf54
WN
7329 int old_exc_flags = get_float_exception_flags(status);
7330
ff32e16e 7331 v = float32_to_int32(a, status);
f581bf54
WN
7332 if (v < -0x8000) {
7333 res = -0x8000;
7334 } else if (v > 0x7fff) {
7335 res = 0x7fff;
7336 } else {
7337 return v;
7338 }
7339
7340 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7341 float_raise(float_flag_invalid, status);
f581bf54
WN
7342 return res;
7343}
7344
0bb721d7 7345uint16_t float32_to_uint16(float32 a, float_status *status)
f581bf54
WN
7346{
7347 int32_t v;
0bb721d7 7348 uint16_t res;
f581bf54
WN
7349 int old_exc_flags = get_float_exception_flags(status);
7350
ff32e16e 7351 v = float32_to_int32(a, status);
f581bf54
WN
7352 if (v < 0) {
7353 res = 0;
7354 } else if (v > 0xffff) {
7355 res = 0xffff;
7356 } else {
7357 return v;
7358 }
7359
7360 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7361 float_raise(float_flag_invalid, status);
f581bf54
WN
7362 return res;
7363}
7364
0bb721d7 7365uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
cbcef455
PM
7366{
7367 int64_t v;
0bb721d7 7368 uint16_t res;
34e1c27b 7369 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7370
ff32e16e 7371 v = float32_to_int64_round_to_zero(a, status);
cbcef455
PM
7372 if (v < 0) {
7373 res = 0;
cbcef455
PM
7374 } else if (v > 0xffff) {
7375 res = 0xffff;
cbcef455 7376 } else {
34e1c27b 7377 return v;
cbcef455 7378 }
34e1c27b 7379 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7380 float_raise(float_flag_invalid, status);
cbcef455
PM
7381 return res;
7382}
7383
3a87d009 7384uint32_t float64_to_uint32(float64 a, float_status *status)
1d6bda35 7385{
5e7f654f 7386 uint64_t v;
3a87d009 7387 uint32_t res;
5e7f654f 7388 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7389
ff32e16e 7390 v = float64_to_uint64(a, status);
5e7f654f 7391 if (v > 0xffffffff) {
1d6bda35 7392 res = 0xffffffff;
1d6bda35 7393 } else {
5e7f654f 7394 return v;
1d6bda35 7395 }
5e7f654f 7396 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7397 float_raise(float_flag_invalid, status);
1d6bda35
FB
7398 return res;
7399}
7400
3a87d009 7401uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
1d6bda35 7402{
fd728f2f 7403 uint64_t v;
3a87d009 7404 uint32_t res;
fd728f2f 7405 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7406
ff32e16e 7407 v = float64_to_uint64_round_to_zero(a, status);
fd728f2f 7408 if (v > 0xffffffff) {
1d6bda35 7409 res = 0xffffffff;
1d6bda35 7410 } else {
fd728f2f 7411 return v;
1d6bda35 7412 }
fd728f2f 7413 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7414 float_raise(float_flag_invalid, status);
1d6bda35
FB
7415 return res;
7416}
7417
0bb721d7 7418int16_t float64_to_int16(float64 a, float_status *status)
f581bf54
WN
7419{
7420 int64_t v;
0bb721d7 7421 int16_t res;
f581bf54
WN
7422 int old_exc_flags = get_float_exception_flags(status);
7423
ff32e16e 7424 v = float64_to_int32(a, status);
f581bf54
WN
7425 if (v < -0x8000) {
7426 res = -0x8000;
7427 } else if (v > 0x7fff) {
7428 res = 0x7fff;
7429 } else {
7430 return v;
7431 }
7432
7433 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7434 float_raise(float_flag_invalid, status);
f581bf54
WN
7435 return res;
7436}
7437
0bb721d7 7438uint16_t float64_to_uint16(float64 a, float_status *status)
f581bf54
WN
7439{
7440 int64_t v;
0bb721d7 7441 uint16_t res;
f581bf54
WN
7442 int old_exc_flags = get_float_exception_flags(status);
7443
ff32e16e 7444 v = float64_to_int32(a, status);
f581bf54
WN
7445 if (v < 0) {
7446 res = 0;
7447 } else if (v > 0xffff) {
7448 res = 0xffff;
7449 } else {
7450 return v;
7451 }
7452
7453 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7454 float_raise(float_flag_invalid, status);
f581bf54
WN
7455 return res;
7456}
7457
0bb721d7 7458uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
cbcef455
PM
7459{
7460 int64_t v;
0bb721d7 7461 uint16_t res;
34e1c27b 7462 int old_exc_flags = get_float_exception_flags(status);
cbcef455 7463
ff32e16e 7464 v = float64_to_int64_round_to_zero(a, status);
cbcef455
PM
7465 if (v < 0) {
7466 res = 0;
cbcef455
PM
7467 } else if (v > 0xffff) {
7468 res = 0xffff;
cbcef455 7469 } else {
34e1c27b 7470 return v;
cbcef455 7471 }
34e1c27b 7472 set_float_exception_flags(old_exc_flags, status);
ff32e16e 7473 float_raise(float_flag_invalid, status);
cbcef455
PM
7474 return res;
7475}
7476
fb3ea83a
TM
7477/*----------------------------------------------------------------------------
7478| Returns the result of converting the double-precision floating-point value
7479| `a' to the 64-bit unsigned integer format. The conversion is
7480| performed according to the IEC/IEEE Standard for Binary Floating-Point
7481| Arithmetic---which means in particular that the conversion is rounded
7482| according to the current rounding mode. If `a' is a NaN, the largest
7483| positive integer is returned. If the conversion overflows, the
7484| largest unsigned integer is returned. If 'a' is negative, the value is
7485| rounded and zero is returned; negative values that do not round to zero
7486| will raise the inexact exception.
7487*----------------------------------------------------------------------------*/
75d62a58 7488
e5a41ffa 7489uint64_t float64_to_uint64(float64 a, float_status *status)
fb3ea83a
TM
7490{
7491 flag aSign;
0c48262d 7492 int aExp;
07d792d2 7493 int shiftCount;
fb3ea83a 7494 uint64_t aSig, aSigExtra;
ff32e16e 7495 a = float64_squash_input_denormal(a, status);
75d62a58 7496
fb3ea83a
TM
7497 aSig = extractFloat64Frac(a);
7498 aExp = extractFloat64Exp(a);
7499 aSign = extractFloat64Sign(a);
7500 if (aSign && (aExp > 1022)) {
ff32e16e 7501 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7502 if (float64_is_any_nan(a)) {
7503 return LIT64(0xFFFFFFFFFFFFFFFF);
7504 } else {
7505 return 0;
7506 }
7507 }
7508 if (aExp) {
7509 aSig |= LIT64(0x0010000000000000);
7510 }
7511 shiftCount = 0x433 - aExp;
7512 if (shiftCount <= 0) {
7513 if (0x43E < aExp) {
ff32e16e 7514 float_raise(float_flag_invalid, status);
fb3ea83a
TM
7515 return LIT64(0xFFFFFFFFFFFFFFFF);
7516 }
7517 aSigExtra = 0;
7518 aSig <<= -shiftCount;
7519 } else {
7520 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7521 }
ff32e16e 7522 return roundAndPackUint64(aSign, aSig, aSigExtra, status);
75d62a58
JM
7523}
7524
e5a41ffa 7525uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
75d62a58 7526{
a2f2d288 7527 signed char current_rounding_mode = status->float_rounding_mode;
ff32e16e 7528 set_float_rounding_mode(float_round_to_zero, status);
d000b477 7529 uint64_t v = float64_to_uint64(a, status);
ff32e16e 7530 set_float_rounding_mode(current_rounding_mode, status);
0a87a310 7531 return v;
75d62a58
JM
7532}
7533
1d6bda35 7534#define COMPARE(s, nan_exp) \
e5a41ffa
PM
7535static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7536 int is_quiet, float_status *status) \
1d6bda35
FB
7537{ \
7538 flag aSign, bSign; \
bb98fe42 7539 uint ## s ## _t av, bv; \
ff32e16e
PM
7540 a = float ## s ## _squash_input_denormal(a, status); \
7541 b = float ## s ## _squash_input_denormal(b, status); \
1d6bda35
FB
7542 \
7543 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7544 extractFloat ## s ## Frac( a ) ) || \
7545 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7546 extractFloat ## s ## Frac( b ) )) { \
7547 if (!is_quiet || \
af39bc8c
AM
7548 float ## s ## _is_signaling_nan(a, status) || \
7549 float ## s ## _is_signaling_nan(b, status)) { \
ff32e16e 7550 float_raise(float_flag_invalid, status); \
1d6bda35
FB
7551 } \
7552 return float_relation_unordered; \
7553 } \
7554 aSign = extractFloat ## s ## Sign( a ); \
7555 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 7556 av = float ## s ## _val(a); \
cd8a2533 7557 bv = float ## s ## _val(b); \
1d6bda35 7558 if ( aSign != bSign ) { \
bb98fe42 7559 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
7560 /* zero case */ \
7561 return float_relation_equal; \
7562 } else { \
7563 return 1 - (2 * aSign); \
7564 } \
7565 } else { \
f090c9d4 7566 if (av == bv) { \
1d6bda35
FB
7567 return float_relation_equal; \
7568 } else { \
f090c9d4 7569 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
7570 } \
7571 } \
7572} \
7573 \
e5a41ffa 7574int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
1d6bda35 7575{ \
ff32e16e 7576 return float ## s ## _compare_internal(a, b, 0, status); \
1d6bda35
FB
7577} \
7578 \
e5a41ffa
PM
7579int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7580 float_status *status) \
1d6bda35 7581{ \
ff32e16e 7582 return float ## s ## _compare_internal(a, b, 1, status); \
1d6bda35
FB
7583}
7584
7585COMPARE(32, 0xff)
7586COMPARE(64, 0x7ff)
9ee6e8bb 7587
e5a41ffa
PM
7588static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7589 int is_quiet, float_status *status)
f6714d36
AJ
7590{
7591 flag aSign, bSign;
7592
d1eb8f2a
AD
7593 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7594 float_raise(float_flag_invalid, status);
7595 return float_relation_unordered;
7596 }
f6714d36
AJ
7597 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7598 ( extractFloatx80Frac( a )<<1 ) ) ||
7599 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7600 ( extractFloatx80Frac( b )<<1 ) )) {
7601 if (!is_quiet ||
af39bc8c
AM
7602 floatx80_is_signaling_nan(a, status) ||
7603 floatx80_is_signaling_nan(b, status)) {
ff32e16e 7604 float_raise(float_flag_invalid, status);
f6714d36
AJ
7605 }
7606 return float_relation_unordered;
7607 }
7608 aSign = extractFloatx80Sign( a );
7609 bSign = extractFloatx80Sign( b );
7610 if ( aSign != bSign ) {
7611
7612 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7613 ( ( a.low | b.low ) == 0 ) ) {
7614 /* zero case */
7615 return float_relation_equal;
7616 } else {
7617 return 1 - (2 * aSign);
7618 }
7619 } else {
7620 if (a.low == b.low && a.high == b.high) {
7621 return float_relation_equal;
7622 } else {
7623 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7624 }
7625 }
7626}
7627
e5a41ffa 7628int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
f6714d36 7629{
ff32e16e 7630 return floatx80_compare_internal(a, b, 0, status);
f6714d36
AJ
7631}
7632
e5a41ffa 7633int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
f6714d36 7634{
ff32e16e 7635 return floatx80_compare_internal(a, b, 1, status);
f6714d36
AJ
7636}
7637
e5a41ffa
PM
7638static inline int float128_compare_internal(float128 a, float128 b,
7639 int is_quiet, float_status *status)
1f587329
BS
7640{
7641 flag aSign, bSign;
7642
7643 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7644 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7645 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7646 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7647 if (!is_quiet ||
af39bc8c
AM
7648 float128_is_signaling_nan(a, status) ||
7649 float128_is_signaling_nan(b, status)) {
ff32e16e 7650 float_raise(float_flag_invalid, status);
1f587329
BS
7651 }
7652 return float_relation_unordered;
7653 }
7654 aSign = extractFloat128Sign( a );
7655 bSign = extractFloat128Sign( b );
7656 if ( aSign != bSign ) {
7657 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7658 /* zero case */
7659 return float_relation_equal;
7660 } else {
7661 return 1 - (2 * aSign);
7662 }
7663 } else {
7664 if (a.low == b.low && a.high == b.high) {
7665 return float_relation_equal;
7666 } else {
7667 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7668 }
7669 }
7670}
7671
e5a41ffa 7672int float128_compare(float128 a, float128 b, float_status *status)
1f587329 7673{
ff32e16e 7674 return float128_compare_internal(a, b, 0, status);
1f587329
BS
7675}
7676
e5a41ffa 7677int float128_compare_quiet(float128 a, float128 b, float_status *status)
1f587329 7678{
ff32e16e 7679 return float128_compare_internal(a, b, 1, status);
1f587329
BS
7680}
7681
274f1b04
PM
7682/* min() and max() functions. These can't be implemented as
7683 * 'compare and pick one input' because that would mishandle
7684 * NaNs and +0 vs -0.
e17ab310
WN
7685 *
7686 * minnum() and maxnum() functions. These are similar to the min()
7687 * and max() functions but if one of the arguments is a QNaN and
7688 * the other is numerical then the numerical argument is returned.
7689 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7690 * and maxNum() operations. min() and max() are the typical min/max
7691 * semantics provided by many CPUs which predate that specification.
2d31e060
LA
7692 *
7693 * minnummag() and maxnummag() functions correspond to minNumMag()
7694 * and maxNumMag() from IEEE 754-2008.
274f1b04 7695 */
e70614ea 7696#define MINMAX(s) \
a49db98d 7697static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
2d31e060 7698 int ismin, int isieee, \
e5a41ffa
PM
7699 int ismag, \
7700 float_status *status) \
274f1b04
PM
7701{ \
7702 flag aSign, bSign; \
2d31e060 7703 uint ## s ## _t av, bv, aav, abv; \
ff32e16e
PM
7704 a = float ## s ## _squash_input_denormal(a, status); \
7705 b = float ## s ## _squash_input_denormal(b, status); \
274f1b04
PM
7706 if (float ## s ## _is_any_nan(a) || \
7707 float ## s ## _is_any_nan(b)) { \
e17ab310 7708 if (isieee) { \
af39bc8c 7709 if (float ## s ## _is_quiet_nan(a, status) && \
e17ab310
WN
7710 !float ## s ##_is_any_nan(b)) { \
7711 return b; \
af39bc8c
AM
7712 } else if (float ## s ## _is_quiet_nan(b, status) && \
7713 !float ## s ## _is_any_nan(a)) { \
e17ab310
WN
7714 return a; \
7715 } \
7716 } \
ff32e16e 7717 return propagateFloat ## s ## NaN(a, b, status); \
274f1b04
PM
7718 } \
7719 aSign = extractFloat ## s ## Sign(a); \
7720 bSign = extractFloat ## s ## Sign(b); \
7721 av = float ## s ## _val(a); \
7722 bv = float ## s ## _val(b); \
2d31e060
LA
7723 if (ismag) { \
7724 aav = float ## s ## _abs(av); \
7725 abv = float ## s ## _abs(bv); \
7726 if (aav != abv) { \
7727 if (ismin) { \
7728 return (aav < abv) ? a : b; \
7729 } else { \
7730 return (aav < abv) ? b : a; \
7731 } \
7732 } \
7733 } \
274f1b04
PM
7734 if (aSign != bSign) { \
7735 if (ismin) { \
7736 return aSign ? a : b; \
7737 } else { \
7738 return aSign ? b : a; \
7739 } \
7740 } else { \
7741 if (ismin) { \
7742 return (aSign ^ (av < bv)) ? a : b; \
7743 } else { \
7744 return (aSign ^ (av < bv)) ? b : a; \
7745 } \
7746 } \
7747} \
7748 \
e5a41ffa
PM
7749float ## s float ## s ## _min(float ## s a, float ## s b, \
7750 float_status *status) \
274f1b04 7751{ \
ff32e16e 7752 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
274f1b04
PM
7753} \
7754 \
e5a41ffa
PM
7755float ## s float ## s ## _max(float ## s a, float ## s b, \
7756 float_status *status) \
274f1b04 7757{ \
ff32e16e 7758 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
e17ab310
WN
7759} \
7760 \
e5a41ffa
PM
7761float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7762 float_status *status) \
e17ab310 7763{ \
ff32e16e 7764 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
e17ab310
WN
7765} \
7766 \
e5a41ffa
PM
7767float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7768 float_status *status) \
e17ab310 7769{ \
ff32e16e 7770 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
2d31e060
LA
7771} \
7772 \
e5a41ffa
PM
7773float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7774 float_status *status) \
2d31e060 7775{ \
ff32e16e 7776 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
2d31e060
LA
7777} \
7778 \
e5a41ffa
PM
7779float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7780 float_status *status) \
2d31e060 7781{ \
ff32e16e 7782 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
274f1b04
PM
7783}
7784
e70614ea
WN
7785MINMAX(32)
7786MINMAX(64)
274f1b04
PM
7787
7788
9ee6e8bb 7789/* Multiply A by 2 raised to the power N. */
e5a41ffa 7790float32 float32_scalbn(float32 a, int n, float_status *status)
9ee6e8bb
PB
7791{
7792 flag aSign;
326b9e98 7793 int16_t aExp;
bb98fe42 7794 uint32_t aSig;
9ee6e8bb 7795
ff32e16e 7796 a = float32_squash_input_denormal(a, status);
9ee6e8bb
PB
7797 aSig = extractFloat32Frac( a );
7798 aExp = extractFloat32Exp( a );
7799 aSign = extractFloat32Sign( a );
7800
7801 if ( aExp == 0xFF ) {
326b9e98 7802 if ( aSig ) {
ff32e16e 7803 return propagateFloat32NaN(a, a, status);
326b9e98 7804 }
9ee6e8bb
PB
7805 return a;
7806 }
3c85c37f 7807 if (aExp != 0) {
69397542 7808 aSig |= 0x00800000;
3c85c37f 7809 } else if (aSig == 0) {
69397542 7810 return a;
3c85c37f
PM
7811 } else {
7812 aExp++;
7813 }
69397542 7814
326b9e98
AJ
7815 if (n > 0x200) {
7816 n = 0x200;
7817 } else if (n < -0x200) {
7818 n = -0x200;
7819 }
7820
69397542
PB
7821 aExp += n - 1;
7822 aSig <<= 7;
ff32e16e 7823 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
9ee6e8bb
PB
7824}
7825
e5a41ffa 7826float64 float64_scalbn(float64 a, int n, float_status *status)
9ee6e8bb
PB
7827{
7828 flag aSign;
326b9e98 7829 int16_t aExp;
bb98fe42 7830 uint64_t aSig;
9ee6e8bb 7831
ff32e16e 7832 a = float64_squash_input_denormal(a, status);
9ee6e8bb
PB
7833 aSig = extractFloat64Frac( a );
7834 aExp = extractFloat64Exp( a );
7835 aSign = extractFloat64Sign( a );
7836
7837 if ( aExp == 0x7FF ) {
326b9e98 7838 if ( aSig ) {
ff32e16e 7839 return propagateFloat64NaN(a, a, status);
326b9e98 7840 }
9ee6e8bb
PB
7841 return a;
7842 }
3c85c37f 7843 if (aExp != 0) {
69397542 7844 aSig |= LIT64( 0x0010000000000000 );
3c85c37f 7845 } else if (aSig == 0) {
69397542 7846 return a;
3c85c37f
PM
7847 } else {
7848 aExp++;
7849 }
69397542 7850
326b9e98
AJ
7851 if (n > 0x1000) {
7852 n = 0x1000;
7853 } else if (n < -0x1000) {
7854 n = -0x1000;
7855 }
7856
69397542
PB
7857 aExp += n - 1;
7858 aSig <<= 10;
ff32e16e 7859 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
9ee6e8bb
PB
7860}
7861
e5a41ffa 7862floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
9ee6e8bb
PB
7863{
7864 flag aSign;
326b9e98 7865 int32_t aExp;
bb98fe42 7866 uint64_t aSig;
9ee6e8bb 7867
d1eb8f2a
AD
7868 if (floatx80_invalid_encoding(a)) {
7869 float_raise(float_flag_invalid, status);
7870 return floatx80_default_nan(status);
7871 }
9ee6e8bb
PB
7872 aSig = extractFloatx80Frac( a );
7873 aExp = extractFloatx80Exp( a );
7874 aSign = extractFloatx80Sign( a );
7875
326b9e98
AJ
7876 if ( aExp == 0x7FFF ) {
7877 if ( aSig<<1 ) {
ff32e16e 7878 return propagateFloatx80NaN(a, a, status);
326b9e98 7879 }
9ee6e8bb
PB
7880 return a;
7881 }
326b9e98 7882
3c85c37f
PM
7883 if (aExp == 0) {
7884 if (aSig == 0) {
7885 return a;
7886 }
7887 aExp++;
7888 }
69397542 7889
326b9e98
AJ
7890 if (n > 0x10000) {
7891 n = 0x10000;
7892 } else if (n < -0x10000) {
7893 n = -0x10000;
7894 }
7895
9ee6e8bb 7896 aExp += n;
a2f2d288
PM
7897 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7898 aSign, aExp, aSig, 0, status);
9ee6e8bb 7899}
9ee6e8bb 7900
e5a41ffa 7901float128 float128_scalbn(float128 a, int n, float_status *status)
9ee6e8bb
PB
7902{
7903 flag aSign;
326b9e98 7904 int32_t aExp;
bb98fe42 7905 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7906
7907 aSig1 = extractFloat128Frac1( a );
7908 aSig0 = extractFloat128Frac0( a );
7909 aExp = extractFloat128Exp( a );
7910 aSign = extractFloat128Sign( a );
7911 if ( aExp == 0x7FFF ) {
326b9e98 7912 if ( aSig0 | aSig1 ) {
ff32e16e 7913 return propagateFloat128NaN(a, a, status);
326b9e98 7914 }
9ee6e8bb
PB
7915 return a;
7916 }
3c85c37f 7917 if (aExp != 0) {
69397542 7918 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7919 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7920 return a;
3c85c37f
PM
7921 } else {
7922 aExp++;
7923 }
69397542 7924
326b9e98
AJ
7925 if (n > 0x10000) {
7926 n = 0x10000;
7927 } else if (n < -0x10000) {
7928 n = -0x10000;
7929 }
7930
69397542
PB
7931 aExp += n - 1;
7932 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
ff32e16e 7933 , status);
9ee6e8bb
PB
7934
7935}