4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    inexact flag
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 #define float_cmask(bit) (1u << (bit))
475 float_cmask_zero
= float_cmask(float_class_zero
),
476 float_cmask_normal
= float_cmask(float_class_normal
),
477 float_cmask_inf
= float_cmask(float_class_inf
),
478 float_cmask_qnan
= float_cmask(float_class_qnan
),
479 float_cmask_snan
= float_cmask(float_class_snan
),
481 float_cmask_infzero
= float_cmask_zero
| float_cmask_inf
,
482 float_cmask_anynan
= float_cmask_qnan
| float_cmask_snan
,
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
503 * Structure holding all of the decomposed parts of a float.
504 * The exponent is unbiased and the fraction is normalized.
506 * The fraction words are stored in big-endian word ordering,
507 * so that truncation from a larger format to a smaller format
508 * can be done simply by ignoring subsequent elements.
516 /* Routines that know the structure may reference the singular name. */
519 * Routines expanded with multiple structures reference "hi" and "lo"
520 * depending on the operation. In FloatParts64, "hi" and "lo" are
521 * both the same word and aliased here.
541 uint64_t frac_hm
; /* high-middle */
542 uint64_t frac_lm
; /* low-middle */
546 /* These apply to the most significant word of each FloatPartsN. */
547 #define DECOMPOSED_BINARY_POINT 63
548 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
550 /* Structure holding all of the relevant parameters for a format.
551 * exp_size: the size of the exponent field
552 * exp_bias: the offset applied to the exponent field
553 * exp_max: the maximum normalised exponent
554 * frac_size: the size of the fraction field
555 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
556 * The following are computed based the size of fraction
557 * frac_lsb: least significant bit of fraction
558 * frac_lsbm1: the bit below the least significant bit (for rounding)
559 * round_mask/roundeven_mask: masks used for rounding
560 * The following optional modifiers are available:
561 * arm_althp: handle ARM Alternative Half Precision
572 uint64_t roundeven_mask
;
576 /* Expand fields based on the size of exponent and fraction */
/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                           \
    .exp_size       = E,                             \
    .exp_bias       = ((1 << E) - 1) >> 1,           \
    .exp_max        = (1 << E) - 1,                  \
    .frac_size      = F,                             \
    .frac_shift     = (-F - 1) & 63,                 \
    .frac_lsb       = 1ull << ((-F - 1) & 63),       \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),       \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1, \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
588 static const FloatFmt float16_params
= {
592 static const FloatFmt float16_params_ahp
= {
597 static const FloatFmt bfloat16_params
= {
601 static const FloatFmt float32_params
= {
605 static const FloatFmt float64_params
= {
609 static const FloatFmt float128_params
= {
610 FLOAT_PARAMS(15, 112)
613 /* Unpack a float to parts, but do not canonicalize. */
614 static void unpack_raw64(FloatParts64
*r
, const FloatFmt
*fmt
, uint64_t raw
)
616 const int f_size
= fmt
->frac_size
;
617 const int e_size
= fmt
->exp_size
;
619 *r
= (FloatParts64
) {
620 .cls
= float_class_unclassified
,
621 .sign
= extract64(raw
, f_size
+ e_size
, 1),
622 .exp
= extract64(raw
, f_size
, e_size
),
623 .frac
= extract64(raw
, 0, f_size
)
627 static inline void float16_unpack_raw(FloatParts64
*p
, float16 f
)
629 unpack_raw64(p
, &float16_params
, f
);
632 static inline void bfloat16_unpack_raw(FloatParts64
*p
, bfloat16 f
)
634 unpack_raw64(p
, &bfloat16_params
, f
);
637 static inline void float32_unpack_raw(FloatParts64
*p
, float32 f
)
639 unpack_raw64(p
, &float32_params
, f
);
642 static inline void float64_unpack_raw(FloatParts64
*p
, float64 f
)
644 unpack_raw64(p
, &float64_params
, f
);
647 static void float128_unpack_raw(FloatParts128
*p
, float128 f
)
649 const int f_size
= float128_params
.frac_size
- 64;
650 const int e_size
= float128_params
.exp_size
;
652 *p
= (FloatParts128
) {
653 .cls
= float_class_unclassified
,
654 .sign
= extract64(f
.high
, f_size
+ e_size
, 1),
655 .exp
= extract64(f
.high
, f_size
, e_size
),
656 .frac_hi
= extract64(f
.high
, 0, f_size
),
661 /* Pack a float from parts, but do not canonicalize. */
662 static uint64_t pack_raw64(const FloatParts64
*p
, const FloatFmt
*fmt
)
664 const int f_size
= fmt
->frac_size
;
665 const int e_size
= fmt
->exp_size
;
668 ret
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
669 ret
= deposit64(ret
, f_size
, e_size
, p
->exp
);
670 ret
= deposit64(ret
, 0, f_size
, p
->frac
);
674 static inline float16
float16_pack_raw(const FloatParts64
*p
)
676 return make_float16(pack_raw64(p
, &float16_params
));
679 static inline bfloat16
bfloat16_pack_raw(const FloatParts64
*p
)
681 return pack_raw64(p
, &bfloat16_params
);
684 static inline float32
float32_pack_raw(const FloatParts64
*p
)
686 return make_float32(pack_raw64(p
, &float32_params
));
689 static inline float64
float64_pack_raw(const FloatParts64
*p
)
691 return make_float64(pack_raw64(p
, &float64_params
));
694 static float128
float128_pack_raw(const FloatParts128
*p
)
696 const int f_size
= float128_params
.frac_size
- 64;
697 const int e_size
= float128_params
.exp_size
;
700 hi
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
701 hi
= deposit64(hi
, f_size
, e_size
, p
->exp
);
702 hi
= deposit64(hi
, 0, f_size
, p
->frac_hi
);
703 return make_float128(hi
, p
->frac_lo
);
706 /*----------------------------------------------------------------------------
707 | Functions and definitions to determine: (1) whether tininess for underflow
708 | is detected before or after rounding by default, (2) what (if anything)
709 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
710 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
711 | are propagated from function inputs to output. These details are target-
713 *----------------------------------------------------------------------------*/
714 #include "softfloat-specialize.c.inc"
716 #define PARTS_GENERIC_64_128(NAME, P) \
717 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
719 #define PARTS_GENERIC_64_128_256(NAME, P) \
720 QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
721 (FloatParts128 *, parts128_##NAME), parts64_##NAME)
723 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
724 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)
726 static void parts64_return_nan(FloatParts64
*a
, float_status
*s
);
727 static void parts128_return_nan(FloatParts128
*a
, float_status
*s
);
729 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)
731 static FloatParts64
*parts64_pick_nan(FloatParts64
*a
, FloatParts64
*b
,
733 static FloatParts128
*parts128_pick_nan(FloatParts128
*a
, FloatParts128
*b
,
736 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
738 static FloatParts64
*parts64_pick_nan_muladd(FloatParts64
*a
, FloatParts64
*b
,
739 FloatParts64
*c
, float_status
*s
,
740 int ab_mask
, int abc_mask
);
741 static FloatParts128
*parts128_pick_nan_muladd(FloatParts128
*a
,
745 int ab_mask
, int abc_mask
);
747 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
748 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
750 static void parts64_canonicalize(FloatParts64
*p
, float_status
*status
,
751 const FloatFmt
*fmt
);
752 static void parts128_canonicalize(FloatParts128
*p
, float_status
*status
,
753 const FloatFmt
*fmt
);
755 #define parts_canonicalize(A, S, F) \
756 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
758 static void parts64_uncanon(FloatParts64
*p
, float_status
*status
,
759 const FloatFmt
*fmt
);
760 static void parts128_uncanon(FloatParts128
*p
, float_status
*status
,
761 const FloatFmt
*fmt
);
763 #define parts_uncanon(A, S, F) \
764 PARTS_GENERIC_64_128(uncanon, A)(A, S, F)
766 static void parts64_add_normal(FloatParts64
*a
, FloatParts64
*b
);
767 static void parts128_add_normal(FloatParts128
*a
, FloatParts128
*b
);
768 static void parts256_add_normal(FloatParts256
*a
, FloatParts256
*b
);
770 #define parts_add_normal(A, B) \
771 PARTS_GENERIC_64_128_256(add_normal, A)(A, B)
773 static bool parts64_sub_normal(FloatParts64
*a
, FloatParts64
*b
);
774 static bool parts128_sub_normal(FloatParts128
*a
, FloatParts128
*b
);
775 static bool parts256_sub_normal(FloatParts256
*a
, FloatParts256
*b
);
777 #define parts_sub_normal(A, B) \
778 PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)
780 static FloatParts64
*parts64_addsub(FloatParts64
*a
, FloatParts64
*b
,
781 float_status
*s
, bool subtract
);
782 static FloatParts128
*parts128_addsub(FloatParts128
*a
, FloatParts128
*b
,
783 float_status
*s
, bool subtract
);
785 #define parts_addsub(A, B, S, Z) \
786 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)
788 static FloatParts64
*parts64_mul(FloatParts64
*a
, FloatParts64
*b
,
790 static FloatParts128
*parts128_mul(FloatParts128
*a
, FloatParts128
*b
,
793 #define parts_mul(A, B, S) \
794 PARTS_GENERIC_64_128(mul, A)(A, B, S)
796 static FloatParts64
*parts64_muladd(FloatParts64
*a
, FloatParts64
*b
,
797 FloatParts64
*c
, int flags
,
799 static FloatParts128
*parts128_muladd(FloatParts128
*a
, FloatParts128
*b
,
800 FloatParts128
*c
, int flags
,
803 #define parts_muladd(A, B, C, Z, S) \
804 PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)
807 * Helper functions for softfloat-parts.c.inc, per-size operations.
810 #define FRAC_GENERIC_64_128(NAME, P) \
811 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
813 #define FRAC_GENERIC_64_128_256(NAME, P) \
814 QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
815 (FloatParts128 *, frac128_##NAME), frac64_##NAME)
817 static bool frac64_add(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
819 return uadd64_overflow(a
->frac
, b
->frac
, &r
->frac
);
822 static bool frac128_add(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
825 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
826 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
830 static bool frac256_add(FloatParts256
*r
, FloatParts256
*a
, FloatParts256
*b
)
833 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
834 r
->frac_lm
= uadd64_carry(a
->frac_lm
, b
->frac_lm
, &c
);
835 r
->frac_hm
= uadd64_carry(a
->frac_hm
, b
->frac_hm
, &c
);
836 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
840 #define frac_add(R, A, B) FRAC_GENERIC_64_128_256(add, R)(R, A, B)
842 static bool frac64_addi(FloatParts64
*r
, FloatParts64
*a
, uint64_t c
)
844 return uadd64_overflow(a
->frac
, c
, &r
->frac
);
847 static bool frac128_addi(FloatParts128
*r
, FloatParts128
*a
, uint64_t c
)
849 c
= uadd64_overflow(a
->frac_lo
, c
, &r
->frac_lo
);
850 return uadd64_overflow(a
->frac_hi
, c
, &r
->frac_hi
);
853 #define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C)
855 static void frac64_allones(FloatParts64
*a
)
860 static void frac128_allones(FloatParts128
*a
)
862 a
->frac_hi
= a
->frac_lo
= -1;
865 #define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A)
867 static int frac64_cmp(FloatParts64
*a
, FloatParts64
*b
)
869 return a
->frac
== b
->frac
? 0 : a
->frac
< b
->frac
? -1 : 1;
872 static int frac128_cmp(FloatParts128
*a
, FloatParts128
*b
)
874 uint64_t ta
= a
->frac_hi
, tb
= b
->frac_hi
;
876 ta
= a
->frac_lo
, tb
= b
->frac_lo
;
881 return ta
< tb
? -1 : 1;
884 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
886 static void frac64_clear(FloatParts64
*a
)
891 static void frac128_clear(FloatParts128
*a
)
893 a
->frac_hi
= a
->frac_lo
= 0;
896 #define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)
898 static bool frac64_eqz(FloatParts64
*a
)
903 static bool frac128_eqz(FloatParts128
*a
)
905 return (a
->frac_hi
| a
->frac_lo
) == 0;
908 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
910 static void frac64_mulw(FloatParts128
*r
, FloatParts64
*a
, FloatParts64
*b
)
912 mulu64(&r
->frac_lo
, &r
->frac_hi
, a
->frac
, b
->frac
);
915 static void frac128_mulw(FloatParts256
*r
, FloatParts128
*a
, FloatParts128
*b
)
917 mul128To256(a
->frac_hi
, a
->frac_lo
, b
->frac_hi
, b
->frac_lo
,
918 &r
->frac_hi
, &r
->frac_hm
, &r
->frac_lm
, &r
->frac_lo
);
921 #define frac_mulw(R, A, B) FRAC_GENERIC_64_128(mulw, A)(R, A, B)
923 static void frac64_neg(FloatParts64
*a
)
928 static void frac128_neg(FloatParts128
*a
)
931 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
932 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
935 static void frac256_neg(FloatParts256
*a
)
938 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
939 a
->frac_lm
= usub64_borrow(0, a
->frac_lm
, &c
);
940 a
->frac_hm
= usub64_borrow(0, a
->frac_hm
, &c
);
941 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
944 #define frac_neg(A) FRAC_GENERIC_64_128_256(neg, A)(A)
946 static int frac64_normalize(FloatParts64
*a
)
949 int shift
= clz64(a
->frac
);
956 static int frac128_normalize(FloatParts128
*a
)
959 int shl
= clz64(a
->frac_hi
);
962 a
->frac_hi
= (a
->frac_hi
<< shl
) | (a
->frac_lo
>> shr
);
963 a
->frac_lo
= (a
->frac_lo
<< shl
);
966 } else if (a
->frac_lo
) {
967 int shl
= clz64(a
->frac_lo
);
968 a
->frac_hi
= (a
->frac_lo
<< shl
);
975 static int frac256_normalize(FloatParts256
*a
)
977 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_hm
;
978 uint64_t a2
= a
->frac_lm
, a3
= a
->frac_lo
;
990 a0
= a1
, a1
= a2
, a2
= a3
, a3
= 0;
993 a0
= a2
, a1
= a3
, a2
= 0, a3
= 0;
996 a0
= a3
, a1
= 0, a2
= 0, a3
= 0;
999 a0
= 0, a1
= 0, a2
= 0, a3
= 0;
1010 a0
= (a0
<< shl
) | (a1
>> shr
);
1011 a1
= (a1
<< shl
) | (a2
>> shr
);
1012 a2
= (a2
<< shl
) | (a3
>> shr
);
1023 #define frac_normalize(A) FRAC_GENERIC_64_128_256(normalize, A)(A)
1025 static void frac64_shl(FloatParts64
*a
, int c
)
1030 static void frac128_shl(FloatParts128
*a
, int c
)
1032 shift128Left(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
1035 #define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)
1037 static void frac64_shr(FloatParts64
*a
, int c
)
1042 static void frac128_shr(FloatParts128
*a
, int c
)
1044 shift128Right(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
1047 #define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)
1049 static void frac64_shrjam(FloatParts64
*a
, int c
)
1051 shift64RightJamming(a
->frac
, c
, &a
->frac
);
1054 static void frac128_shrjam(FloatParts128
*a
, int c
)
1056 shift128RightJamming(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
1059 static void frac256_shrjam(FloatParts256
*a
, int c
)
1061 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_hm
;
1062 uint64_t a2
= a
->frac_lm
, a3
= a
->frac_lo
;
1063 uint64_t sticky
= 0;
1066 if (unlikely(c
== 0)) {
1068 } else if (likely(c
< 64)) {
1070 } else if (likely(c
< 256)) {
1071 if (unlikely(c
& 128)) {
1073 a3
= a1
, a2
= a0
, a1
= 0, a0
= 0;
1075 if (unlikely(c
& 64)) {
1077 a3
= a2
, a2
= a1
, a1
= a0
, a0
= 0;
1084 sticky
= a0
| a1
| a2
| a3
;
1085 a0
= a1
= a2
= a3
= 0;
1090 sticky
|= a3
<< invc
;
1091 a3
= (a3
>> c
) | (a2
<< invc
);
1092 a2
= (a2
>> c
) | (a1
<< invc
);
1093 a1
= (a1
>> c
) | (a0
<< invc
);
1097 a
->frac_lo
= a3
| (sticky
!= 0);
1103 #define frac_shrjam(A, C) FRAC_GENERIC_64_128_256(shrjam, A)(A, C)
1105 static bool frac64_sub(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
1107 return usub64_overflow(a
->frac
, b
->frac
, &r
->frac
);
1110 static bool frac128_sub(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
1113 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
1114 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
1118 static bool frac256_sub(FloatParts256
*r
, FloatParts256
*a
, FloatParts256
*b
)
1121 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
1122 r
->frac_lm
= usub64_borrow(a
->frac_lm
, b
->frac_lm
, &c
);
1123 r
->frac_hm
= usub64_borrow(a
->frac_hm
, b
->frac_hm
, &c
);
1124 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
1128 #define frac_sub(R, A, B) FRAC_GENERIC_64_128_256(sub, R)(R, A, B)
1130 static void frac64_truncjam(FloatParts64
*r
, FloatParts128
*a
)
1132 r
->frac
= a
->frac_hi
| (a
->frac_lo
!= 0);
1135 static void frac128_truncjam(FloatParts128
*r
, FloatParts256
*a
)
1137 r
->frac_hi
= a
->frac_hi
;
1138 r
->frac_lo
= a
->frac_hm
| ((a
->frac_lm
| a
->frac_lo
) != 0);
1141 #define frac_truncjam(R, A) FRAC_GENERIC_64_128(truncjam, R)(R, A)
1143 static void frac64_widen(FloatParts128
*r
, FloatParts64
*a
)
1145 r
->frac_hi
= a
->frac
;
1149 static void frac128_widen(FloatParts256
*r
, FloatParts128
*a
)
1151 r
->frac_hi
= a
->frac_hi
;
1152 r
->frac_hm
= a
->frac_lo
;
1157 #define frac_widen(A, B) FRAC_GENERIC_64_128(widen, B)(A, B)
1159 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
1160 #define FloatPartsN glue(FloatParts,N)
1161 #define FloatPartsW glue(FloatParts,W)
1166 #include "softfloat-parts-addsub.c.inc"
1167 #include "softfloat-parts.c.inc"
1174 #include "softfloat-parts-addsub.c.inc"
1175 #include "softfloat-parts.c.inc"
1181 #include "softfloat-parts-addsub.c.inc"
1190 * Pack/unpack routines with a specific FloatFmt.
/*
 * Unpack the raw float16 F into *P and canonicalize it, using PARAMS to
 * select the half-precision format (IEEE vs ARM-alternative — see callers).
 */
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}
1200 static void float16_unpack_canonical(FloatParts64
*p
, float16 f
,
1203 float16a_unpack_canonical(p
, f
, s
, &float16_params
);
1206 static void bfloat16_unpack_canonical(FloatParts64
*p
, bfloat16 f
,
1209 bfloat16_unpack_raw(p
, f
);
1210 parts_canonicalize(p
, s
, &bfloat16_params
);
1213 static float16
float16a_round_pack_canonical(FloatParts64
*p
,
1215 const FloatFmt
*params
)
1217 parts_uncanon(p
, s
, params
);
1218 return float16_pack_raw(p
);
1221 static float16
float16_round_pack_canonical(FloatParts64
*p
,
1224 return float16a_round_pack_canonical(p
, s
, &float16_params
);
1227 static bfloat16
bfloat16_round_pack_canonical(FloatParts64
*p
,
1230 parts_uncanon(p
, s
, &bfloat16_params
);
1231 return bfloat16_pack_raw(p
);
1234 static void float32_unpack_canonical(FloatParts64
*p
, float32 f
,
1237 float32_unpack_raw(p
, f
);
1238 parts_canonicalize(p
, s
, &float32_params
);
1241 static float32
float32_round_pack_canonical(FloatParts64
*p
,
1244 parts_uncanon(p
, s
, &float32_params
);
1245 return float32_pack_raw(p
);
1248 static void float64_unpack_canonical(FloatParts64
*p
, float64 f
,
1251 float64_unpack_raw(p
, f
);
1252 parts_canonicalize(p
, s
, &float64_params
);
1255 static float64
float64_round_pack_canonical(FloatParts64
*p
,
1258 parts_uncanon(p
, s
, &float64_params
);
1259 return float64_pack_raw(p
);
1262 static void float128_unpack_canonical(FloatParts128
*p
, float128 f
,
1265 float128_unpack_raw(p
, f
);
1266 parts_canonicalize(p
, s
, &float128_params
);
1269 static float128
float128_round_pack_canonical(FloatParts128
*p
,
1272 parts_uncanon(p
, s
, &float128_params
);
1273 return float128_pack_raw(p
);
1277 * Addition and subtraction
1280 static float16 QEMU_FLATTEN
1281 float16_addsub(float16 a
, float16 b
, float_status
*status
, bool subtract
)
1283 FloatParts64 pa
, pb
, *pr
;
1285 float16_unpack_canonical(&pa
, a
, status
);
1286 float16_unpack_canonical(&pb
, b
, status
);
1287 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1289 return float16_round_pack_canonical(pr
, status
);
/* float16 addition; forwards to float16_addsub with subtract=false. */
float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}
/* float16 subtraction; forwards to float16_addsub with subtract=true. */
float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}
1302 static float32 QEMU_SOFTFLOAT_ATTR
1303 soft_f32_addsub(float32 a
, float32 b
, float_status
*status
, bool subtract
)
1305 FloatParts64 pa
, pb
, *pr
;
1307 float32_unpack_canonical(&pa
, a
, status
);
1308 float32_unpack_canonical(&pb
, b
, status
);
1309 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1311 return float32_round_pack_canonical(pr
, status
);
/* Softfloat float32 addition; forwards to soft_f32_addsub. */
static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}
/* Softfloat float32 subtraction; forwards to soft_f32_addsub. */
static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}
1324 static float64 QEMU_SOFTFLOAT_ATTR
1325 soft_f64_addsub(float64 a
, float64 b
, float_status
*status
, bool subtract
)
1327 FloatParts64 pa
, pb
, *pr
;
1329 float64_unpack_canonical(&pa
, a
, status
);
1330 float64_unpack_canonical(&pb
, b
, status
);
1331 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1333 return float64_round_pack_canonical(pr
, status
);
/* Softfloat float64 addition; forwards to soft_f64_addsub. */
static float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, false);
}
/* Softfloat float64 subtraction; forwards to soft_f64_addsub. */
static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, true);
}
1346 static float hard_f32_add(float a
, float b
)
1351 static float hard_f32_sub(float a
, float b
)
1356 static double hard_f64_add(double a
, double b
)
1361 static double hard_f64_sub(double a
, double b
)
1366 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1368 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1369 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1371 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1374 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1376 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1377 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1379 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
/*
 * Dispatch float32 add/sub through float32_gen2 with the shared
 * add/sub/mul operand checks (HARD is tried when the checks pass,
 * otherwise SOFT is used).
 */
static float32
float32_addsub(float32 a, float32 b, float_status *s,
               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}
/*
 * Dispatch float64 add/sub through float64_gen2 with the shared
 * add/sub/mul operand checks.
 */
static float64
float64_addsub(float64 a, float64 b, float_status *s,
               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}
/* float32 addition, with hardfloat fast path. */
float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}
/* float32 subtraction, with hardfloat fast path. */
float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}
/* float64 addition, with hardfloat fast path. */
float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}
/* float64 subtraction, with hardfloat fast path. */
float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}
1421 static bfloat16 QEMU_FLATTEN
1422 bfloat16_addsub(bfloat16 a
, bfloat16 b
, float_status
*status
, bool subtract
)
1424 FloatParts64 pa
, pb
, *pr
;
1426 bfloat16_unpack_canonical(&pa
, a
, status
);
1427 bfloat16_unpack_canonical(&pb
, b
, status
);
1428 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1430 return bfloat16_round_pack_canonical(pr
, status
);
/* bfloat16 addition; forwards to bfloat16_addsub. */
bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, false);
}
/* bfloat16 subtraction; forwards to bfloat16_addsub. */
bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, true);
}
1443 static float128 QEMU_FLATTEN
1444 float128_addsub(float128 a
, float128 b
, float_status
*status
, bool subtract
)
1446 FloatParts128 pa
, pb
, *pr
;
1448 float128_unpack_canonical(&pa
, a
, status
);
1449 float128_unpack_canonical(&pb
, b
, status
);
1450 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1452 return float128_round_pack_canonical(pr
, status
);
/* float128 addition; forwards to float128_addsub. */
float128 float128_add(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, false);
}
/* float128 subtraction; forwards to float128_addsub. */
float128 float128_sub(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, true);
}
1469 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1471 FloatParts64 pa
, pb
, *pr
;
1473 float16_unpack_canonical(&pa
, a
, status
);
1474 float16_unpack_canonical(&pb
, b
, status
);
1475 pr
= parts_mul(&pa
, &pb
, status
);
1477 return float16_round_pack_canonical(pr
, status
);
1480 static float32 QEMU_SOFTFLOAT_ATTR
1481 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1483 FloatParts64 pa
, pb
, *pr
;
1485 float32_unpack_canonical(&pa
, a
, status
);
1486 float32_unpack_canonical(&pb
, b
, status
);
1487 pr
= parts_mul(&pa
, &pb
, status
);
1489 return float32_round_pack_canonical(pr
, status
);
1492 static float64 QEMU_SOFTFLOAT_ATTR
1493 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1495 FloatParts64 pa
, pb
, *pr
;
1497 float64_unpack_canonical(&pa
, a
, status
);
1498 float64_unpack_canonical(&pb
, b
, status
);
1499 pr
= parts_mul(&pa
, &pb
, status
);
1501 return float64_round_pack_canonical(pr
, status
);
1504 static float hard_f32_mul(float a
, float b
)
1509 static double hard_f64_mul(double a
, double b
)
/* float32 multiply: hardfloat fast path via float32_gen2, shared
 * add/sub/mul operand checks. */
float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}
/* float64 multiply: hardfloat fast path via float64_gen2, shared
 * add/sub/mul operand checks. */
float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}
1528 bfloat16 QEMU_FLATTEN
1529 bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1531 FloatParts64 pa
, pb
, *pr
;
1533 bfloat16_unpack_canonical(&pa
, a
, status
);
1534 bfloat16_unpack_canonical(&pb
, b
, status
);
1535 pr
= parts_mul(&pa
, &pb
, status
);
1537 return bfloat16_round_pack_canonical(pr
, status
);
1540 float128 QEMU_FLATTEN
1541 float128_mul(float128 a
, float128 b
, float_status
*status
)
1543 FloatParts128 pa
, pb
, *pr
;
1545 float128_unpack_canonical(&pa
, a
, status
);
1546 float128_unpack_canonical(&pb
, b
, status
);
1547 pr
= parts_mul(&pa
, &pb
, status
);
1549 return float128_round_pack_canonical(pr
, status
);
1553 * Fused multiply-add
1556 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1557 int flags
, float_status
*status
)
1559 FloatParts64 pa
, pb
, pc
, *pr
;
1561 float16_unpack_canonical(&pa
, a
, status
);
1562 float16_unpack_canonical(&pb
, b
, status
);
1563 float16_unpack_canonical(&pc
, c
, status
);
1564 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1566 return float16_round_pack_canonical(pr
, status
);
1569 static float32 QEMU_SOFTFLOAT_ATTR
1570 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1571 float_status
*status
)
1573 FloatParts64 pa
, pb
, pc
, *pr
;
1575 float32_unpack_canonical(&pa
, a
, status
);
1576 float32_unpack_canonical(&pb
, b
, status
);
1577 float32_unpack_canonical(&pc
, c
, status
);
1578 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1580 return float32_round_pack_canonical(pr
, status
);
1583 static float64 QEMU_SOFTFLOAT_ATTR
1584 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1585 float_status
*status
)
1587 FloatParts64 pa
, pb
, pc
, *pr
;
1589 float64_unpack_canonical(&pa
, a
, status
);
1590 float64_unpack_canonical(&pb
, b
, status
);
1591 float64_unpack_canonical(&pc
, c
, status
);
1592 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1594 return float64_round_pack_canonical(pr
, status
);
/*
 * When set, always take the softfloat FMA path.
 * NOTE(review): not written anywhere in this chunk — presumably a
 * test/benchmark knob; confirm where it is assigned.
 */
static bool force_soft_fma;
1599 float32 QEMU_FLATTEN
1600 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1602 union_float32 ua
, ub
, uc
, ur
;
1608 if (unlikely(!can_use_fpu(s
))) {
1611 if (unlikely(flags
& float_muladd_halve_result
)) {
1615 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1616 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1620 if (unlikely(force_soft_fma
)) {
1625 * When (a || b) == 0, there's no need to check for under/over flow,
1626 * since we know the addend is (normal || 0) and the product is 0.
1628 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1632 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1633 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1634 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1636 if (flags
& float_muladd_negate_c
) {
1641 union_float32 ua_orig
= ua
;
1642 union_float32 uc_orig
= uc
;
1644 if (flags
& float_muladd_negate_product
) {
1647 if (flags
& float_muladd_negate_c
) {
1651 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1653 if (unlikely(f32_is_inf(ur
))) {
1654 float_raise(float_flag_overflow
, s
);
1655 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1661 if (flags
& float_muladd_negate_result
) {
1662 return float32_chs(ur
.s
);
1667 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1670 float64 QEMU_FLATTEN
1671 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1673 union_float64 ua
, ub
, uc
, ur
;
1679 if (unlikely(!can_use_fpu(s
))) {
1682 if (unlikely(flags
& float_muladd_halve_result
)) {
1686 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1687 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1691 if (unlikely(force_soft_fma
)) {
1696 * When (a || b) == 0, there's no need to check for under/over flow,
1697 * since we know the addend is (normal || 0) and the product is 0.
1699 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1703 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1704 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1705 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1707 if (flags
& float_muladd_negate_c
) {
1712 union_float64 ua_orig
= ua
;
1713 union_float64 uc_orig
= uc
;
1715 if (flags
& float_muladd_negate_product
) {
1718 if (flags
& float_muladd_negate_c
) {
1722 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1724 if (unlikely(f64_is_inf(ur
))) {
1725 float_raise(float_flag_overflow
, s
);
1726 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1732 if (flags
& float_muladd_negate_result
) {
1733 return float64_chs(ur
.s
);
1738 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1741 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1742 int flags
, float_status
*status
)
1744 FloatParts64 pa
, pb
, pc
, *pr
;
1746 bfloat16_unpack_canonical(&pa
, a
, status
);
1747 bfloat16_unpack_canonical(&pb
, b
, status
);
1748 bfloat16_unpack_canonical(&pc
, c
, status
);
1749 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1751 return bfloat16_round_pack_canonical(pr
, status
);
1754 float128 QEMU_FLATTEN
float128_muladd(float128 a
, float128 b
, float128 c
,
1755 int flags
, float_status
*status
)
1757 FloatParts128 pa
, pb
, pc
, *pr
;
1759 float128_unpack_canonical(&pa
, a
, status
);
1760 float128_unpack_canonical(&pb
, b
, status
);
1761 float128_unpack_canonical(&pc
, c
, status
);
1762 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1764 return float128_round_pack_canonical(pr
, status
);
1768 * Returns the result of dividing the floating-point value `a' by the
1769 * corresponding value `b'. The operation is performed according to
1770 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1773 static FloatParts64
div_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1775 bool sign
= a
.sign
^ b
.sign
;
1777 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1778 uint64_t n0
, n1
, q
, r
;
1779 int exp
= a
.exp
- b
.exp
;
1782 * We want a 2*N / N-bit division to produce exactly an N-bit
1783 * result, so that we do not lose any precision and so that we
1784 * do not have to renormalize afterward. If A.frac < B.frac,
1785 * then division would produce an (N-1)-bit result; shift A left
1786 * by one to produce the an N-bit result, and decrement the
1787 * exponent to match.
1789 * The udiv_qrnnd algorithm that we're using requires normalization,
1790 * i.e. the msb of the denominator must be set, which is already true.
1792 if (a
.frac
< b
.frac
) {
1794 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1796 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
, &n1
, &n0
);
1798 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
);
1800 /* Set lsb if there is a remainder, to set inexact. */
1801 a
.frac
= q
| (r
!= 0);
1806 /* handle all the NaN cases */
1807 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1808 return *parts_pick_nan(&a
, &b
, s
);
1810 /* 0/0 or Inf/Inf */
1813 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1814 float_raise(float_flag_invalid
, s
);
1815 parts_default_nan(&a
, s
);
1818 /* Inf / x or 0 / x */
1819 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1824 if (b
.cls
== float_class_zero
) {
1825 float_raise(float_flag_divbyzero
, s
);
1826 a
.cls
= float_class_inf
;
1831 if (b
.cls
== float_class_inf
) {
1832 a
.cls
= float_class_zero
;
1836 g_assert_not_reached();
1839 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1841 FloatParts64 pa
, pb
, pr
;
1843 float16_unpack_canonical(&pa
, a
, status
);
1844 float16_unpack_canonical(&pb
, b
, status
);
1845 pr
= div_floats(pa
, pb
, status
);
1847 return float16_round_pack_canonical(&pr
, status
);
1850 static float32 QEMU_SOFTFLOAT_ATTR
1851 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1853 FloatParts64 pa
, pb
, pr
;
1855 float32_unpack_canonical(&pa
, a
, status
);
1856 float32_unpack_canonical(&pb
, b
, status
);
1857 pr
= div_floats(pa
, pb
, status
);
1859 return float32_round_pack_canonical(&pr
, status
);
1862 static float64 QEMU_SOFTFLOAT_ATTR
1863 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1865 FloatParts64 pa
, pb
, pr
;
1867 float64_unpack_canonical(&pa
, a
, status
);
1868 float64_unpack_canonical(&pb
, b
, status
);
1869 pr
= div_floats(pa
, pb
, status
);
1871 return float64_round_pack_canonical(&pr
, status
);
1874 static float hard_f32_div(float a
, float b
)
1879 static double hard_f64_div(double a
, double b
)
1884 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1886 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1887 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1888 fpclassify(b
.h
) == FP_NORMAL
;
1890 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1893 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1895 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1896 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1897 fpclassify(b
.h
) == FP_NORMAL
;
1899 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1902 static bool f32_div_post(union_float32 a
, union_float32 b
)
1904 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1905 return fpclassify(a
.h
) != FP_ZERO
;
1907 return !float32_is_zero(a
.s
);
1910 static bool f64_div_post(union_float64 a
, union_float64 b
)
1912 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1913 return fpclassify(a
.h
) != FP_ZERO
;
1915 return !float64_is_zero(a
.s
);
/* float32 division: hardfloat fast path via float32_gen2 with the
 * division-specific pre/post checks. */
float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}
/* float64 division: hardfloat fast path via float64_gen2 with the
 * division-specific pre/post checks. */
float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}
1933 * Returns the result of dividing the bfloat16
1934 * value `a' by the corresponding value `b'.
1937 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1939 FloatParts64 pa
, pb
, pr
;
1941 bfloat16_unpack_canonical(&pa
, a
, status
);
1942 bfloat16_unpack_canonical(&pb
, b
, status
);
1943 pr
= div_floats(pa
, pb
, status
);
1945 return bfloat16_round_pack_canonical(&pr
, status
);
1949 * Float to Float conversions
1951 * Returns the result of converting one float format to another. The
1952 * conversion is performed according to the IEC/IEEE Standard for
1953 * Binary Floating-Point Arithmetic.
1955 * The float_to_float helper only needs to take care of raising
1956 * invalid exceptions and handling the conversion on NaNs.
1959 static FloatParts64
float_to_float(FloatParts64 a
, const FloatFmt
*dstf
,
1962 if (dstf
->arm_althp
) {
1964 case float_class_qnan
:
1965 case float_class_snan
:
1966 /* There is no NaN in the destination format. Raise Invalid
1967 * and return a zero with the sign of the input NaN.
1969 float_raise(float_flag_invalid
, s
);
1970 a
.cls
= float_class_zero
;
1975 case float_class_inf
:
1976 /* There is no Inf in the destination format. Raise Invalid
1977 * and return the maximum normal with the correct sign.
1979 float_raise(float_flag_invalid
, s
);
1980 a
.cls
= float_class_normal
;
1981 a
.exp
= dstf
->exp_max
;
1982 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1988 } else if (is_nan(a
.cls
)) {
1989 parts_return_nan(&a
, s
);
1994 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1996 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1997 FloatParts64 pa
, pr
;
1999 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
2000 pr
= float_to_float(pa
, &float32_params
, s
);
2001 return float32_round_pack_canonical(&pr
, s
);
2004 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
2006 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2007 FloatParts64 pa
, pr
;
2009 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
2010 pr
= float_to_float(pa
, &float64_params
, s
);
2011 return float64_round_pack_canonical(&pr
, s
);
2014 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
2016 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2017 FloatParts64 pa
, pr
;
2019 float32_unpack_canonical(&pa
, a
, s
);
2020 pr
= float_to_float(pa
, fmt16
, s
);
2021 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
2024 static float64 QEMU_SOFTFLOAT_ATTR
2025 soft_float32_to_float64(float32 a
, float_status
*s
)
2027 FloatParts64 pa
, pr
;
2029 float32_unpack_canonical(&pa
, a
, s
);
2030 pr
= float_to_float(pa
, &float64_params
, s
);
2031 return float64_round_pack_canonical(&pr
, s
);
2034 float64
float32_to_float64(float32 a
, float_status
*s
)
2036 if (likely(float32_is_normal(a
))) {
2037 /* Widening conversion can never produce inexact results. */
2043 } else if (float32_is_zero(a
)) {
2044 return float64_set_sign(float64_zero
, float32_is_neg(a
));
2046 return soft_float32_to_float64(a
, s
);
2050 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
2052 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2053 FloatParts64 pa
, pr
;
2055 float64_unpack_canonical(&pa
, a
, s
);
2056 pr
= float_to_float(pa
, fmt16
, s
);
2057 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
2060 float32
float64_to_float32(float64 a
, float_status
*s
)
2062 FloatParts64 pa
, pr
;
2064 float64_unpack_canonical(&pa
, a
, s
);
2065 pr
= float_to_float(pa
, &float32_params
, s
);
2066 return float32_round_pack_canonical(&pr
, s
);
2069 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2071 FloatParts64 pa
, pr
;
2073 bfloat16_unpack_canonical(&pa
, a
, s
);
2074 pr
= float_to_float(pa
, &float32_params
, s
);
2075 return float32_round_pack_canonical(&pr
, s
);
2078 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2080 FloatParts64 pa
, pr
;
2082 bfloat16_unpack_canonical(&pa
, a
, s
);
2083 pr
= float_to_float(pa
, &float64_params
, s
);
2084 return float64_round_pack_canonical(&pr
, s
);
2087 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2089 FloatParts64 pa
, pr
;
2091 float32_unpack_canonical(&pa
, a
, s
);
2092 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2093 return bfloat16_round_pack_canonical(&pr
, s
);
2096 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2098 FloatParts64 pa
, pr
;
2100 float64_unpack_canonical(&pa
, a
, s
);
2101 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2102 return bfloat16_round_pack_canonical(&pr
, s
);
2106 * Rounds the floating-point value `a' to an integer, and returns the
2107 * result as a floating-point value. The operation is performed
2108 * according to the IEC/IEEE Standard for Binary Floating-Point
2112 static FloatParts64
round_to_int(FloatParts64 a
, FloatRoundMode rmode
,
2113 int scale
, float_status
*s
)
2116 case float_class_qnan
:
2117 case float_class_snan
:
2118 parts_return_nan(&a
, s
);
2121 case float_class_zero
:
2122 case float_class_inf
:
2123 /* already "integral" */
2126 case float_class_normal
:
2127 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2130 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2131 /* already integral */
2136 /* all fractional */
2137 float_raise(float_flag_inexact
, s
);
2139 case float_round_nearest_even
:
2140 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2142 case float_round_ties_away
:
2143 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2145 case float_round_to_zero
:
2148 case float_round_up
:
2151 case float_round_down
:
2154 case float_round_to_odd
:
2158 g_assert_not_reached();
2162 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2165 a
.cls
= float_class_zero
;
2168 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2169 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2170 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2171 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2175 case float_round_nearest_even
:
2176 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2178 case float_round_ties_away
:
2181 case float_round_to_zero
:
2184 case float_round_up
:
2185 inc
= a
.sign
? 0 : rnd_mask
;
2187 case float_round_down
:
2188 inc
= a
.sign
? rnd_mask
: 0;
2190 case float_round_to_odd
:
2191 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2194 g_assert_not_reached();
2197 if (a
.frac
& rnd_mask
) {
2198 float_raise(float_flag_inexact
, s
);
2199 if (uadd64_overflow(a
.frac
, inc
, &a
.frac
)) {
2201 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
2204 a
.frac
&= ~rnd_mask
;
2209 g_assert_not_reached();
2214 float16
float16_round_to_int(float16 a
, float_status
*s
)
2216 FloatParts64 pa
, pr
;
2218 float16_unpack_canonical(&pa
, a
, s
);
2219 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2220 return float16_round_pack_canonical(&pr
, s
);
2223 float32
float32_round_to_int(float32 a
, float_status
*s
)
2225 FloatParts64 pa
, pr
;
2227 float32_unpack_canonical(&pa
, a
, s
);
2228 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2229 return float32_round_pack_canonical(&pr
, s
);
2232 float64
float64_round_to_int(float64 a
, float_status
*s
)
2234 FloatParts64 pa
, pr
;
2236 float64_unpack_canonical(&pa
, a
, s
);
2237 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2238 return float64_round_pack_canonical(&pr
, s
);
2242 * Rounds the bfloat16 value `a' to an integer, and returns the
2243 * result as a bfloat16 value.
2246 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2248 FloatParts64 pa
, pr
;
2250 bfloat16_unpack_canonical(&pa
, a
, s
);
2251 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2252 return bfloat16_round_pack_canonical(&pr
, s
);
2256 * Returns the result of converting the floating-point value `a' to
2257 * the two's complement integer format. The conversion is performed
2258 * according to the IEC/IEEE Standard for Binary Floating-Point
2259 * Arithmetic---which means in particular that the conversion is
2260 * rounded according to the current rounding mode. If `a' is a NaN,
2261 * the largest positive integer is returned. Otherwise, if the
2262 * conversion overflows, the largest integer with the same sign as `a'
2266 static int64_t round_to_int_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2267 int scale
, int64_t min
, int64_t max
,
2271 int orig_flags
= get_float_exception_flags(s
);
2272 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2275 case float_class_snan
:
2276 case float_class_qnan
:
2277 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2279 case float_class_inf
:
2280 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2281 return p
.sign
? min
: max
;
2282 case float_class_zero
:
2284 case float_class_normal
:
2285 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2286 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2291 if (r
<= -(uint64_t) min
) {
2294 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2301 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2306 g_assert_not_reached();
2310 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2315 float16_unpack_canonical(&p
, a
, s
);
2316 return round_to_int_and_pack(p
, rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2319 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2324 float16_unpack_canonical(&p
, a
, s
);
2325 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2328 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2333 float16_unpack_canonical(&p
, a
, s
);
2334 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2337 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2342 float16_unpack_canonical(&p
, a
, s
);
2343 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2346 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2351 float32_unpack_canonical(&p
, a
, s
);
2352 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2355 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2360 float32_unpack_canonical(&p
, a
, s
);
2361 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2364 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2369 float32_unpack_canonical(&p
, a
, s
);
2370 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2373 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2378 float64_unpack_canonical(&p
, a
, s
);
2379 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2382 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2387 float64_unpack_canonical(&p
, a
, s
);
2388 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2391 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2396 float64_unpack_canonical(&p
, a
, s
);
2397 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
/* float16 -> int8 using the current rounding mode. */
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float16 -> int16 using the current rounding mode. */
int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float16 -> int32 using the current rounding mode. */
int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float16 -> int64 using the current rounding mode. */
int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float32 -> int16 using the current rounding mode. */
int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float32 -> int32 using the current rounding mode. */
int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float32 -> int64 using the current rounding mode. */
int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float64 -> int16 using the current rounding mode. */
int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float64 -> int32 using the current rounding mode. */
int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float64 -> int64 using the current rounding mode. */
int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float16 -> int16, rounding toward zero (truncation). */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}
2455 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2457 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2460 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2462 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2465 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2467 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2470 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2472 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2475 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2477 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2480 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2482 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2485 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2487 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2490 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2492 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 * Returns the result of converting the bfloat16 value `a' to
 * the two's complement integer format.
 */
2500 int16_t bfloat16_to_int16_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2505 bfloat16_unpack_canonical(&p
, a
, s
);
2506 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2509 int32_t bfloat16_to_int32_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2514 bfloat16_unpack_canonical(&p
, a
, s
);
2515 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2518 int64_t bfloat16_to_int64_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2523 bfloat16_unpack_canonical(&p
, a
, s
);
2524 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2527 int16_t bfloat16_to_int16(bfloat16 a
, float_status
*s
)
2529 return bfloat16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2532 int32_t bfloat16_to_int32(bfloat16 a
, float_status
*s
)
2534 return bfloat16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2537 int64_t bfloat16_to_int64(bfloat16 a
, float_status
*s
)
2539 return bfloat16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2542 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a
, float_status
*s
)
2544 return bfloat16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2547 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a
, float_status
*s
)
2549 return bfloat16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2552 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a
, float_status
*s
)
2554 return bfloat16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 *  Returns the result of converting the floating-point value `a' to
 *  the unsigned integer format. The conversion is performed according
 *  to the IEC/IEEE Standard for Binary Floating-Point
 *  Arithmetic---which means in particular that the conversion is
 *  rounded according to the current rounding mode. If `a' is a NaN,
 *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  the 'a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the inexact exception
 *  flag.
 */
2570 static uint64_t round_to_uint_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2571 int scale
, uint64_t max
,
2574 int orig_flags
= get_float_exception_flags(s
);
2575 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2579 case float_class_snan
:
2580 case float_class_qnan
:
2581 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2583 case float_class_inf
:
2584 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2585 return p
.sign
? 0 : max
;
2586 case float_class_zero
:
2588 case float_class_normal
:
2590 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2594 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2595 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2597 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2601 /* For uint64 this will never trip, but if p.exp is too large
2602 * to shift a decomposed fraction we shall have exited via the
2606 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2611 g_assert_not_reached();
2615 uint8_t float16_to_uint8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2620 float16_unpack_canonical(&p
, a
, s
);
2621 return round_to_uint_and_pack(p
, rmode
, scale
, UINT8_MAX
, s
);
2624 uint16_t float16_to_uint16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2629 float16_unpack_canonical(&p
, a
, s
);
2630 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2633 uint32_t float16_to_uint32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2638 float16_unpack_canonical(&p
, a
, s
);
2639 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2642 uint64_t float16_to_uint64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2647 float16_unpack_canonical(&p
, a
, s
);
2648 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2651 uint16_t float32_to_uint16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2656 float32_unpack_canonical(&p
, a
, s
);
2657 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2660 uint32_t float32_to_uint32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2665 float32_unpack_canonical(&p
, a
, s
);
2666 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2669 uint64_t float32_to_uint64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2674 float32_unpack_canonical(&p
, a
, s
);
2675 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2678 uint16_t float64_to_uint16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2683 float64_unpack_canonical(&p
, a
, s
);
2684 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2687 uint32_t float64_to_uint32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2692 float64_unpack_canonical(&p
, a
, s
);
2693 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2696 uint64_t float64_to_uint64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2701 float64_unpack_canonical(&p
, a
, s
);
2702 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2705 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2707 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2710 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2712 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2715 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2717 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2720 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2722 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2725 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2727 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2730 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2732 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2735 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2737 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2740 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2742 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2745 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2747 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2750 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2752 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2755 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2757 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2760 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2762 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2765 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2767 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2770 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2772 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2775 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2777 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2780 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2782 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2785 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2787 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2790 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2792 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2795 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2797 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 *  Returns the result of converting the bfloat16 value `a' to
 *  the unsigned integer format.
 */
2805 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2806 int scale
, float_status
*s
)
2810 bfloat16_unpack_canonical(&p
, a
, s
);
2811 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2814 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2815 int scale
, float_status
*s
)
2819 bfloat16_unpack_canonical(&p
, a
, s
);
2820 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2823 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2824 int scale
, float_status
*s
)
2828 bfloat16_unpack_canonical(&p
, a
, s
);
2829 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2832 uint16_t bfloat16_to_uint16(bfloat16 a
, float_status
*s
)
2834 return bfloat16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2837 uint32_t bfloat16_to_uint32(bfloat16 a
, float_status
*s
)
2839 return bfloat16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2842 uint64_t bfloat16_to_uint64(bfloat16 a
, float_status
*s
)
2844 return bfloat16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2847 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a
, float_status
*s
)
2849 return bfloat16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2852 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a
, float_status
*s
)
2854 return bfloat16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2857 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a
, float_status
*s
)
2859 return bfloat16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
2870 static FloatParts64
int_to_float(int64_t a
, int scale
, float_status
*status
)
2872 FloatParts64 r
= { .sign
= false };
2875 r
.cls
= float_class_zero
;
2880 r
.cls
= float_class_normal
;
2886 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2888 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2889 r
.frac
= f
<< shift
;
2895 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2897 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2898 return float16_round_pack_canonical(&pa
, status
);
2901 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2903 return int64_to_float16_scalbn(a
, scale
, status
);
2906 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2908 return int64_to_float16_scalbn(a
, scale
, status
);
2911 float16
int64_to_float16(int64_t a
, float_status
*status
)
2913 return int64_to_float16_scalbn(a
, 0, status
);
2916 float16
int32_to_float16(int32_t a
, float_status
*status
)
2918 return int64_to_float16_scalbn(a
, 0, status
);
2921 float16
int16_to_float16(int16_t a
, float_status
*status
)
2923 return int64_to_float16_scalbn(a
, 0, status
);
2926 float16
int8_to_float16(int8_t a
, float_status
*status
)
2928 return int64_to_float16_scalbn(a
, 0, status
);
2931 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2933 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2934 return float32_round_pack_canonical(&pa
, status
);
2937 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2939 return int64_to_float32_scalbn(a
, scale
, status
);
2942 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2944 return int64_to_float32_scalbn(a
, scale
, status
);
2947 float32
int64_to_float32(int64_t a
, float_status
*status
)
2949 return int64_to_float32_scalbn(a
, 0, status
);
2952 float32
int32_to_float32(int32_t a
, float_status
*status
)
2954 return int64_to_float32_scalbn(a
, 0, status
);
2957 float32
int16_to_float32(int16_t a
, float_status
*status
)
2959 return int64_to_float32_scalbn(a
, 0, status
);
2962 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2964 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2965 return float64_round_pack_canonical(&pa
, status
);
2968 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2970 return int64_to_float64_scalbn(a
, scale
, status
);
2973 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2975 return int64_to_float64_scalbn(a
, scale
, status
);
2978 float64
int64_to_float64(int64_t a
, float_status
*status
)
2980 return int64_to_float64_scalbn(a
, 0, status
);
2983 float64
int32_to_float64(int32_t a
, float_status
*status
)
2985 return int64_to_float64_scalbn(a
, 0, status
);
2988 float64
int16_to_float64(int16_t a
, float_status
*status
)
2990 return int64_to_float64_scalbn(a
, 0, status
);
/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */
2998 bfloat16
int64_to_bfloat16_scalbn(int64_t a
, int scale
, float_status
*status
)
3000 FloatParts64 pa
= int_to_float(a
, scale
, status
);
3001 return bfloat16_round_pack_canonical(&pa
, status
);
3004 bfloat16
int32_to_bfloat16_scalbn(int32_t a
, int scale
, float_status
*status
)
3006 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3009 bfloat16
int16_to_bfloat16_scalbn(int16_t a
, int scale
, float_status
*status
)
3011 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3014 bfloat16
int64_to_bfloat16(int64_t a
, float_status
*status
)
3016 return int64_to_bfloat16_scalbn(a
, 0, status
);
3019 bfloat16
int32_to_bfloat16(int32_t a
, float_status
*status
)
3021 return int64_to_bfloat16_scalbn(a
, 0, status
);
3024 bfloat16
int16_to_bfloat16(int16_t a
, float_status
*status
)
3026 return int64_to_bfloat16_scalbn(a
, 0, status
);
/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
3037 static FloatParts64
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
3039 FloatParts64 r
= { .sign
= false };
3043 r
.cls
= float_class_zero
;
3045 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
3047 r
.cls
= float_class_normal
;
3048 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
3049 r
.frac
= a
<< shift
;
3055 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3057 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3058 return float16_round_pack_canonical(&pa
, status
);
3061 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3063 return uint64_to_float16_scalbn(a
, scale
, status
);
3066 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3068 return uint64_to_float16_scalbn(a
, scale
, status
);
3071 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
3073 return uint64_to_float16_scalbn(a
, 0, status
);
3076 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
3078 return uint64_to_float16_scalbn(a
, 0, status
);
3081 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
3083 return uint64_to_float16_scalbn(a
, 0, status
);
3086 float16
uint8_to_float16(uint8_t a
, float_status
*status
)
3088 return uint64_to_float16_scalbn(a
, 0, status
);
3091 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
3093 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3094 return float32_round_pack_canonical(&pa
, status
);
3097 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
3099 return uint64_to_float32_scalbn(a
, scale
, status
);
3102 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
3104 return uint64_to_float32_scalbn(a
, scale
, status
);
3107 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
3109 return uint64_to_float32_scalbn(a
, 0, status
);
3112 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
3114 return uint64_to_float32_scalbn(a
, 0, status
);
3117 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
3119 return uint64_to_float32_scalbn(a
, 0, status
);
3122 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
3124 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3125 return float64_round_pack_canonical(&pa
, status
);
3128 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
3130 return uint64_to_float64_scalbn(a
, scale
, status
);
3133 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
3135 return uint64_to_float64_scalbn(a
, scale
, status
);
3138 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
3140 return uint64_to_float64_scalbn(a
, 0, status
);
3143 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
3145 return uint64_to_float64_scalbn(a
, 0, status
);
3148 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
3150 return uint64_to_float64_scalbn(a
, 0, status
);
/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
 */
3158 bfloat16
uint64_to_bfloat16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3160 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3161 return bfloat16_round_pack_canonical(&pa
, status
);
3164 bfloat16
uint32_to_bfloat16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3166 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3169 bfloat16
uint16_to_bfloat16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3171 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3174 bfloat16
uint64_to_bfloat16(uint64_t a
, float_status
*status
)
3176 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3179 bfloat16
uint32_to_bfloat16(uint32_t a
, float_status
*status
)
3181 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3184 bfloat16
uint16_to_bfloat16(uint16_t a
, float_status
*status
)
3186 return uint64_to_bfloat16_scalbn(a
, 0, status
);
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
 */
3205 static FloatParts64
minmax_floats(FloatParts64 a
, FloatParts64 b
, bool ismin
,
3206 bool ieee
, bool ismag
, float_status
*s
)
3208 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
3210 /* Takes two floating-point values `a' and `b', one of
3211 * which is a NaN, and returns the appropriate NaN
3212 * result. If either `a' or `b' is a signaling NaN,
3213 * the invalid exception is raised.
3215 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
3216 return *parts_pick_nan(&a
, &b
, s
);
3217 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
3219 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
3223 return *parts_pick_nan(&a
, &b
, s
);
3228 case float_class_normal
:
3231 case float_class_inf
:
3234 case float_class_zero
:
3238 g_assert_not_reached();
3242 case float_class_normal
:
3245 case float_class_inf
:
3248 case float_class_zero
:
3252 g_assert_not_reached();
3256 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
3257 bool a_less
= a_exp
< b_exp
;
3258 if (a_exp
== b_exp
) {
3259 a_less
= a
.frac
< b
.frac
;
3261 return a_less
^ ismin
? b
: a
;
3264 if (a
.sign
== b
.sign
) {
3265 bool a_less
= a_exp
< b_exp
;
3266 if (a_exp
== b_exp
) {
3267 a_less
= a
.frac
< b
.frac
;
3269 return a
.sign
^ a_less
^ ismin
? b
: a
;
3271 return a
.sign
^ ismin
? b
: a
;
3276 #define MINMAX(sz, name, ismin, isiee, ismag) \
3277 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3280 FloatParts64 pa, pb, pr; \
3281 float ## sz ## _unpack_canonical(&pa, a, s); \
3282 float ## sz ## _unpack_canonical(&pb, b, s); \
3283 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3284 return float ## sz ## _round_pack_canonical(&pr, s); \
3287 MINMAX(16, min
, true, false, false)
3288 MINMAX(16, minnum
, true, true, false)
3289 MINMAX(16, minnummag
, true, true, true)
3290 MINMAX(16, max
, false, false, false)
3291 MINMAX(16, maxnum
, false, true, false)
3292 MINMAX(16, maxnummag
, false, true, true)
3294 MINMAX(32, min
, true, false, false)
3295 MINMAX(32, minnum
, true, true, false)
3296 MINMAX(32, minnummag
, true, true, true)
3297 MINMAX(32, max
, false, false, false)
3298 MINMAX(32, maxnum
, false, true, false)
3299 MINMAX(32, maxnummag
, false, true, true)
3301 MINMAX(64, min
, true, false, false)
3302 MINMAX(64, minnum
, true, true, false)
3303 MINMAX(64, minnummag
, true, true, true)
3304 MINMAX(64, max
, false, false, false)
3305 MINMAX(64, maxnum
, false, true, false)
3306 MINMAX(64, maxnummag
, false, true, true)
3310 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3311 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3313 FloatParts64 pa, pb, pr; \
3314 bfloat16_unpack_canonical(&pa, a, s); \
3315 bfloat16_unpack_canonical(&pb, b, s); \
3316 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3317 return bfloat16_round_pack_canonical(&pr, s); \
3320 BF16_MINMAX(min
, true, false, false)
3321 BF16_MINMAX(minnum
, true, true, false)
3322 BF16_MINMAX(minnummag
, true, true, true)
3323 BF16_MINMAX(max
, false, false, false)
3324 BF16_MINMAX(maxnum
, false, true, false)
3325 BF16_MINMAX(maxnummag
, false, true, true)
3329 /* Floating point compare */
3330 static FloatRelation
compare_floats(FloatParts64 a
, FloatParts64 b
, bool is_quiet
,
3333 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
3335 a
.cls
== float_class_snan
||
3336 b
.cls
== float_class_snan
) {
3337 float_raise(float_flag_invalid
, s
);
3339 return float_relation_unordered
;
3342 if (a
.cls
== float_class_zero
) {
3343 if (b
.cls
== float_class_zero
) {
3344 return float_relation_equal
;
3346 return b
.sign
? float_relation_greater
: float_relation_less
;
3347 } else if (b
.cls
== float_class_zero
) {
3348 return a
.sign
? float_relation_less
: float_relation_greater
;
3351 /* The only really important thing about infinity is its sign. If
3352 * both are infinities the sign marks the smallest of the two.
3354 if (a
.cls
== float_class_inf
) {
3355 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
3356 return float_relation_equal
;
3358 return a
.sign
? float_relation_less
: float_relation_greater
;
3359 } else if (b
.cls
== float_class_inf
) {
3360 return b
.sign
? float_relation_greater
: float_relation_less
;
3363 if (a
.sign
!= b
.sign
) {
3364 return a
.sign
? float_relation_less
: float_relation_greater
;
3367 if (a
.exp
== b
.exp
) {
3368 if (a
.frac
== b
.frac
) {
3369 return float_relation_equal
;
3372 return a
.frac
> b
.frac
?
3373 float_relation_less
: float_relation_greater
;
3375 return a
.frac
> b
.frac
?
3376 float_relation_greater
: float_relation_less
;
3380 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
3382 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
3387 #define COMPARE(name, attr, sz) \
3389 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
3391 FloatParts64 pa, pb; \
3392 float ## sz ## _unpack_canonical(&pa, a, s); \
3393 float ## sz ## _unpack_canonical(&pb, b, s); \
3394 return compare_floats(pa, pb, is_quiet, s); \
3397 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
3398 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
3399 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
3403 FloatRelation
float16_compare(float16 a
, float16 b
, float_status
*s
)
3405 return soft_f16_compare(a
, b
, false, s
);
3408 FloatRelation
float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
3410 return soft_f16_compare(a
, b
, true, s
);
3413 static FloatRelation QEMU_FLATTEN
3414 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
3416 union_float32 ua
, ub
;
3421 if (QEMU_NO_HARDFLOAT
) {
3425 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
3426 if (isgreaterequal(ua
.h
, ub
.h
)) {
3427 if (isgreater(ua
.h
, ub
.h
)) {
3428 return float_relation_greater
;
3430 return float_relation_equal
;
3432 if (likely(isless(ua
.h
, ub
.h
))) {
3433 return float_relation_less
;
3435 /* The only condition remaining is unordered.
3436 * Fall through to set flags.
3439 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3442 FloatRelation
float32_compare(float32 a
, float32 b
, float_status
*s
)
3444 return f32_compare(a
, b
, false, s
);
3447 FloatRelation
float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3449 return f32_compare(a
, b
, true, s
);
3452 static FloatRelation QEMU_FLATTEN
3453 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
3455 union_float64 ua
, ub
;
3460 if (QEMU_NO_HARDFLOAT
) {
3464 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
3465 if (isgreaterequal(ua
.h
, ub
.h
)) {
3466 if (isgreater(ua
.h
, ub
.h
)) {
3467 return float_relation_greater
;
3469 return float_relation_equal
;
3471 if (likely(isless(ua
.h
, ub
.h
))) {
3472 return float_relation_less
;
3474 /* The only condition remaining is unordered.
3475 * Fall through to set flags.
3478 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3481 FloatRelation
float64_compare(float64 a
, float64 b
, float_status
*s
)
3483 return f64_compare(a
, b
, false, s
);
3486 FloatRelation
float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3488 return f64_compare(a
, b
, true, s
);
3491 static FloatRelation QEMU_FLATTEN
3492 soft_bf16_compare(bfloat16 a
, bfloat16 b
, bool is_quiet
, float_status
*s
)
3494 FloatParts64 pa
, pb
;
3496 bfloat16_unpack_canonical(&pa
, a
, s
);
3497 bfloat16_unpack_canonical(&pb
, b
, s
);
3498 return compare_floats(pa
, pb
, is_quiet
, s
);
3501 FloatRelation
bfloat16_compare(bfloat16 a
, bfloat16 b
, float_status
*s
)
3503 return soft_bf16_compare(a
, b
, false, s
);
3506 FloatRelation
bfloat16_compare_quiet(bfloat16 a
, bfloat16 b
, float_status
*s
)
3508 return soft_bf16_compare(a
, b
, true, s
);
3511 /* Multiply A by 2 raised to the power N. */
3512 static FloatParts64
scalbn_decomposed(FloatParts64 a
, int n
, float_status
*s
)
3514 if (unlikely(is_nan(a
.cls
))) {
3515 parts_return_nan(&a
, s
);
3517 if (a
.cls
== float_class_normal
) {
3518 /* The largest float type (even though not supported by FloatParts64)
3519 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3520 * still allows rounding to infinity, without allowing overflow
3521 * within the int32_t that backs FloatParts64.exp.
3523 n
= MIN(MAX(n
, -0x10000), 0x10000);
3529 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3531 FloatParts64 pa
, pr
;
3533 float16_unpack_canonical(&pa
, a
, status
);
3534 pr
= scalbn_decomposed(pa
, n
, status
);
3535 return float16_round_pack_canonical(&pr
, status
);
3538 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3540 FloatParts64 pa
, pr
;
3542 float32_unpack_canonical(&pa
, a
, status
);
3543 pr
= scalbn_decomposed(pa
, n
, status
);
3544 return float32_round_pack_canonical(&pr
, status
);
3547 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3549 FloatParts64 pa
, pr
;
3551 float64_unpack_canonical(&pa
, a
, status
);
3552 pr
= scalbn_decomposed(pa
, n
, status
);
3553 return float64_round_pack_canonical(&pr
, status
);
3556 bfloat16
bfloat16_scalbn(bfloat16 a
, int n
, float_status
*status
)
3558 FloatParts64 pa
, pr
;
3560 bfloat16_unpack_canonical(&pa
, a
, status
);
3561 pr
= scalbn_decomposed(pa
, n
, status
);
3562 return bfloat16_round_pack_canonical(&pr
, status
);
/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */
3577 static FloatParts64
sqrt_float(FloatParts64 a
, float_status
*s
, const FloatFmt
*p
)
3579 uint64_t a_frac
, r_frac
, s_frac
;
3582 if (is_nan(a
.cls
)) {
3583 parts_return_nan(&a
, s
);
3586 if (a
.cls
== float_class_zero
) {
3587 return a
; /* sqrt(+-0) = +-0 */
3590 float_raise(float_flag_invalid
, s
);
3591 parts_default_nan(&a
, s
);
3594 if (a
.cls
== float_class_inf
) {
3595 return a
; /* sqrt(+inf) = +inf */
3598 assert(a
.cls
== float_class_normal
);
3600 /* We need two overflow bits at the top. Adding room for that is a
3601 * right shift. If the exponent is odd, we can discard the low bit
3602 * by multiplying the fraction by 2; that's a left shift. Combine
3603 * those and we shift right by 1 if the exponent is odd, otherwise 2.
3605 a_frac
= a
.frac
>> (2 - (a
.exp
& 1));
3608 /* Bit-by-bit computation of sqrt. */
3612 /* Iterate from implicit bit down to the 3 extra bits to compute a
3613 * properly rounded result. Remember we've inserted two more bits
3614 * at the top, so these positions are two less.
3616 bit
= DECOMPOSED_BINARY_POINT
- 2;
3617 last_bit
= MAX(p
->frac_shift
- 4, 0);
3619 uint64_t q
= 1ULL << bit
;
3620 uint64_t t_frac
= s_frac
+ q
;
3621 if (t_frac
<= a_frac
) {
3622 s_frac
= t_frac
+ q
;
3627 } while (--bit
>= last_bit
);
3629 /* Undo the right shift done above. If there is any remaining
3630 * fraction, the result is inexact. Set the sticky bit.
3632 a
.frac
= (r_frac
<< 2) + (a_frac
!= 0);
3637 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3639 FloatParts64 pa
, pr
;
3641 float16_unpack_canonical(&pa
, a
, status
);
3642 pr
= sqrt_float(pa
, status
, &float16_params
);
3643 return float16_round_pack_canonical(&pr
, status
);
3646 static float32 QEMU_SOFTFLOAT_ATTR
3647 soft_f32_sqrt(float32 a
, float_status
*status
)
3649 FloatParts64 pa
, pr
;
3651 float32_unpack_canonical(&pa
, a
, status
);
3652 pr
= sqrt_float(pa
, status
, &float32_params
);
3653 return float32_round_pack_canonical(&pr
, status
);
3656 static float64 QEMU_SOFTFLOAT_ATTR
3657 soft_f64_sqrt(float64 a
, float_status
*status
)
3659 FloatParts64 pa
, pr
;
3661 float64_unpack_canonical(&pa
, a
, status
);
3662 pr
= sqrt_float(pa
, status
, &float64_params
);
3663 return float64_round_pack_canonical(&pr
, status
);
/*
 * Single-precision sqrt with a hardfloat fast path: when the host FPU can
 * be trusted (can_use_fpu) and the input is a non-negative zero or normal
 * number, use the host's sqrtf(); otherwise fall back to soft-float so
 * that flags and special values are handled exactly.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        /* Only +zero / +normal take the fast path; NaN, inf, denormal and
         * negative inputs need the soft path for correct flag behaviour. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
/*
 * Double-precision sqrt with a hardfloat fast path; mirrors float32_sqrt.
 * Non-negative zero/normal inputs use the host's sqrt(), everything else
 * goes through the soft-float implementation.
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        /* Only +zero / +normal take the fast path; NaN, inf, denormal and
         * negative inputs need the soft path for correct flag behaviour. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3720 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3722 FloatParts64 pa
, pr
;
3724 bfloat16_unpack_canonical(&pa
, a
, status
);
3725 pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3726 return bfloat16_round_pack_canonical(&pr
, status
);
3729 /*----------------------------------------------------------------------------
3730 | The pattern for a default generated NaN.
3731 *----------------------------------------------------------------------------*/
3733 float16
float16_default_nan(float_status
*status
)
3737 parts_default_nan(&p
, status
);
3738 p
.frac
>>= float16_params
.frac_shift
;
3739 return float16_pack_raw(&p
);
3742 float32
float32_default_nan(float_status
*status
)
3746 parts_default_nan(&p
, status
);
3747 p
.frac
>>= float32_params
.frac_shift
;
3748 return float32_pack_raw(&p
);
3751 float64
float64_default_nan(float_status
*status
)
3755 parts_default_nan(&p
, status
);
3756 p
.frac
>>= float64_params
.frac_shift
;
3757 return float64_pack_raw(&p
);
3760 float128
float128_default_nan(float_status
*status
)
3764 parts_default_nan(&p
, status
);
3765 frac_shr(&p
, float128_params
.frac_shift
);
3766 return float128_pack_raw(&p
);
3769 bfloat16
bfloat16_default_nan(float_status
*status
)
3773 parts_default_nan(&p
, status
);
3774 p
.frac
>>= bfloat16_params
.frac_shift
;
3775 return bfloat16_pack_raw(&p
);
3778 /*----------------------------------------------------------------------------
3779 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3780 *----------------------------------------------------------------------------*/
3782 float16
float16_silence_nan(float16 a
, float_status
*status
)
3786 float16_unpack_raw(&p
, a
);
3787 p
.frac
<<= float16_params
.frac_shift
;
3788 parts_silence_nan(&p
, status
);
3789 p
.frac
>>= float16_params
.frac_shift
;
3790 return float16_pack_raw(&p
);
3793 float32
float32_silence_nan(float32 a
, float_status
*status
)
3797 float32_unpack_raw(&p
, a
);
3798 p
.frac
<<= float32_params
.frac_shift
;
3799 parts_silence_nan(&p
, status
);
3800 p
.frac
>>= float32_params
.frac_shift
;
3801 return float32_pack_raw(&p
);
3804 float64
float64_silence_nan(float64 a
, float_status
*status
)
3808 float64_unpack_raw(&p
, a
);
3809 p
.frac
<<= float64_params
.frac_shift
;
3810 parts_silence_nan(&p
, status
);
3811 p
.frac
>>= float64_params
.frac_shift
;
3812 return float64_pack_raw(&p
);
3815 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3819 bfloat16_unpack_raw(&p
, a
);
3820 p
.frac
<<= bfloat16_params
.frac_shift
;
3821 parts_silence_nan(&p
, status
);
3822 p
.frac
>>= bfloat16_params
.frac_shift
;
3823 return bfloat16_pack_raw(&p
);
3826 float128
float128_silence_nan(float128 a
, float_status
*status
)
3830 float128_unpack_raw(&p
, a
);
3831 frac_shl(&p
, float128_params
.frac_shift
);
3832 parts_silence_nan(&p
, status
);
3833 frac_shr(&p
, float128_params
.frac_shift
);
3834 return float128_pack_raw(&p
);
3837 /*----------------------------------------------------------------------------
3838 | If `a' is denormal and we are in flush-to-zero mode then set the
3839 | input-denormal exception and return zero. Otherwise just return the value.
3840 *----------------------------------------------------------------------------*/
3842 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3844 if (p
.exp
== 0 && p
.frac
!= 0) {
3845 float_raise(float_flag_input_denormal
, status
);
3852 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3854 if (status
->flush_inputs_to_zero
) {
3857 float16_unpack_raw(&p
, a
);
3858 if (parts_squash_denormal(p
, status
)) {
3859 return float16_set_sign(float16_zero
, p
.sign
);
3865 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3867 if (status
->flush_inputs_to_zero
) {
3870 float32_unpack_raw(&p
, a
);
3871 if (parts_squash_denormal(p
, status
)) {
3872 return float32_set_sign(float32_zero
, p
.sign
);
3878 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3880 if (status
->flush_inputs_to_zero
) {
3883 float64_unpack_raw(&p
, a
);
3884 if (parts_squash_denormal(p
, status
)) {
3885 return float64_set_sign(float64_zero
, p
.sign
);
3891 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3893 if (status
->flush_inputs_to_zero
) {
3896 bfloat16_unpack_raw(&p
, a
);
3897 if (parts_squash_denormal(p
, status
)) {
3898 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3904 /*----------------------------------------------------------------------------
3905 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3906 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3907 | input. If `zSign' is 1, the input is negated before being converted to an
3908 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3909 | is simply rounded to an integer, with the inexact exception raised if the
3910 | input cannot be represented exactly as an integer. However, if the fixed-
3911 | point input is too large, the invalid exception is raised and the largest
3912 | positive or negative integer is returned.
3913 *----------------------------------------------------------------------------*/
static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Choose the value to add below the binary point (bits 6..0). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;      /* half an ulp */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away only when the result bit would be even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Ties-to-even: clear the low bit on an exact half-way case. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Overflow if any bits above 32, or if the sign flipped on negation. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;
}
3963 /*----------------------------------------------------------------------------
3964 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3965 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3966 | and returns the properly rounded 64-bit integer corresponding to the input.
3967 | If `zSign' is 1, the input is negated before being converted to an integer.
3968 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3969 | the inexact exception raised if the input cannot be represented exactly as
3970 | an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/
static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* absZ1 holds the bits below the binary point; decide whether the
     * integer part absZ0 is bumped by one. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t) absZ1 < 0);  /* top fraction bit set */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;    /* wrapped past 2^64 */
        /* Ties-to-even: clear the low bit on an exact half-way case. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* Overflow if the sign flipped the wrong way on negation. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;
}
4025 /*----------------------------------------------------------------------------
4026 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4027 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4028 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4029 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
4030 | with the inexact exception raised if the input cannot be represented exactly
4031 | as an integer. However, if the fixed-point input is too large, the invalid
4032 | exception is raised and the largest unsigned integer is returned.
4033 *----------------------------------------------------------------------------*/
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* absZ1 holds the bits below the binary point; decide whether the
     * integer part absZ0 is bumped by one. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);   /* top fraction bit set */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            /* Wrapped past 2^64: saturate to the largest unsigned value. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Ties-to-even: clear the low bit on an exact half-way case. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A negative non-zero value cannot be represented as unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
4085 /*----------------------------------------------------------------------------
4086 | Normalizes the subnormal single-precision floating-point value represented
4087 | by the denormalized significand `aSig'. The normalized exponent and
4088 | significand are stored at the locations pointed to by `zExpPtr' and
4089 | `zSigPtr', respectively.
4090 *----------------------------------------------------------------------------*/
4093 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
4097 shiftCount
= clz32(aSig
) - 8;
4098 *zSigPtr
= aSig
<<shiftCount
;
4099 *zExpPtr
= 1 - shiftCount
;
4103 /*----------------------------------------------------------------------------
4104 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4105 | and significand `zSig', and returns the proper single-precision floating-
4106 | point value corresponding to the abstract input. Ordinarily, the abstract
4107 | value is simply rounded and packed into the single-precision format, with
4108 | the inexact exception raised if the abstract input cannot be represented
4109 | exactly. However, if the abstract value is too large, the overflow and
4110 | inexact exceptions are raised and an infinity or maximal finite value is
4111 | returned. If the abstract value is too small, the input value is rounded to
4112 | a subnormal number, and the underflow and inexact exceptions are raised if
4113 | the abstract input cannot be represented exactly as a subnormal single-
4114 | precision floating-point number.
4115 | The input significand `zSig' has its binary point between bits 30
4116 | and 29, which is 7 bits to the left of the usual location. This shifted
4117 | significand must be normalized or smaller. If `zSig' is not normalized,
4118 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4119 | and it must not require rounding. In the usual case that `zSig' is
4120 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4121 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4122 | Binary Floating-Point Arithmetic.
4123 *----------------------------------------------------------------------------*/
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Value added below the 7 round bits (bits 6..0 of zSig). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    if ( 0xFD <= (uint16_t) zExp ) {        /* exponent out of normal range */
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: round-to-odd and truncating modes yield the
             * largest finite value instead of infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /* Tininess detected before rounding, or clearly below the
             * smallest normal even after the increment. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Ties-to-even: clear the low bit on an exact half-way case. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );
}
4202 /*----------------------------------------------------------------------------
4203 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4204 | and significand `zSig', and returns the proper single-precision floating-
4205 | point value corresponding to the abstract input. This routine is just like
4206 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4207 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4208 | floating-point exponent.
4209 *----------------------------------------------------------------------------*/
4212 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4213 float_status
*status
)
4217 shiftCount
= clz32(zSig
) - 1;
4218 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4223 /*----------------------------------------------------------------------------
4224 | Normalizes the subnormal double-precision floating-point value represented
4225 | by the denormalized significand `aSig'. The normalized exponent and
4226 | significand are stored at the locations pointed to by `zExpPtr' and
4227 | `zSigPtr', respectively.
4228 *----------------------------------------------------------------------------*/
4231 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
4235 shiftCount
= clz64(aSig
) - 11;
4236 *zSigPtr
= aSig
<<shiftCount
;
4237 *zExpPtr
= 1 - shiftCount
;
4241 /*----------------------------------------------------------------------------
4242 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4243 | double-precision floating-point value, returning the result. After being
4244 | shifted into the proper positions, the three fields are simply added
4245 | together to form the result. This means that any integer portion of `zSig'
4246 | will be added into the exponent. Since a properly normalized significand
4247 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4248 | than the desired result exponent whenever `zSig' is a complete, normalized
4250 *----------------------------------------------------------------------------*/
4252 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4255 return make_float64(
4256 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4260 /*----------------------------------------------------------------------------
4261 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4262 | and significand `zSig', and returns the proper double-precision floating-
4263 | point value corresponding to the abstract input. Ordinarily, the abstract
4264 | value is simply rounded and packed into the double-precision format, with
4265 | the inexact exception raised if the abstract input cannot be represented
4266 | exactly. However, if the abstract value is too large, the overflow and
4267 | inexact exceptions are raised and an infinity or maximal finite value is
4268 | returned. If the abstract value is too small, the input value is rounded to
4269 | a subnormal number, and the underflow and inexact exceptions are raised if
4270 | the abstract input cannot be represented exactly as a subnormal double-
4271 | precision floating-point number.
4272 | The input significand `zSig' has its binary point between bits 62
4273 | and 61, which is 10 bits to the left of the usual location. This shifted
4274 | significand must be normalized or smaller. If `zSig' is not normalized,
4275 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4276 | and it must not require rounding. In the usual case that `zSig' is
4277 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4278 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4279 | Binary Floating-Point Arithmetic.
4280 *----------------------------------------------------------------------------*/
static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Value added below the 10 round bits (bits 9..0 of zSig). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    if ( 0x7FD <= (uint16_t) zExp ) {       /* exponent out of normal range */
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: round-to-odd and truncating modes yield the
             * largest finite value instead of infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess detected before rounding, or clearly below the
             * smallest normal even after the increment. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Ties-to-even: clear the low bit on an exact half-way case. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );
}
4358 /*----------------------------------------------------------------------------
4359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4360 | and significand `zSig', and returns the proper double-precision floating-
4361 | point value corresponding to the abstract input. This routine is just like
4362 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4363 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4364 | floating-point exponent.
4365 *----------------------------------------------------------------------------*/
4368 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4369 float_status
*status
)
4373 shiftCount
= clz64(zSig
) - 1;
4374 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4379 /*----------------------------------------------------------------------------
4380 | Normalizes the subnormal extended double-precision floating-point value
4381 | represented by the denormalized significand `aSig'. The normalized exponent
4382 | and significand are stored at the locations pointed to by `zExpPtr' and
4383 | `zSigPtr', respectively.
4384 *----------------------------------------------------------------------------*/
4386 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
4391 shiftCount
= clz64(aSig
);
4392 *zSigPtr
= aSig
<<shiftCount
;
4393 *zExpPtr
= 1 - shiftCount
;
4396 /*----------------------------------------------------------------------------
4397 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4398 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4399 | and returns the proper extended double-precision floating-point value
4400 | corresponding to the abstract input. Ordinarily, the abstract value is
4401 | rounded and packed into the extended double-precision format, with the
4402 | inexact exception raised if the abstract input cannot be represented
4403 | exactly. However, if the abstract value is too large, the overflow and
4404 | inexact exceptions are raised and an infinity or maximal finite value is
4405 | returned. If the abstract value is too small, the input value is rounded to
4406 | a subnormal number, and the underflow and inexact exceptions are raised if
4407 | the abstract input cannot be represented exactly as a subnormal extended
4408 | double-precision floating-point number.
4409 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4410 | number of bits as single or double precision, respectively. Otherwise, the
4411 | result is rounded to the full precision of the extended double-precision
4413 | The input significand must be normalized or smaller. If the input
4414 | significand is not normalized, `zExp' must be 0; in that case, the result
4415 | returned is a subnormal number, and it must not require rounding. The
4416 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4417 | Floating-Point Arithmetic.
4418 *----------------------------------------------------------------------------*/
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Select the rounding position: full 80-bit precision, or reduced
     * 64-/32-bit significand precision (x87 precision control). */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold any zSig1 bits into the sticky (jam) bit of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {  /* exponent out of range */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Rounding may have carried into the integer bit. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Ties-to-even: widen the mask to clear the result LSB too. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    if ( zSig0 < roundIncrement ) {
        /* Carry out of the significand: renormalize to 1.0 * 2^(zExp+1). */
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the round/sticky bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Truncating modes saturate to the largest finite value. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Re-derive the increment: zSig1 changed in the shift above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Ties-to-even on an exact half-way case. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of the significand: renormalize. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            /* Ties-to-even on an exact half-way case. */
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );
}
4608 /*----------------------------------------------------------------------------
4609 | Takes an abstract floating-point value having sign `zSign', exponent
4610 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4611 | and returns the proper extended double-precision floating-point value
4612 | corresponding to the abstract input. This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/
floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* If the high word is empty, promote the low word and adjust zExp. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Left-justify the 128-bit significand and compensate the exponent. */
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);
}
4637 /*----------------------------------------------------------------------------
4638 | Returns the least-significant 64 fraction bits of the quadruple-precision
4639 | floating-point value `a'.
4640 *----------------------------------------------------------------------------*/
static inline uint64_t extractFloat128Frac1( float128 a )
{
    /* The least-significant 64 fraction bits are the entire low word. */
    return a.low;
}
4649 /*----------------------------------------------------------------------------
4650 | Returns the most-significant 48 fraction bits of the quadruple-precision
4651 | floating-point value `a'.
4652 *----------------------------------------------------------------------------*/
static inline uint64_t extractFloat128Frac0( float128 a )
{
    /* The upper 48 fraction bits occupy the low 48 bits of the high word. */
    return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
}
4661 /*----------------------------------------------------------------------------
| Returns the exponent bits of the quadruple-precision floating-point value
| `a'.
*----------------------------------------------------------------------------*/
static inline int32_t extractFloat128Exp( float128 a )
{
    /* The 15-bit biased exponent sits in bits 62..48 of the high word. */
    return ( a.high>>48 ) & 0x7FFF;
}
4673 /*----------------------------------------------------------------------------
4674 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4675 *----------------------------------------------------------------------------*/
static inline bool extractFloat128Sign(float128 a)
{
    /* The sign is the top bit of the high word. */
    return a.high >> 63;
}
4682 /*----------------------------------------------------------------------------
4683 | Normalizes the subnormal quadruple-precision floating-point value
4684 | represented by the denormalized significand formed by the concatenation of
4685 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4686 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4687 | significand are stored at the location pointed to by `zSig0Ptr', and the
4688 | least significant 64 bits of the normalized significand are stored at the
4689 | location pointed to by `zSig1Ptr'.
4690 *----------------------------------------------------------------------------*/
static void
normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* All significant bits are in the low word; it must be shifted up
         * past the 64-bit boundary into the high word. */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            /* aSig1's leading bit lands inside the high word's 48-bit
             * fraction field; split it across both words. */
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = - shiftCount - 63;
    }
    else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }
}
4723 /*----------------------------------------------------------------------------
4724 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4725 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4726 | floating-point value, returning the result. After being shifted into the
4727 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4728 | added together to form the most significant 32 bits of the result. This
4729 | means that any integer portion of `zSig0' will be added into the exponent.
4730 | Since a properly normalized significand will have an integer portion equal
4731 | to 1, the `zExp' input should be 1 less than the desired result exponent
4732 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4734 *----------------------------------------------------------------------------*/
4736 static inline float128
4737 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4742 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4746 /*----------------------------------------------------------------------------
4747 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4748 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4749 | and `zSig2', and returns the proper quadruple-precision floating-point value
4750 | corresponding to the abstract input. Ordinarily, the abstract value is
4751 | simply rounded and packed into the quadruple-precision format, with the
4752 | inexact exception raised if the abstract input cannot be represented
4753 | exactly. However, if the abstract value is too large, the overflow and
4754 | inexact exceptions are raised and an infinity or maximal finite value is
4755 | returned. If the abstract value is too small, the input value is rounded to
4756 | a subnormal number, and the underflow and inexact exceptions are raised if
4757 | the abstract input cannot be represented exactly as a subnormal quadruple-
4758 | precision floating-point number.
4759 | The input significand must be normalized or smaller. If the input
4760 | significand is not normalized, `zExp' must be 0; in that case, the result
4761 | returned is a subnormal number, and it must not require rounding. In the
4762 | usual case that the input significand is normalized, `zExp' must be 1 less
4763 | than the ``true'' floating-point exponent. The handling of underflow and
4764 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4765 *----------------------------------------------------------------------------*/
4767 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4768 uint64_t zSig0
, uint64_t zSig1
,
4769 uint64_t zSig2
, float_status
*status
)
4771 int8_t roundingMode
;
4772 bool roundNearestEven
, increment
, isTiny
;
4774 roundingMode
= status
->float_rounding_mode
;
4775 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4776 switch (roundingMode
) {
4777 case float_round_nearest_even
:
4778 case float_round_ties_away
:
4779 increment
= ((int64_t)zSig2
< 0);
4781 case float_round_to_zero
:
4784 case float_round_up
:
4785 increment
= !zSign
&& zSig2
;
4787 case float_round_down
:
4788 increment
= zSign
&& zSig2
;
4790 case float_round_to_odd
:
4791 increment
= !(zSig1
& 0x1) && zSig2
;
4796 if ( 0x7FFD <= (uint32_t) zExp
) {
4797 if ( ( 0x7FFD < zExp
)
4798 || ( ( zExp
== 0x7FFD )
4800 UINT64_C(0x0001FFFFFFFFFFFF),
4801 UINT64_C(0xFFFFFFFFFFFFFFFF),
4808 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4809 if ( ( roundingMode
== float_round_to_zero
)
4810 || ( zSign
&& ( roundingMode
== float_round_up
) )
4811 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4812 || (roundingMode
== float_round_to_odd
)
4818 UINT64_C(0x0000FFFFFFFFFFFF),
4819 UINT64_C(0xFFFFFFFFFFFFFFFF)
4822 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4825 if (status
->flush_to_zero
) {
4826 float_raise(float_flag_output_denormal
, status
);
4827 return packFloat128(zSign
, 0, 0, 0);
4829 isTiny
= status
->tininess_before_rounding
4832 || lt128(zSig0
, zSig1
,
4833 UINT64_C(0x0001FFFFFFFFFFFF),
4834 UINT64_C(0xFFFFFFFFFFFFFFFF));
4835 shift128ExtraRightJamming(
4836 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4838 if (isTiny
&& zSig2
) {
4839 float_raise(float_flag_underflow
, status
);
4841 switch (roundingMode
) {
4842 case float_round_nearest_even
:
4843 case float_round_ties_away
:
4844 increment
= ((int64_t)zSig2
< 0);
4846 case float_round_to_zero
:
4849 case float_round_up
:
4850 increment
= !zSign
&& zSig2
;
4852 case float_round_down
:
4853 increment
= zSign
&& zSig2
;
4855 case float_round_to_odd
:
4856 increment
= !(zSig1
& 0x1) && zSig2
;
4864 float_raise(float_flag_inexact
, status
);
4867 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4868 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4873 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4875 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4879 /*----------------------------------------------------------------------------
4880 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4881 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4882 | returns the proper quadruple-precision floating-point value corresponding
4883 | to the abstract input. This routine is just like `roundAndPackFloat128'
4884 | except that the input significand has fewer bits and does not have to be
4885 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4887 *----------------------------------------------------------------------------*/
4889 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4890 uint64_t zSig0
, uint64_t zSig1
,
4891 float_status
*status
)
4901 shiftCount
= clz64(zSig0
) - 15;
4902 if ( 0 <= shiftCount
) {
4904 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4907 shift128ExtraRightJamming(
4908 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4911 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4916 /*----------------------------------------------------------------------------
4917 | Returns the result of converting the 32-bit two's complement integer `a'
4918 | to the extended double-precision floating-point format. The conversion
4919 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4921 *----------------------------------------------------------------------------*/
4923 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4930 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4932 absA
= zSign
? - a
: a
;
4933 shiftCount
= clz32(absA
) + 32;
4935 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4939 /*----------------------------------------------------------------------------
4940 | Returns the result of converting the 32-bit two's complement integer `a' to
4941 | the quadruple-precision floating-point format. The conversion is performed
4942 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4943 *----------------------------------------------------------------------------*/
4945 float128
int32_to_float128(int32_t a
, float_status
*status
)
4952 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4954 absA
= zSign
? - a
: a
;
4955 shiftCount
= clz32(absA
) + 17;
4957 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4961 /*----------------------------------------------------------------------------
4962 | Returns the result of converting the 64-bit two's complement integer `a'
4963 | to the extended double-precision floating-point format. The conversion
4964 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4966 *----------------------------------------------------------------------------*/
4968 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4974 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4976 absA
= zSign
? - a
: a
;
4977 shiftCount
= clz64(absA
);
4978 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4982 /*----------------------------------------------------------------------------
4983 | Returns the result of converting the 64-bit two's complement integer `a' to
4984 | the quadruple-precision floating-point format. The conversion is performed
4985 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4988 float128
int64_to_float128(int64_t a
, float_status
*status
)
4994 uint64_t zSig0
, zSig1
;
4996 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4998 absA
= zSign
? - a
: a
;
4999 shiftCount
= clz64(absA
) + 49;
5000 zExp
= 0x406E - shiftCount
;
5001 if ( 64 <= shiftCount
) {
5010 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
5011 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
5015 /*----------------------------------------------------------------------------
5016 | Returns the result of converting the 64-bit unsigned integer `a'
5017 | to the quadruple-precision floating-point format. The conversion is performed
5018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5019 *----------------------------------------------------------------------------*/
5021 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
5024 return float128_zero
;
5026 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
5029 /*----------------------------------------------------------------------------
5030 | Returns the result of converting the single-precision floating-point value
5031 | `a' to the extended double-precision floating-point format. The conversion
5032 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5034 *----------------------------------------------------------------------------*/
5036 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
5042 a
= float32_squash_input_denormal(a
, status
);
5043 aSig
= extractFloat32Frac( a
);
5044 aExp
= extractFloat32Exp( a
);
5045 aSign
= extractFloat32Sign( a
);
5046 if ( aExp
== 0xFF ) {
5048 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
5050 return floatx80_silence_nan(res
, status
);
5052 return packFloatx80(aSign
,
5053 floatx80_infinity_high
,
5054 floatx80_infinity_low
);
5057 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5058 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5061 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
5065 /*----------------------------------------------------------------------------
5066 | Returns the result of converting the single-precision floating-point value
5067 | `a' to the double-precision floating-point format. The conversion is
5068 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5070 *----------------------------------------------------------------------------*/
5072 float128
float32_to_float128(float32 a
, float_status
*status
)
5078 a
= float32_squash_input_denormal(a
, status
);
5079 aSig
= extractFloat32Frac( a
);
5080 aExp
= extractFloat32Exp( a
);
5081 aSign
= extractFloat32Sign( a
);
5082 if ( aExp
== 0xFF ) {
5084 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
5086 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5089 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5090 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5093 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
5097 /*----------------------------------------------------------------------------
5098 | Returns the remainder of the single-precision floating-point value `a'
5099 | with respect to the corresponding value `b'. The operation is performed
5100 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5101 *----------------------------------------------------------------------------*/
5103 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
5106 int aExp
, bExp
, expDiff
;
5107 uint32_t aSig
, bSig
;
5109 uint64_t aSig64
, bSig64
, q64
;
5110 uint32_t alternateASig
;
5112 a
= float32_squash_input_denormal(a
, status
);
5113 b
= float32_squash_input_denormal(b
, status
);
5115 aSig
= extractFloat32Frac( a
);
5116 aExp
= extractFloat32Exp( a
);
5117 aSign
= extractFloat32Sign( a
);
5118 bSig
= extractFloat32Frac( b
);
5119 bExp
= extractFloat32Exp( b
);
5120 if ( aExp
== 0xFF ) {
5121 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
5122 return propagateFloat32NaN(a
, b
, status
);
5124 float_raise(float_flag_invalid
, status
);
5125 return float32_default_nan(status
);
5127 if ( bExp
== 0xFF ) {
5129 return propagateFloat32NaN(a
, b
, status
);
5135 float_raise(float_flag_invalid
, status
);
5136 return float32_default_nan(status
);
5138 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
5141 if ( aSig
== 0 ) return a
;
5142 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5144 expDiff
= aExp
- bExp
;
5147 if ( expDiff
< 32 ) {
5150 if ( expDiff
< 0 ) {
5151 if ( expDiff
< -1 ) return a
;
5154 q
= ( bSig
<= aSig
);
5155 if ( q
) aSig
-= bSig
;
5156 if ( 0 < expDiff
) {
5157 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
5160 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5168 if ( bSig
<= aSig
) aSig
-= bSig
;
5169 aSig64
= ( (uint64_t) aSig
)<<40;
5170 bSig64
= ( (uint64_t) bSig
)<<40;
5172 while ( 0 < expDiff
) {
5173 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5174 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5175 aSig64
= - ( ( bSig
* q64
)<<38 );
5179 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5180 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5181 q
= q64
>>( 64 - expDiff
);
5183 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
5186 alternateASig
= aSig
;
5189 } while ( 0 <= (int32_t) aSig
);
5190 sigMean
= aSig
+ alternateASig
;
5191 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5192 aSig
= alternateASig
;
5194 zSign
= ( (int32_t) aSig
< 0 );
5195 if ( zSign
) aSig
= - aSig
;
5196 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5201 /*----------------------------------------------------------------------------
5202 | Returns the binary exponential of the single-precision floating-point value
5203 | `a'. The operation is performed according to the IEC/IEEE Standard for
5204 | Binary Floating-Point Arithmetic.
5206 | Uses the following identities:
5208 | 1. -------------------------------------------------------------------------
5212 | 2. -------------------------------------------------------------------------
5215 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5217 *----------------------------------------------------------------------------*/
5219 static const float64 float32_exp2_coefficients
[15] =
5221 const_float64( 0x3ff0000000000000ll
), /* 1 */
5222 const_float64( 0x3fe0000000000000ll
), /* 2 */
5223 const_float64( 0x3fc5555555555555ll
), /* 3 */
5224 const_float64( 0x3fa5555555555555ll
), /* 4 */
5225 const_float64( 0x3f81111111111111ll
), /* 5 */
5226 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5227 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5228 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5229 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5230 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5231 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5232 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5233 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5234 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5235 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
5238 float32
float32_exp2(float32 a
, float_status
*status
)
5245 a
= float32_squash_input_denormal(a
, status
);
5247 aSig
= extractFloat32Frac( a
);
5248 aExp
= extractFloat32Exp( a
);
5249 aSign
= extractFloat32Sign( a
);
5251 if ( aExp
== 0xFF) {
5253 return propagateFloat32NaN(a
, float32_zero
, status
);
5255 return (aSign
) ? float32_zero
: a
;
5258 if (aSig
== 0) return float32_one
;
5261 float_raise(float_flag_inexact
, status
);
5263 /* ******************************* */
5264 /* using float64 for approximation */
5265 /* ******************************* */
5266 x
= float32_to_float64(a
, status
);
5267 x
= float64_mul(x
, float64_ln2
, status
);
5271 for (i
= 0 ; i
< 15 ; i
++) {
5274 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5275 r
= float64_add(r
, f
, status
);
5277 xn
= float64_mul(xn
, x
, status
);
5280 return float64_to_float32(r
, status
);
5283 /*----------------------------------------------------------------------------
5284 | Returns the binary log of the single-precision floating-point value `a'.
5285 | The operation is performed according to the IEC/IEEE Standard for Binary
5286 | Floating-Point Arithmetic.
5287 *----------------------------------------------------------------------------*/
5288 float32
float32_log2(float32 a
, float_status
*status
)
5292 uint32_t aSig
, zSig
, i
;
5294 a
= float32_squash_input_denormal(a
, status
);
5295 aSig
= extractFloat32Frac( a
);
5296 aExp
= extractFloat32Exp( a
);
5297 aSign
= extractFloat32Sign( a
);
5300 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5301 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5304 float_raise(float_flag_invalid
, status
);
5305 return float32_default_nan(status
);
5307 if ( aExp
== 0xFF ) {
5309 return propagateFloat32NaN(a
, float32_zero
, status
);
5319 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5320 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5321 if ( aSig
& 0x01000000 ) {
5330 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5333 /*----------------------------------------------------------------------------
5334 | Returns the result of converting the double-precision floating-point value
5335 | `a' to the extended double-precision floating-point format. The conversion
5336 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5338 *----------------------------------------------------------------------------*/
5340 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5346 a
= float64_squash_input_denormal(a
, status
);
5347 aSig
= extractFloat64Frac( a
);
5348 aExp
= extractFloat64Exp( a
);
5349 aSign
= extractFloat64Sign( a
);
5350 if ( aExp
== 0x7FF ) {
5352 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5354 return floatx80_silence_nan(res
, status
);
5356 return packFloatx80(aSign
,
5357 floatx80_infinity_high
,
5358 floatx80_infinity_low
);
5361 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5362 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5366 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5370 /*----------------------------------------------------------------------------
5371 | Returns the result of converting the double-precision floating-point value
5372 | `a' to the quadruple-precision floating-point format. The conversion is
5373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5375 *----------------------------------------------------------------------------*/
5377 float128
float64_to_float128(float64 a
, float_status
*status
)
5381 uint64_t aSig
, zSig0
, zSig1
;
5383 a
= float64_squash_input_denormal(a
, status
);
5384 aSig
= extractFloat64Frac( a
);
5385 aExp
= extractFloat64Exp( a
);
5386 aSign
= extractFloat64Sign( a
);
5387 if ( aExp
== 0x7FF ) {
5389 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5391 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5394 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5395 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5398 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5399 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5404 /*----------------------------------------------------------------------------
5405 | Returns the remainder of the double-precision floating-point value `a'
5406 | with respect to the corresponding value `b'. The operation is performed
5407 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5408 *----------------------------------------------------------------------------*/
5410 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5413 int aExp
, bExp
, expDiff
;
5414 uint64_t aSig
, bSig
;
5415 uint64_t q
, alternateASig
;
5418 a
= float64_squash_input_denormal(a
, status
);
5419 b
= float64_squash_input_denormal(b
, status
);
5420 aSig
= extractFloat64Frac( a
);
5421 aExp
= extractFloat64Exp( a
);
5422 aSign
= extractFloat64Sign( a
);
5423 bSig
= extractFloat64Frac( b
);
5424 bExp
= extractFloat64Exp( b
);
5425 if ( aExp
== 0x7FF ) {
5426 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5427 return propagateFloat64NaN(a
, b
, status
);
5429 float_raise(float_flag_invalid
, status
);
5430 return float64_default_nan(status
);
5432 if ( bExp
== 0x7FF ) {
5434 return propagateFloat64NaN(a
, b
, status
);
5440 float_raise(float_flag_invalid
, status
);
5441 return float64_default_nan(status
);
5443 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5446 if ( aSig
== 0 ) return a
;
5447 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5449 expDiff
= aExp
- bExp
;
5450 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5451 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5452 if ( expDiff
< 0 ) {
5453 if ( expDiff
< -1 ) return a
;
5456 q
= ( bSig
<= aSig
);
5457 if ( q
) aSig
-= bSig
;
5459 while ( 0 < expDiff
) {
5460 q
= estimateDiv128To64( aSig
, 0, bSig
);
5461 q
= ( 2 < q
) ? q
- 2 : 0;
5462 aSig
= - ( ( bSig
>>2 ) * q
);
5466 if ( 0 < expDiff
) {
5467 q
= estimateDiv128To64( aSig
, 0, bSig
);
5468 q
= ( 2 < q
) ? q
- 2 : 0;
5471 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5478 alternateASig
= aSig
;
5481 } while ( 0 <= (int64_t) aSig
);
5482 sigMean
= aSig
+ alternateASig
;
5483 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5484 aSig
= alternateASig
;
5486 zSign
= ( (int64_t) aSig
< 0 );
5487 if ( zSign
) aSig
= - aSig
;
5488 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5492 /*----------------------------------------------------------------------------
5493 | Returns the binary log of the double-precision floating-point value `a'.
5494 | The operation is performed according to the IEC/IEEE Standard for Binary
5495 | Floating-Point Arithmetic.
5496 *----------------------------------------------------------------------------*/
5497 float64
float64_log2(float64 a
, float_status
*status
)
5501 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5502 a
= float64_squash_input_denormal(a
, status
);
5504 aSig
= extractFloat64Frac( a
);
5505 aExp
= extractFloat64Exp( a
);
5506 aSign
= extractFloat64Sign( a
);
5509 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5510 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5513 float_raise(float_flag_invalid
, status
);
5514 return float64_default_nan(status
);
5516 if ( aExp
== 0x7FF ) {
5518 return propagateFloat64NaN(a
, float64_zero
, status
);
5524 aSig
|= UINT64_C(0x0010000000000000);
5526 zSig
= (uint64_t)aExp
<< 52;
5527 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5528 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5529 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5530 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5538 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5541 /*----------------------------------------------------------------------------
5542 | Returns the result of converting the extended double-precision floating-
5543 | point value `a' to the 32-bit two's complement integer format. The
5544 | conversion is performed according to the IEC/IEEE Standard for Binary
5545 | Floating-Point Arithmetic---which means in particular that the conversion
5546 | is rounded according to the current rounding mode. If `a' is a NaN, the
5547 | largest positive integer is returned. Otherwise, if the conversion
5548 | overflows, the largest integer with the same sign as `a' is returned.
5549 *----------------------------------------------------------------------------*/
5551 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5554 int32_t aExp
, shiftCount
;
5557 if (floatx80_invalid_encoding(a
)) {
5558 float_raise(float_flag_invalid
, status
);
5561 aSig
= extractFloatx80Frac( a
);
5562 aExp
= extractFloatx80Exp( a
);
5563 aSign
= extractFloatx80Sign( a
);
5564 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5565 shiftCount
= 0x4037 - aExp
;
5566 if ( shiftCount
<= 0 ) shiftCount
= 1;
5567 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5568 return roundAndPackInt32(aSign
, aSig
, status
);
5572 /*----------------------------------------------------------------------------
5573 | Returns the result of converting the extended double-precision floating-
5574 | point value `a' to the 32-bit two's complement integer format. The
5575 | conversion is performed according to the IEC/IEEE Standard for Binary
5576 | Floating-Point Arithmetic, except that the conversion is always rounded
5577 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5578 | Otherwise, if the conversion overflows, the largest integer with the same
5579 | sign as `a' is returned.
5580 *----------------------------------------------------------------------------*/
5582 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5585 int32_t aExp
, shiftCount
;
5586 uint64_t aSig
, savedASig
;
5589 if (floatx80_invalid_encoding(a
)) {
5590 float_raise(float_flag_invalid
, status
);
5593 aSig
= extractFloatx80Frac( a
);
5594 aExp
= extractFloatx80Exp( a
);
5595 aSign
= extractFloatx80Sign( a
);
5596 if ( 0x401E < aExp
) {
5597 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5600 else if ( aExp
< 0x3FFF ) {
5602 float_raise(float_flag_inexact
, status
);
5606 shiftCount
= 0x403E - aExp
;
5608 aSig
>>= shiftCount
;
5610 if ( aSign
) z
= - z
;
5611 if ( ( z
< 0 ) ^ aSign
) {
5613 float_raise(float_flag_invalid
, status
);
5614 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5616 if ( ( aSig
<<shiftCount
) != savedASig
) {
5617 float_raise(float_flag_inexact
, status
);
5623 /*----------------------------------------------------------------------------
5624 | Returns the result of converting the extended double-precision floating-
5625 | point value `a' to the 64-bit two's complement integer format. The
5626 | conversion is performed according to the IEC/IEEE Standard for Binary
5627 | Floating-Point Arithmetic---which means in particular that the conversion
5628 | is rounded according to the current rounding mode. If `a' is a NaN,
5629 | the largest positive integer is returned. Otherwise, if the conversion
5630 | overflows, the largest integer with the same sign as `a' is returned.
5631 *----------------------------------------------------------------------------*/
5633 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5636 int32_t aExp
, shiftCount
;
5637 uint64_t aSig
, aSigExtra
;
5639 if (floatx80_invalid_encoding(a
)) {
5640 float_raise(float_flag_invalid
, status
);
5643 aSig
= extractFloatx80Frac( a
);
5644 aExp
= extractFloatx80Exp( a
);
5645 aSign
= extractFloatx80Sign( a
);
5646 shiftCount
= 0x403E - aExp
;
5647 if ( shiftCount
<= 0 ) {
5649 float_raise(float_flag_invalid
, status
);
5650 if (!aSign
|| floatx80_is_any_nan(a
)) {
5658 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5660 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5664 /*----------------------------------------------------------------------------
5665 | Returns the result of converting the extended double-precision floating-
5666 | point value `a' to the 64-bit two's complement integer format. The
5667 | conversion is performed according to the IEC/IEEE Standard for Binary
5668 | Floating-Point Arithmetic, except that the conversion is always rounded
5669 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5670 | Otherwise, if the conversion overflows, the largest integer with the same
5671 | sign as `a' is returned.
5672 *----------------------------------------------------------------------------*/
5674 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5677 int32_t aExp
, shiftCount
;
5681 if (floatx80_invalid_encoding(a
)) {
5682 float_raise(float_flag_invalid
, status
);
5685 aSig
= extractFloatx80Frac( a
);
5686 aExp
= extractFloatx80Exp( a
);
5687 aSign
= extractFloatx80Sign( a
);
5688 shiftCount
= aExp
- 0x403E;
5689 if ( 0 <= shiftCount
) {
5690 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5691 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5692 float_raise(float_flag_invalid
, status
);
5693 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5699 else if ( aExp
< 0x3FFF ) {
5701 float_raise(float_flag_inexact
, status
);
5705 z
= aSig
>>( - shiftCount
);
5706 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5707 float_raise(float_flag_inexact
, status
);
5709 if ( aSign
) z
= - z
;
5714 /*----------------------------------------------------------------------------
5715 | Returns the result of converting the extended double-precision floating-
5716 | point value `a' to the single-precision floating-point format. The
5717 | conversion is performed according to the IEC/IEEE Standard for Binary
5718 | Floating-Point Arithmetic.
5719 *----------------------------------------------------------------------------*/
5721 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5727 if (floatx80_invalid_encoding(a
)) {
5728 float_raise(float_flag_invalid
, status
);
5729 return float32_default_nan(status
);
5731 aSig
= extractFloatx80Frac( a
);
5732 aExp
= extractFloatx80Exp( a
);
5733 aSign
= extractFloatx80Sign( a
);
5734 if ( aExp
== 0x7FFF ) {
5735 if ( (uint64_t) ( aSig
<<1 ) ) {
5736 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5738 return float32_silence_nan(res
, status
);
5740 return packFloat32( aSign
, 0xFF, 0 );
5742 shift64RightJamming( aSig
, 33, &aSig
);
5743 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5744 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5748 /*----------------------------------------------------------------------------
5749 | Returns the result of converting the extended double-precision floating-
5750 | point value `a' to the double-precision floating-point format. The
5751 | conversion is performed according to the IEC/IEEE Standard for Binary
5752 | Floating-Point Arithmetic.
5753 *----------------------------------------------------------------------------*/
5755 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5759 uint64_t aSig
, zSig
;
5761 if (floatx80_invalid_encoding(a
)) {
5762 float_raise(float_flag_invalid
, status
);
5763 return float64_default_nan(status
);
5765 aSig
= extractFloatx80Frac( a
);
5766 aExp
= extractFloatx80Exp( a
);
5767 aSign
= extractFloatx80Sign( a
);
5768 if ( aExp
== 0x7FFF ) {
5769 if ( (uint64_t) ( aSig
<<1 ) ) {
5770 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5772 return float64_silence_nan(res
, status
);
5774 return packFloat64( aSign
, 0x7FF, 0 );
5776 shift64RightJamming( aSig
, 1, &zSig
);
5777 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5778 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5782 /*----------------------------------------------------------------------------
5783 | Returns the result of converting the extended double-precision floating-
5784 | point value `a' to the quadruple-precision floating-point format. The
5785 | conversion is performed according to the IEC/IEEE Standard for Binary
5786 | Floating-Point Arithmetic.
5787 *----------------------------------------------------------------------------*/
5789 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5793 uint64_t aSig
, zSig0
, zSig1
;
5795 if (floatx80_invalid_encoding(a
)) {
5796 float_raise(float_flag_invalid
, status
);
5797 return float128_default_nan(status
);
5799 aSig
= extractFloatx80Frac( a
);
5800 aExp
= extractFloatx80Exp( a
);
5801 aSign
= extractFloatx80Sign( a
);
5802 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5803 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5805 return float128_silence_nan(res
, status
);
5807 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5808 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5812 /*----------------------------------------------------------------------------
5813 | Rounds the extended double-precision floating-point value `a'
5814 | to the precision provided by floatx80_rounding_precision and returns the
5815 | result as an extended double-precision floating-point value.
5816 | The operation is performed according to the IEC/IEEE Standard for Binary
5817 | Floating-Point Arithmetic.
5818 *----------------------------------------------------------------------------*/
5820 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5822 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5823 extractFloatx80Sign(a
),
5824 extractFloatx80Exp(a
),
5825 extractFloatx80Frac(a
), 0, status
);
5828 /*----------------------------------------------------------------------------
5829 | Rounds the extended double-precision floating-point value `a' to an integer,
5830 | and returns the result as an extended quadruple-precision floating-point
5831 | value. The operation is performed according to the IEC/IEEE Standard for
5832 | Binary Floating-Point Arithmetic.
5833 *----------------------------------------------------------------------------*/
5835 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5839 uint64_t lastBitMask
, roundBitsMask
;
5842 if (floatx80_invalid_encoding(a
)) {
5843 float_raise(float_flag_invalid
, status
);
5844 return floatx80_default_nan(status
);
5846 aExp
= extractFloatx80Exp( a
);
5847 if ( 0x403E <= aExp
) {
5848 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5849 return propagateFloatx80NaN(a
, a
, status
);
5853 if ( aExp
< 0x3FFF ) {
5855 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5858 float_raise(float_flag_inexact
, status
);
5859 aSign
= extractFloatx80Sign( a
);
5860 switch (status
->float_rounding_mode
) {
5861 case float_round_nearest_even
:
5862 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5865 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5868 case float_round_ties_away
:
5869 if (aExp
== 0x3FFE) {
5870 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5873 case float_round_down
:
5876 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5877 : packFloatx80( 0, 0, 0 );
5878 case float_round_up
:
5880 aSign
? packFloatx80( 1, 0, 0 )
5881 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5883 case float_round_to_zero
:
5886 g_assert_not_reached();
5888 return packFloatx80( aSign
, 0, 0 );
5891 lastBitMask
<<= 0x403E - aExp
;
5892 roundBitsMask
= lastBitMask
- 1;
5894 switch (status
->float_rounding_mode
) {
5895 case float_round_nearest_even
:
5896 z
.low
+= lastBitMask
>>1;
5897 if ((z
.low
& roundBitsMask
) == 0) {
5898 z
.low
&= ~lastBitMask
;
5901 case float_round_ties_away
:
5902 z
.low
+= lastBitMask
>> 1;
5904 case float_round_to_zero
:
5906 case float_round_up
:
5907 if (!extractFloatx80Sign(z
)) {
5908 z
.low
+= roundBitsMask
;
5911 case float_round_down
:
5912 if (extractFloatx80Sign(z
)) {
5913 z
.low
+= roundBitsMask
;
5919 z
.low
&= ~ roundBitsMask
;
5922 z
.low
= UINT64_C(0x8000000000000000);
5924 if (z
.low
!= a
.low
) {
5925 float_raise(float_flag_inexact
, status
);
5931 /*----------------------------------------------------------------------------
5932 | Returns the result of adding the absolute values of the extended double-
5933 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5934 | negated before being returned. `zSign' is ignored if the result is a NaN.
5935 | The addition is performed according to the IEC/IEEE Standard for Binary
5936 | Floating-Point Arithmetic.
5937 *----------------------------------------------------------------------------*/
5939 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5940 float_status
*status
)
5942 int32_t aExp
, bExp
, zExp
;
5943 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5946 aSig
= extractFloatx80Frac( a
);
5947 aExp
= extractFloatx80Exp( a
);
5948 bSig
= extractFloatx80Frac( b
);
5949 bExp
= extractFloatx80Exp( b
);
5950 expDiff
= aExp
- bExp
;
5951 if ( 0 < expDiff
) {
5952 if ( aExp
== 0x7FFF ) {
5953 if ((uint64_t)(aSig
<< 1)) {
5954 return propagateFloatx80NaN(a
, b
, status
);
5958 if ( bExp
== 0 ) --expDiff
;
5959 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5962 else if ( expDiff
< 0 ) {
5963 if ( bExp
== 0x7FFF ) {
5964 if ((uint64_t)(bSig
<< 1)) {
5965 return propagateFloatx80NaN(a
, b
, status
);
5967 return packFloatx80(zSign
,
5968 floatx80_infinity_high
,
5969 floatx80_infinity_low
);
5971 if ( aExp
== 0 ) ++expDiff
;
5972 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5976 if ( aExp
== 0x7FFF ) {
5977 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5978 return propagateFloatx80NaN(a
, b
, status
);
5983 zSig0
= aSig
+ bSig
;
5985 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5986 /* At least one of the values is a pseudo-denormal,
5987 * and there is a carry out of the result. */
5992 return packFloatx80(zSign
, 0, 0);
5994 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
6000 zSig0
= aSig
+ bSig
;
6001 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
6003 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6004 zSig0
|= UINT64_C(0x8000000000000000);
6007 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6008 zSign
, zExp
, zSig0
, zSig1
, status
);
6011 /*----------------------------------------------------------------------------
6012 | Returns the result of subtracting the absolute values of the extended
6013 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
6014 | difference is negated before being returned. `zSign' is ignored if the
6015 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6016 | Standard for Binary Floating-Point Arithmetic.
6017 *----------------------------------------------------------------------------*/
6019 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
6020 float_status
*status
)
6022 int32_t aExp
, bExp
, zExp
;
6023 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6026 aSig
= extractFloatx80Frac( a
);
6027 aExp
= extractFloatx80Exp( a
);
6028 bSig
= extractFloatx80Frac( b
);
6029 bExp
= extractFloatx80Exp( b
);
6030 expDiff
= aExp
- bExp
;
6031 if ( 0 < expDiff
) goto aExpBigger
;
6032 if ( expDiff
< 0 ) goto bExpBigger
;
6033 if ( aExp
== 0x7FFF ) {
6034 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
6035 return propagateFloatx80NaN(a
, b
, status
);
6037 float_raise(float_flag_invalid
, status
);
6038 return floatx80_default_nan(status
);
6045 if ( bSig
< aSig
) goto aBigger
;
6046 if ( aSig
< bSig
) goto bBigger
;
6047 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
6049 if ( bExp
== 0x7FFF ) {
6050 if ((uint64_t)(bSig
<< 1)) {
6051 return propagateFloatx80NaN(a
, b
, status
);
6053 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
6054 floatx80_infinity_low
);
6056 if ( aExp
== 0 ) ++expDiff
;
6057 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
6059 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
6062 goto normalizeRoundAndPack
;
6064 if ( aExp
== 0x7FFF ) {
6065 if ((uint64_t)(aSig
<< 1)) {
6066 return propagateFloatx80NaN(a
, b
, status
);
6070 if ( bExp
== 0 ) --expDiff
;
6071 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
6073 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
6075 normalizeRoundAndPack
:
6076 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
6077 zSign
, zExp
, zSig0
, zSig1
, status
);
6080 /*----------------------------------------------------------------------------
6081 | Returns the result of adding the extended double-precision floating-point
6082 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6083 | Standard for Binary Floating-Point Arithmetic.
6084 *----------------------------------------------------------------------------*/
6086 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
6090 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6091 float_raise(float_flag_invalid
, status
);
6092 return floatx80_default_nan(status
);
6094 aSign
= extractFloatx80Sign( a
);
6095 bSign
= extractFloatx80Sign( b
);
6096 if ( aSign
== bSign
) {
6097 return addFloatx80Sigs(a
, b
, aSign
, status
);
6100 return subFloatx80Sigs(a
, b
, aSign
, status
);
6105 /*----------------------------------------------------------------------------
6106 | Returns the result of subtracting the extended double-precision floating-
6107 | point values `a' and `b'. The operation is performed according to the
6108 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6109 *----------------------------------------------------------------------------*/
6111 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
6115 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6116 float_raise(float_flag_invalid
, status
);
6117 return floatx80_default_nan(status
);
6119 aSign
= extractFloatx80Sign( a
);
6120 bSign
= extractFloatx80Sign( b
);
6121 if ( aSign
== bSign
) {
6122 return subFloatx80Sigs(a
, b
, aSign
, status
);
6125 return addFloatx80Sigs(a
, b
, aSign
, status
);
6130 /*----------------------------------------------------------------------------
6131 | Returns the result of multiplying the extended double-precision floating-
6132 | point values `a' and `b'. The operation is performed according to the
6133 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6134 *----------------------------------------------------------------------------*/
6136 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
6138 bool aSign
, bSign
, zSign
;
6139 int32_t aExp
, bExp
, zExp
;
6140 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6142 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6143 float_raise(float_flag_invalid
, status
);
6144 return floatx80_default_nan(status
);
6146 aSig
= extractFloatx80Frac( a
);
6147 aExp
= extractFloatx80Exp( a
);
6148 aSign
= extractFloatx80Sign( a
);
6149 bSig
= extractFloatx80Frac( b
);
6150 bExp
= extractFloatx80Exp( b
);
6151 bSign
= extractFloatx80Sign( b
);
6152 zSign
= aSign
^ bSign
;
6153 if ( aExp
== 0x7FFF ) {
6154 if ( (uint64_t) ( aSig
<<1 )
6155 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6156 return propagateFloatx80NaN(a
, b
, status
);
6158 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6159 return packFloatx80(zSign
, floatx80_infinity_high
,
6160 floatx80_infinity_low
);
6162 if ( bExp
== 0x7FFF ) {
6163 if ((uint64_t)(bSig
<< 1)) {
6164 return propagateFloatx80NaN(a
, b
, status
);
6166 if ( ( aExp
| aSig
) == 0 ) {
6168 float_raise(float_flag_invalid
, status
);
6169 return floatx80_default_nan(status
);
6171 return packFloatx80(zSign
, floatx80_infinity_high
,
6172 floatx80_infinity_low
);
6175 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6176 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6179 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6180 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6182 zExp
= aExp
+ bExp
- 0x3FFE;
6183 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6184 if ( 0 < (int64_t) zSig0
) {
6185 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6188 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6189 zSign
, zExp
, zSig0
, zSig1
, status
);
6192 /*----------------------------------------------------------------------------
6193 | Returns the result of dividing the extended double-precision floating-point
6194 | value `a' by the corresponding value `b'. The operation is performed
6195 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6196 *----------------------------------------------------------------------------*/
6198 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6200 bool aSign
, bSign
, zSign
;
6201 int32_t aExp
, bExp
, zExp
;
6202 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6203 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6205 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6206 float_raise(float_flag_invalid
, status
);
6207 return floatx80_default_nan(status
);
6209 aSig
= extractFloatx80Frac( a
);
6210 aExp
= extractFloatx80Exp( a
);
6211 aSign
= extractFloatx80Sign( a
);
6212 bSig
= extractFloatx80Frac( b
);
6213 bExp
= extractFloatx80Exp( b
);
6214 bSign
= extractFloatx80Sign( b
);
6215 zSign
= aSign
^ bSign
;
6216 if ( aExp
== 0x7FFF ) {
6217 if ((uint64_t)(aSig
<< 1)) {
6218 return propagateFloatx80NaN(a
, b
, status
);
6220 if ( bExp
== 0x7FFF ) {
6221 if ((uint64_t)(bSig
<< 1)) {
6222 return propagateFloatx80NaN(a
, b
, status
);
6226 return packFloatx80(zSign
, floatx80_infinity_high
,
6227 floatx80_infinity_low
);
6229 if ( bExp
== 0x7FFF ) {
6230 if ((uint64_t)(bSig
<< 1)) {
6231 return propagateFloatx80NaN(a
, b
, status
);
6233 return packFloatx80( zSign
, 0, 0 );
6237 if ( ( aExp
| aSig
) == 0 ) {
6239 float_raise(float_flag_invalid
, status
);
6240 return floatx80_default_nan(status
);
6242 float_raise(float_flag_divbyzero
, status
);
6243 return packFloatx80(zSign
, floatx80_infinity_high
,
6244 floatx80_infinity_low
);
6246 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6249 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6250 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6252 zExp
= aExp
- bExp
+ 0x3FFE;
6254 if ( bSig
<= aSig
) {
6255 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6258 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6259 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6260 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6261 while ( (int64_t) rem0
< 0 ) {
6263 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6265 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6266 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6267 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6268 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6269 while ( (int64_t) rem1
< 0 ) {
6271 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6273 zSig1
|= ( ( rem1
| rem2
) != 0 );
6275 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6276 zSign
, zExp
, zSig0
, zSig1
, status
);
6279 /*----------------------------------------------------------------------------
6280 | Returns the remainder of the extended double-precision floating-point value
6281 | `a' with respect to the corresponding value `b'. The operation is performed
6282 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6283 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6284 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6285 | the absolute value of the integer quotient.
6286 *----------------------------------------------------------------------------*/
6288 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6289 float_status
*status
)
6292 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6293 uint64_t aSig0
, aSig1
, bSig
;
6294 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6297 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6298 float_raise(float_flag_invalid
, status
);
6299 return floatx80_default_nan(status
);
6301 aSig0
= extractFloatx80Frac( a
);
6302 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6303 aSign
= extractFloatx80Sign( a
);
6304 bSig
= extractFloatx80Frac( b
);
6305 bExp
= extractFloatx80Exp( b
);
6306 if ( aExp
== 0x7FFF ) {
6307 if ( (uint64_t) ( aSig0
<<1 )
6308 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6309 return propagateFloatx80NaN(a
, b
, status
);
6313 if ( bExp
== 0x7FFF ) {
6314 if ((uint64_t)(bSig
<< 1)) {
6315 return propagateFloatx80NaN(a
, b
, status
);
6317 if (aExp
== 0 && aSig0
>> 63) {
6319 * Pseudo-denormal argument must be returned in normalized
6322 return packFloatx80(aSign
, 1, aSig0
);
6329 float_raise(float_flag_invalid
, status
);
6330 return floatx80_default_nan(status
);
6332 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6335 if ( aSig0
== 0 ) return a
;
6336 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6339 expDiff
= aExp
- bExp
;
6341 if ( expDiff
< 0 ) {
6342 if ( mod
|| expDiff
< -1 ) {
6343 if (aExp
== 1 && aExpOrig
== 0) {
6345 * Pseudo-denormal argument must be returned in
6348 return packFloatx80(aSign
, aExp
, aSig0
);
6352 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6355 *quotient
= q
= ( bSig
<= aSig0
);
6356 if ( q
) aSig0
-= bSig
;
6358 while ( 0 < expDiff
) {
6359 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6360 q
= ( 2 < q
) ? q
- 2 : 0;
6361 mul64To128( bSig
, q
, &term0
, &term1
);
6362 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6363 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6369 if ( 0 < expDiff
) {
6370 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6371 q
= ( 2 < q
) ? q
- 2 : 0;
6373 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6374 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6375 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6376 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6378 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6381 *quotient
<<= expDiff
;
6392 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6393 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6394 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6397 aSig0
= alternateASig0
;
6398 aSig1
= alternateASig1
;
6404 normalizeRoundAndPackFloatx80(
6405 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6409 /*----------------------------------------------------------------------------
6410 | Returns the remainder of the extended double-precision floating-point value
6411 | `a' with respect to the corresponding value `b'. The operation is performed
6412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6413 *----------------------------------------------------------------------------*/
6415 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6418 return floatx80_modrem(a
, b
, false, "ient
, status
);
/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b', with the quotient truncated
| toward zero.
*----------------------------------------------------------------------------*/
6427 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6430 return floatx80_modrem(a
, b
, true, "ient
, status
);
6433 /*----------------------------------------------------------------------------
6434 | Returns the square root of the extended double-precision floating-point
6435 | value `a'. The operation is performed according to the IEC/IEEE Standard
6436 | for Binary Floating-Point Arithmetic.
6437 *----------------------------------------------------------------------------*/
6439 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6443 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6444 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6446 if (floatx80_invalid_encoding(a
)) {
6447 float_raise(float_flag_invalid
, status
);
6448 return floatx80_default_nan(status
);
6450 aSig0
= extractFloatx80Frac( a
);
6451 aExp
= extractFloatx80Exp( a
);
6452 aSign
= extractFloatx80Sign( a
);
6453 if ( aExp
== 0x7FFF ) {
6454 if ((uint64_t)(aSig0
<< 1)) {
6455 return propagateFloatx80NaN(a
, a
, status
);
6457 if ( ! aSign
) return a
;
6461 if ( ( aExp
| aSig0
) == 0 ) return a
;
6463 float_raise(float_flag_invalid
, status
);
6464 return floatx80_default_nan(status
);
6467 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6468 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6470 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6471 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6472 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6473 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6474 doubleZSig0
= zSig0
<<1;
6475 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6476 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6477 while ( (int64_t) rem0
< 0 ) {
6480 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6482 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6483 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6484 if ( zSig1
== 0 ) zSig1
= 1;
6485 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6486 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6487 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6488 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6489 while ( (int64_t) rem1
< 0 ) {
6491 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6493 term2
|= doubleZSig0
;
6494 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6496 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6498 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6499 zSig0
|= doubleZSig0
;
6500 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6501 0, zExp
, zSig0
, zSig1
, status
);
6504 /*----------------------------------------------------------------------------
6505 | Returns the result of converting the quadruple-precision floating-point
6506 | value `a' to the 32-bit two's complement integer format. The conversion
6507 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6508 | Arithmetic---which means in particular that the conversion is rounded
6509 | according to the current rounding mode. If `a' is a NaN, the largest
6510 | positive integer is returned. Otherwise, if the conversion overflows, the
6511 | largest integer with the same sign as `a' is returned.
6512 *----------------------------------------------------------------------------*/
6514 int32_t float128_to_int32(float128 a
, float_status
*status
)
6517 int32_t aExp
, shiftCount
;
6518 uint64_t aSig0
, aSig1
;
6520 aSig1
= extractFloat128Frac1( a
);
6521 aSig0
= extractFloat128Frac0( a
);
6522 aExp
= extractFloat128Exp( a
);
6523 aSign
= extractFloat128Sign( a
);
6524 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6525 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6526 aSig0
|= ( aSig1
!= 0 );
6527 shiftCount
= 0x4028 - aExp
;
6528 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6529 return roundAndPackInt32(aSign
, aSig0
, status
);
6533 /*----------------------------------------------------------------------------
6534 | Returns the result of converting the quadruple-precision floating-point
6535 | value `a' to the 32-bit two's complement integer format. The conversion
6536 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6537 | Arithmetic, except that the conversion is always rounded toward zero. If
6538 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6539 | conversion overflows, the largest integer with the same sign as `a' is
6541 *----------------------------------------------------------------------------*/
6543 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6546 int32_t aExp
, shiftCount
;
6547 uint64_t aSig0
, aSig1
, savedASig
;
6550 aSig1
= extractFloat128Frac1( a
);
6551 aSig0
= extractFloat128Frac0( a
);
6552 aExp
= extractFloat128Exp( a
);
6553 aSign
= extractFloat128Sign( a
);
6554 aSig0
|= ( aSig1
!= 0 );
6555 if ( 0x401E < aExp
) {
6556 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6559 else if ( aExp
< 0x3FFF ) {
6560 if (aExp
|| aSig0
) {
6561 float_raise(float_flag_inexact
, status
);
6565 aSig0
|= UINT64_C(0x0001000000000000);
6566 shiftCount
= 0x402F - aExp
;
6568 aSig0
>>= shiftCount
;
6570 if ( aSign
) z
= - z
;
6571 if ( ( z
< 0 ) ^ aSign
) {
6573 float_raise(float_flag_invalid
, status
);
6574 return aSign
? INT32_MIN
: INT32_MAX
;
6576 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6577 float_raise(float_flag_inexact
, status
);
6583 /*----------------------------------------------------------------------------
6584 | Returns the result of converting the quadruple-precision floating-point
6585 | value `a' to the 64-bit two's complement integer format. The conversion
6586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6587 | Arithmetic---which means in particular that the conversion is rounded
6588 | according to the current rounding mode. If `a' is a NaN, the largest
6589 | positive integer is returned. Otherwise, if the conversion overflows, the
6590 | largest integer with the same sign as `a' is returned.
6591 *----------------------------------------------------------------------------*/
6593 int64_t float128_to_int64(float128 a
, float_status
*status
)
6596 int32_t aExp
, shiftCount
;
6597 uint64_t aSig0
, aSig1
;
6599 aSig1
= extractFloat128Frac1( a
);
6600 aSig0
= extractFloat128Frac0( a
);
6601 aExp
= extractFloat128Exp( a
);
6602 aSign
= extractFloat128Sign( a
);
6603 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6604 shiftCount
= 0x402F - aExp
;
6605 if ( shiftCount
<= 0 ) {
6606 if ( 0x403E < aExp
) {
6607 float_raise(float_flag_invalid
, status
);
6609 || ( ( aExp
== 0x7FFF )
6610 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6617 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6620 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6622 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6626 /*----------------------------------------------------------------------------
6627 | Returns the result of converting the quadruple-precision floating-point
6628 | value `a' to the 64-bit two's complement integer format. The conversion
6629 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6630 | Arithmetic, except that the conversion is always rounded toward zero.
6631 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6632 | the conversion overflows, the largest integer with the same sign as `a' is
6634 *----------------------------------------------------------------------------*/
6636 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6639 int32_t aExp
, shiftCount
;
6640 uint64_t aSig0
, aSig1
;
6643 aSig1
= extractFloat128Frac1( a
);
6644 aSig0
= extractFloat128Frac0( a
);
6645 aExp
= extractFloat128Exp( a
);
6646 aSign
= extractFloat128Sign( a
);
6647 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6648 shiftCount
= aExp
- 0x402F;
6649 if ( 0 < shiftCount
) {
6650 if ( 0x403E <= aExp
) {
6651 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6652 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6653 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6655 float_raise(float_flag_inexact
, status
);
6659 float_raise(float_flag_invalid
, status
);
6660 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6666 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6667 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6668 float_raise(float_flag_inexact
, status
);
6672 if ( aExp
< 0x3FFF ) {
6673 if ( aExp
| aSig0
| aSig1
) {
6674 float_raise(float_flag_inexact
, status
);
6678 z
= aSig0
>>( - shiftCount
);
6680 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6681 float_raise(float_flag_inexact
, status
);
6684 if ( aSign
) z
= - z
;
6689 /*----------------------------------------------------------------------------
6690 | Returns the result of converting the quadruple-precision floating-point value
6691 | `a' to the 64-bit unsigned integer format. The conversion is
6692 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6693 | Arithmetic---which means in particular that the conversion is rounded
6694 | according to the current rounding mode. If `a' is a NaN, the largest
6695 | positive integer is returned. If the conversion overflows, the
6696 | largest unsigned integer is returned. If 'a' is negative, the value is
6697 | rounded and zero is returned; negative values that do not round to zero
6698 | will raise the inexact exception.
6699 *----------------------------------------------------------------------------*/
6701 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6706 uint64_t aSig0
, aSig1
;
6708 aSig0
= extractFloat128Frac0(a
);
6709 aSig1
= extractFloat128Frac1(a
);
6710 aExp
= extractFloat128Exp(a
);
6711 aSign
= extractFloat128Sign(a
);
6712 if (aSign
&& (aExp
> 0x3FFE)) {
6713 float_raise(float_flag_invalid
, status
);
6714 if (float128_is_any_nan(a
)) {
6721 aSig0
|= UINT64_C(0x0001000000000000);
6723 shiftCount
= 0x402F - aExp
;
6724 if (shiftCount
<= 0) {
6725 if (0x403E < aExp
) {
6726 float_raise(float_flag_invalid
, status
);
6729 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6731 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6733 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6736 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6739 signed char current_rounding_mode
= status
->float_rounding_mode
;
6741 set_float_rounding_mode(float_round_to_zero
, status
);
6742 v
= float128_to_uint64(a
, status
);
6743 set_float_rounding_mode(current_rounding_mode
, status
);
6748 /*----------------------------------------------------------------------------
6749 | Returns the result of converting the quadruple-precision floating-point
6750 | value `a' to the 32-bit unsigned integer format. The conversion
6751 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6752 | Arithmetic except that the conversion is always rounded toward zero.
6753 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6754 | if the conversion overflows, the largest unsigned integer is returned.
6755 | If 'a' is negative, the value is rounded and zero is returned; negative
6756 | values that do not round to zero will raise the inexact exception.
6757 *----------------------------------------------------------------------------*/
6759 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6763 int old_exc_flags
= get_float_exception_flags(status
);
6765 v
= float128_to_uint64_round_to_zero(a
, status
);
6766 if (v
> 0xffffffff) {
6771 set_float_exception_flags(old_exc_flags
, status
);
6772 float_raise(float_flag_invalid
, status
);
6776 /*----------------------------------------------------------------------------
6777 | Returns the result of converting the quadruple-precision floating-point value
6778 | `a' to the 32-bit unsigned integer format. The conversion is
6779 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6780 | Arithmetic---which means in particular that the conversion is rounded
6781 | according to the current rounding mode. If `a' is a NaN, the largest
6782 | positive integer is returned. If the conversion overflows, the
6783 | largest unsigned integer is returned. If 'a' is negative, the value is
6784 | rounded and zero is returned; negative values that do not round to zero
6785 | will raise the inexact exception.
6786 *----------------------------------------------------------------------------*/
6788 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6792 int old_exc_flags
= get_float_exception_flags(status
);
6794 v
= float128_to_uint64(a
, status
);
6795 if (v
> 0xffffffff) {
6800 set_float_exception_flags(old_exc_flags
, status
);
6801 float_raise(float_flag_invalid
, status
);
6805 /*----------------------------------------------------------------------------
6806 | Returns the result of converting the quadruple-precision floating-point
6807 | value `a' to the single-precision floating-point format. The conversion
6808 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6810 *----------------------------------------------------------------------------*/
6812 float32
float128_to_float32(float128 a
, float_status
*status
)
6816 uint64_t aSig0
, aSig1
;
6819 aSig1
= extractFloat128Frac1( a
);
6820 aSig0
= extractFloat128Frac0( a
);
6821 aExp
= extractFloat128Exp( a
);
6822 aSign
= extractFloat128Sign( a
);
6823 if ( aExp
== 0x7FFF ) {
6824 if ( aSig0
| aSig1
) {
6825 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6827 return packFloat32( aSign
, 0xFF, 0 );
6829 aSig0
|= ( aSig1
!= 0 );
6830 shift64RightJamming( aSig0
, 18, &aSig0
);
6832 if ( aExp
|| zSig
) {
6836 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6840 /*----------------------------------------------------------------------------
6841 | Returns the result of converting the quadruple-precision floating-point
6842 | value `a' to the double-precision floating-point format. The conversion
6843 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
6845 *----------------------------------------------------------------------------*/
6847 float64
float128_to_float64(float128 a
, float_status
*status
)
6851 uint64_t aSig0
, aSig1
;
6853 aSig1
= extractFloat128Frac1( a
);
6854 aSig0
= extractFloat128Frac0( a
);
6855 aExp
= extractFloat128Exp( a
);
6856 aSign
= extractFloat128Sign( a
);
6857 if ( aExp
== 0x7FFF ) {
6858 if ( aSig0
| aSig1
) {
6859 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6861 return packFloat64( aSign
, 0x7FF, 0 );
6863 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6864 aSig0
|= ( aSig1
!= 0 );
6865 if ( aExp
|| aSig0
) {
6866 aSig0
|= UINT64_C(0x4000000000000000);
6869 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6873 /*----------------------------------------------------------------------------
6874 | Returns the result of converting the quadruple-precision floating-point
6875 | value `a' to the extended double-precision floating-point format. The
6876 | conversion is performed according to the IEC/IEEE Standard for Binary
6877 | Floating-Point Arithmetic.
6878 *----------------------------------------------------------------------------*/
6880 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6884 uint64_t aSig0
, aSig1
;
6886 aSig1
= extractFloat128Frac1( a
);
6887 aSig0
= extractFloat128Frac0( a
);
6888 aExp
= extractFloat128Exp( a
);
6889 aSign
= extractFloat128Sign( a
);
6890 if ( aExp
== 0x7FFF ) {
6891 if ( aSig0
| aSig1
) {
6892 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6894 return floatx80_silence_nan(res
, status
);
6896 return packFloatx80(aSign
, floatx80_infinity_high
,
6897 floatx80_infinity_low
);
6900 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6901 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6904 aSig0
|= UINT64_C(0x0001000000000000);
6906 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6907 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6911 /*----------------------------------------------------------------------------
6912 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6913 | returns the result as a quadruple-precision floating-point value. The
6914 | operation is performed according to the IEC/IEEE Standard for Binary
6915 | Floating-Point Arithmetic.
6916 *----------------------------------------------------------------------------*/
6918 float128
float128_round_to_int(float128 a
, float_status
*status
)
6922 uint64_t lastBitMask
, roundBitsMask
;
6925 aExp
= extractFloat128Exp( a
);
6926 if ( 0x402F <= aExp
) {
6927 if ( 0x406F <= aExp
) {
6928 if ( ( aExp
== 0x7FFF )
6929 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6931 return propagateFloat128NaN(a
, a
, status
);
6936 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6937 roundBitsMask
= lastBitMask
- 1;
6939 switch (status
->float_rounding_mode
) {
6940 case float_round_nearest_even
:
6941 if ( lastBitMask
) {
6942 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6943 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6946 if ( (int64_t) z
.low
< 0 ) {
6948 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6952 case float_round_ties_away
:
6954 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6956 if ((int64_t) z
.low
< 0) {
6961 case float_round_to_zero
:
6963 case float_round_up
:
6964 if (!extractFloat128Sign(z
)) {
6965 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6968 case float_round_down
:
6969 if (extractFloat128Sign(z
)) {
6970 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6973 case float_round_to_odd
:
6975 * Note that if lastBitMask == 0, the last bit is the lsb
6976 * of high, and roundBitsMask == -1.
6978 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6979 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6985 z
.low
&= ~ roundBitsMask
;
6988 if ( aExp
< 0x3FFF ) {
6989 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6990 float_raise(float_flag_inexact
, status
);
6991 aSign
= extractFloat128Sign( a
);
6992 switch (status
->float_rounding_mode
) {
6993 case float_round_nearest_even
:
6994 if ( ( aExp
== 0x3FFE )
6995 && ( extractFloat128Frac0( a
)
6996 | extractFloat128Frac1( a
) )
6998 return packFloat128( aSign
, 0x3FFF, 0, 0 );
7001 case float_round_ties_away
:
7002 if (aExp
== 0x3FFE) {
7003 return packFloat128(aSign
, 0x3FFF, 0, 0);
7006 case float_round_down
:
7008 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
7009 : packFloat128( 0, 0, 0, 0 );
7010 case float_round_up
:
7012 aSign
? packFloat128( 1, 0, 0, 0 )
7013 : packFloat128( 0, 0x3FFF, 0, 0 );
7015 case float_round_to_odd
:
7016 return packFloat128(aSign
, 0x3FFF, 0, 0);
7018 case float_round_to_zero
:
7021 return packFloat128( aSign
, 0, 0, 0 );
7024 lastBitMask
<<= 0x402F - aExp
;
7025 roundBitsMask
= lastBitMask
- 1;
7028 switch (status
->float_rounding_mode
) {
7029 case float_round_nearest_even
:
7030 z
.high
+= lastBitMask
>>1;
7031 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
7032 z
.high
&= ~ lastBitMask
;
7035 case float_round_ties_away
:
7036 z
.high
+= lastBitMask
>>1;
7038 case float_round_to_zero
:
7040 case float_round_up
:
7041 if (!extractFloat128Sign(z
)) {
7042 z
.high
|= ( a
.low
!= 0 );
7043 z
.high
+= roundBitsMask
;
7046 case float_round_down
:
7047 if (extractFloat128Sign(z
)) {
7048 z
.high
|= (a
.low
!= 0);
7049 z
.high
+= roundBitsMask
;
7052 case float_round_to_odd
:
7053 if ((z
.high
& lastBitMask
) == 0) {
7054 z
.high
|= (a
.low
!= 0);
7055 z
.high
+= roundBitsMask
;
7061 z
.high
&= ~ roundBitsMask
;
7063 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
7064 float_raise(float_flag_inexact
, status
);
7070 /*----------------------------------------------------------------------------
7071 | Returns the result of dividing the quadruple-precision floating-point value
7072 | `a' by the corresponding value `b'. The operation is performed according to
7073 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7074 *----------------------------------------------------------------------------*/
7076 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7078 bool aSign
, bSign
, zSign
;
7079 int32_t aExp
, bExp
, zExp
;
7080 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7081 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7083 aSig1
= extractFloat128Frac1( a
);
7084 aSig0
= extractFloat128Frac0( a
);
7085 aExp
= extractFloat128Exp( a
);
7086 aSign
= extractFloat128Sign( a
);
7087 bSig1
= extractFloat128Frac1( b
);
7088 bSig0
= extractFloat128Frac0( b
);
7089 bExp
= extractFloat128Exp( b
);
7090 bSign
= extractFloat128Sign( b
);
7091 zSign
= aSign
^ bSign
;
7092 if ( aExp
== 0x7FFF ) {
7093 if (aSig0
| aSig1
) {
7094 return propagateFloat128NaN(a
, b
, status
);
7096 if ( bExp
== 0x7FFF ) {
7097 if (bSig0
| bSig1
) {
7098 return propagateFloat128NaN(a
, b
, status
);
7102 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7104 if ( bExp
== 0x7FFF ) {
7105 if (bSig0
| bSig1
) {
7106 return propagateFloat128NaN(a
, b
, status
);
7108 return packFloat128( zSign
, 0, 0, 0 );
7111 if ( ( bSig0
| bSig1
) == 0 ) {
7112 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7114 float_raise(float_flag_invalid
, status
);
7115 return float128_default_nan(status
);
7117 float_raise(float_flag_divbyzero
, status
);
7118 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7120 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7123 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7124 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7126 zExp
= aExp
- bExp
+ 0x3FFD;
7128 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7130 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7131 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7132 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7135 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7136 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7137 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7138 while ( (int64_t) rem0
< 0 ) {
7140 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7142 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7143 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7144 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7145 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7146 while ( (int64_t) rem1
< 0 ) {
7148 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7150 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7152 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7153 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7157 /*----------------------------------------------------------------------------
7158 | Returns the remainder of the quadruple-precision floating-point value `a'
7159 | with respect to the corresponding value `b'. The operation is performed
7160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7161 *----------------------------------------------------------------------------*/
7163 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7166 int32_t aExp
, bExp
, expDiff
;
7167 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7168 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7171 aSig1
= extractFloat128Frac1( a
);
7172 aSig0
= extractFloat128Frac0( a
);
7173 aExp
= extractFloat128Exp( a
);
7174 aSign
= extractFloat128Sign( a
);
7175 bSig1
= extractFloat128Frac1( b
);
7176 bSig0
= extractFloat128Frac0( b
);
7177 bExp
= extractFloat128Exp( b
);
7178 if ( aExp
== 0x7FFF ) {
7179 if ( ( aSig0
| aSig1
)
7180 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7181 return propagateFloat128NaN(a
, b
, status
);
7185 if ( bExp
== 0x7FFF ) {
7186 if (bSig0
| bSig1
) {
7187 return propagateFloat128NaN(a
, b
, status
);
7192 if ( ( bSig0
| bSig1
) == 0 ) {
7194 float_raise(float_flag_invalid
, status
);
7195 return float128_default_nan(status
);
7197 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7200 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7201 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7203 expDiff
= aExp
- bExp
;
7204 if ( expDiff
< -1 ) return a
;
7206 aSig0
| UINT64_C(0x0001000000000000),
7208 15 - ( expDiff
< 0 ),
7213 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7214 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7215 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7217 while ( 0 < expDiff
) {
7218 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7219 q
= ( 4 < q
) ? q
- 4 : 0;
7220 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7221 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7222 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7223 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7226 if ( -64 < expDiff
) {
7227 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7228 q
= ( 4 < q
) ? q
- 4 : 0;
7230 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7232 if ( expDiff
< 0 ) {
7233 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7236 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7238 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7239 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7242 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7243 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7246 alternateASig0
= aSig0
;
7247 alternateASig1
= aSig1
;
7249 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7250 } while ( 0 <= (int64_t) aSig0
);
7252 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7253 if ( ( sigMean0
< 0 )
7254 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7255 aSig0
= alternateASig0
;
7256 aSig1
= alternateASig1
;
7258 zSign
= ( (int64_t) aSig0
< 0 );
7259 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7260 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7264 /*----------------------------------------------------------------------------
7265 | Returns the square root of the quadruple-precision floating-point value `a'.
7266 | The operation is performed according to the IEC/IEEE Standard for Binary
7267 | Floating-Point Arithmetic.
7268 *----------------------------------------------------------------------------*/
7270 float128
float128_sqrt(float128 a
, float_status
*status
)
7274 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7275 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7277 aSig1
= extractFloat128Frac1( a
);
7278 aSig0
= extractFloat128Frac0( a
);
7279 aExp
= extractFloat128Exp( a
);
7280 aSign
= extractFloat128Sign( a
);
7281 if ( aExp
== 0x7FFF ) {
7282 if (aSig0
| aSig1
) {
7283 return propagateFloat128NaN(a
, a
, status
);
7285 if ( ! aSign
) return a
;
7289 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7291 float_raise(float_flag_invalid
, status
);
7292 return float128_default_nan(status
);
7295 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7296 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7298 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7299 aSig0
|= UINT64_C(0x0001000000000000);
7300 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7301 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7302 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7303 doubleZSig0
= zSig0
<<1;
7304 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7305 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7306 while ( (int64_t) rem0
< 0 ) {
7309 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7311 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7312 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7313 if ( zSig1
== 0 ) zSig1
= 1;
7314 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7315 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7316 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7317 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7318 while ( (int64_t) rem1
< 0 ) {
7320 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7322 term2
|= doubleZSig0
;
7323 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7325 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7327 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7328 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7332 static inline FloatRelation
7333 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7334 float_status
*status
)
7338 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7339 float_raise(float_flag_invalid
, status
);
7340 return float_relation_unordered
;
7342 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7343 ( extractFloatx80Frac( a
)<<1 ) ) ||
7344 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7345 ( extractFloatx80Frac( b
)<<1 ) )) {
7347 floatx80_is_signaling_nan(a
, status
) ||
7348 floatx80_is_signaling_nan(b
, status
)) {
7349 float_raise(float_flag_invalid
, status
);
7351 return float_relation_unordered
;
7353 aSign
= extractFloatx80Sign( a
);
7354 bSign
= extractFloatx80Sign( b
);
7355 if ( aSign
!= bSign
) {
7357 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7358 ( ( a
.low
| b
.low
) == 0 ) ) {
7360 return float_relation_equal
;
7362 return 1 - (2 * aSign
);
7365 /* Normalize pseudo-denormals before comparison. */
7366 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7369 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7372 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7373 return float_relation_equal
;
7375 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7380 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7382 return floatx80_compare_internal(a
, b
, 0, status
);
7385 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7386 float_status
*status
)
7388 return floatx80_compare_internal(a
, b
, 1, status
);
7391 static inline FloatRelation
7392 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7393 float_status
*status
)
7397 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7398 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7399 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7400 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7402 float128_is_signaling_nan(a
, status
) ||
7403 float128_is_signaling_nan(b
, status
)) {
7404 float_raise(float_flag_invalid
, status
);
7406 return float_relation_unordered
;
7408 aSign
= extractFloat128Sign( a
);
7409 bSign
= extractFloat128Sign( b
);
7410 if ( aSign
!= bSign
) {
7411 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7413 return float_relation_equal
;
7415 return 1 - (2 * aSign
);
7418 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7419 return float_relation_equal
;
7421 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7426 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7428 return float128_compare_internal(a
, b
, 0, status
);
7431 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7432 float_status
*status
)
7434 return float128_compare_internal(a
, b
, 1, status
);
7437 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7443 if (floatx80_invalid_encoding(a
)) {
7444 float_raise(float_flag_invalid
, status
);
7445 return floatx80_default_nan(status
);
7447 aSig
= extractFloatx80Frac( a
);
7448 aExp
= extractFloatx80Exp( a
);
7449 aSign
= extractFloatx80Sign( a
);
7451 if ( aExp
== 0x7FFF ) {
7453 return propagateFloatx80NaN(a
, a
, status
);
7467 } else if (n
< -0x10000) {
7472 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7473 aSign
, aExp
, aSig
, 0, status
);
7476 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7480 uint64_t aSig0
, aSig1
;
7482 aSig1
= extractFloat128Frac1( a
);
7483 aSig0
= extractFloat128Frac0( a
);
7484 aExp
= extractFloat128Exp( a
);
7485 aSign
= extractFloat128Sign( a
);
7486 if ( aExp
== 0x7FFF ) {
7487 if ( aSig0
| aSig1
) {
7488 return propagateFloat128NaN(a
, a
, status
);
7493 aSig0
|= UINT64_C(0x0001000000000000);
7494 } else if (aSig0
== 0 && aSig1
== 0) {
7502 } else if (n
< -0x10000) {
7507 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7512 static void __attribute__((constructor
)) softfloat_init(void)
7514 union_float64 ua
, ub
, uc
, ur
;
7516 if (QEMU_NO_HARDFLOAT
) {
7520 * Test that the host's FMA is not obviously broken. For example,
7521 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7522 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7524 ua
.s
= 0x0020000000000001ULL
;
7525 ub
.s
= 0x3ca0000000000000ULL
;
7526 uc
.s
= 0x0020000000000000ULL
;
7527 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7528 if (ur
.s
!= 0x0020000000000001ULL
) {
7529 force_soft_fma
= true;