4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    inexact-flag implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
472 #define float_cmask(bit) (1u << (bit))
475 float_cmask_zero
= float_cmask(float_class_zero
),
476 float_cmask_normal
= float_cmask(float_class_normal
),
477 float_cmask_inf
= float_cmask(float_class_inf
),
478 float_cmask_qnan
= float_cmask(float_class_qnan
),
479 float_cmask_snan
= float_cmask(float_class_snan
),
481 float_cmask_infzero
= float_cmask_zero
| float_cmask_inf
,
482 float_cmask_anynan
= float_cmask_qnan
| float_cmask_snan
,
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
503 * Structure holding all of the decomposed parts of a float.
504 * The exponent is unbiased and the fraction is normalized.
506 * The fraction words are stored in big-endian word ordering,
507 * so that truncation from a larger format to a smaller format
508 * can be done simply by ignoring subsequent elements.
516 /* Routines that know the structure may reference the singular name. */
519 * Routines expanded with multiple structures reference "hi" and "lo"
520 * depending on the operation. In FloatParts64, "hi" and "lo" are
521 * both the same word and aliased here.
541 uint64_t frac_hm
; /* high-middle */
542 uint64_t frac_lm
; /* low-middle */
546 /* These apply to the most significant word of each FloatPartsN. */
547 #define DECOMPOSED_BINARY_POINT 63
548 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
550 /* Structure holding all of the relevant parameters for a format.
551 * exp_size: the size of the exponent field
552 * exp_bias: the offset applied to the exponent field
553 * exp_max: the maximum normalised exponent
554 * frac_size: the size of the fraction field
555 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
556 * The following are computed based the size of fraction
557 * frac_lsb: least significant bit of fraction
558 * frac_lsbm1: the bit below the least significant bit (for rounding)
559 * round_mask/roundeven_mask: masks used for rounding
560 * The following optional modifiers are available:
561 * arm_althp: handle ARM Alternative Half Precision
572 uint64_t roundeven_mask
;
576 /* Expand fields based on the size of exponent and fraction */
577 #define FLOAT_PARAMS(E, F) \
579 .exp_bias = ((1 << E) - 1) >> 1, \
580 .exp_max = (1 << E) - 1, \
582 .frac_shift = (-F - 1) & 63, \
583 .frac_lsb = 1ull << ((-F - 1) & 63), \
584 .frac_lsbm1 = 1ull << ((-F - 2) & 63), \
585 .round_mask = (1ull << ((-F - 1) & 63)) - 1, \
586 .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
588 static const FloatFmt float16_params
= {
592 static const FloatFmt float16_params_ahp
= {
597 static const FloatFmt bfloat16_params
= {
601 static const FloatFmt float32_params
= {
605 static const FloatFmt float64_params
= {
609 static const FloatFmt float128_params
= {
610 FLOAT_PARAMS(15, 112)
613 /* Unpack a float to parts, but do not canonicalize. */
614 static void unpack_raw64(FloatParts64
*r
, const FloatFmt
*fmt
, uint64_t raw
)
616 const int f_size
= fmt
->frac_size
;
617 const int e_size
= fmt
->exp_size
;
619 *r
= (FloatParts64
) {
620 .cls
= float_class_unclassified
,
621 .sign
= extract64(raw
, f_size
+ e_size
, 1),
622 .exp
= extract64(raw
, f_size
, e_size
),
623 .frac
= extract64(raw
, 0, f_size
)
627 static inline void float16_unpack_raw(FloatParts64
*p
, float16 f
)
629 unpack_raw64(p
, &float16_params
, f
);
632 static inline void bfloat16_unpack_raw(FloatParts64
*p
, bfloat16 f
)
634 unpack_raw64(p
, &bfloat16_params
, f
);
637 static inline void float32_unpack_raw(FloatParts64
*p
, float32 f
)
639 unpack_raw64(p
, &float32_params
, f
);
642 static inline void float64_unpack_raw(FloatParts64
*p
, float64 f
)
644 unpack_raw64(p
, &float64_params
, f
);
647 static void float128_unpack_raw(FloatParts128
*p
, float128 f
)
649 const int f_size
= float128_params
.frac_size
- 64;
650 const int e_size
= float128_params
.exp_size
;
652 *p
= (FloatParts128
) {
653 .cls
= float_class_unclassified
,
654 .sign
= extract64(f
.high
, f_size
+ e_size
, 1),
655 .exp
= extract64(f
.high
, f_size
, e_size
),
656 .frac_hi
= extract64(f
.high
, 0, f_size
),
661 /* Pack a float from parts, but do not canonicalize. */
662 static uint64_t pack_raw64(const FloatParts64
*p
, const FloatFmt
*fmt
)
664 const int f_size
= fmt
->frac_size
;
665 const int e_size
= fmt
->exp_size
;
668 ret
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
669 ret
= deposit64(ret
, f_size
, e_size
, p
->exp
);
670 ret
= deposit64(ret
, 0, f_size
, p
->frac
);
674 static inline float16
float16_pack_raw(const FloatParts64
*p
)
676 return make_float16(pack_raw64(p
, &float16_params
));
679 static inline bfloat16
bfloat16_pack_raw(const FloatParts64
*p
)
681 return pack_raw64(p
, &bfloat16_params
);
684 static inline float32
float32_pack_raw(const FloatParts64
*p
)
686 return make_float32(pack_raw64(p
, &float32_params
));
689 static inline float64
float64_pack_raw(const FloatParts64
*p
)
691 return make_float64(pack_raw64(p
, &float64_params
));
694 static float128
float128_pack_raw(const FloatParts128
*p
)
696 const int f_size
= float128_params
.frac_size
- 64;
697 const int e_size
= float128_params
.exp_size
;
700 hi
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
701 hi
= deposit64(hi
, f_size
, e_size
, p
->exp
);
702 hi
= deposit64(hi
, 0, f_size
, p
->frac_hi
);
703 return make_float128(hi
, p
->frac_lo
);
706 /*----------------------------------------------------------------------------
707 | Functions and definitions to determine: (1) whether tininess for underflow
708 | is detected before or after rounding by default, (2) what (if anything)
709 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
710 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
711 | are propagated from function inputs to output. These details are target-
713 *----------------------------------------------------------------------------*/
714 #include "softfloat-specialize.c.inc"
716 #define PARTS_GENERIC_64_128(NAME, P) \
717 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
719 #define PARTS_GENERIC_64_128_256(NAME, P) \
720 QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
721 (FloatParts128 *, parts128_##NAME), parts64_##NAME)
723 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
724 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)
726 static void parts64_return_nan(FloatParts64
*a
, float_status
*s
);
727 static void parts128_return_nan(FloatParts128
*a
, float_status
*s
);
729 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)
731 static FloatParts64
*parts64_pick_nan(FloatParts64
*a
, FloatParts64
*b
,
733 static FloatParts128
*parts128_pick_nan(FloatParts128
*a
, FloatParts128
*b
,
736 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
738 static FloatParts64
*parts64_pick_nan_muladd(FloatParts64
*a
, FloatParts64
*b
,
739 FloatParts64
*c
, float_status
*s
,
740 int ab_mask
, int abc_mask
);
741 static FloatParts128
*parts128_pick_nan_muladd(FloatParts128
*a
,
745 int ab_mask
, int abc_mask
);
747 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
748 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
750 static void parts64_canonicalize(FloatParts64
*p
, float_status
*status
,
751 const FloatFmt
*fmt
);
752 static void parts128_canonicalize(FloatParts128
*p
, float_status
*status
,
753 const FloatFmt
*fmt
);
755 #define parts_canonicalize(A, S, F) \
756 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
758 static void parts64_uncanon(FloatParts64
*p
, float_status
*status
,
759 const FloatFmt
*fmt
);
760 static void parts128_uncanon(FloatParts128
*p
, float_status
*status
,
761 const FloatFmt
*fmt
);
763 #define parts_uncanon(A, S, F) \
764 PARTS_GENERIC_64_128(uncanon, A)(A, S, F)
766 static void parts64_add_normal(FloatParts64
*a
, FloatParts64
*b
);
767 static void parts128_add_normal(FloatParts128
*a
, FloatParts128
*b
);
768 static void parts256_add_normal(FloatParts256
*a
, FloatParts256
*b
);
770 #define parts_add_normal(A, B) \
771 PARTS_GENERIC_64_128_256(add_normal, A)(A, B)
773 static bool parts64_sub_normal(FloatParts64
*a
, FloatParts64
*b
);
774 static bool parts128_sub_normal(FloatParts128
*a
, FloatParts128
*b
);
775 static bool parts256_sub_normal(FloatParts256
*a
, FloatParts256
*b
);
777 #define parts_sub_normal(A, B) \
778 PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)
780 static FloatParts64
*parts64_addsub(FloatParts64
*a
, FloatParts64
*b
,
781 float_status
*s
, bool subtract
);
782 static FloatParts128
*parts128_addsub(FloatParts128
*a
, FloatParts128
*b
,
783 float_status
*s
, bool subtract
);
785 #define parts_addsub(A, B, S, Z) \
786 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)
788 static FloatParts64
*parts64_mul(FloatParts64
*a
, FloatParts64
*b
,
790 static FloatParts128
*parts128_mul(FloatParts128
*a
, FloatParts128
*b
,
793 #define parts_mul(A, B, S) \
794 PARTS_GENERIC_64_128(mul, A)(A, B, S)
796 static FloatParts64
*parts64_muladd(FloatParts64
*a
, FloatParts64
*b
,
797 FloatParts64
*c
, int flags
,
799 static FloatParts128
*parts128_muladd(FloatParts128
*a
, FloatParts128
*b
,
800 FloatParts128
*c
, int flags
,
803 #define parts_muladd(A, B, C, Z, S) \
804 PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)
806 static FloatParts64
*parts64_div(FloatParts64
*a
, FloatParts64
*b
,
808 static FloatParts128
*parts128_div(FloatParts128
*a
, FloatParts128
*b
,
811 #define parts_div(A, B, S) \
812 PARTS_GENERIC_64_128(div, A)(A, B, S)
814 static bool parts64_round_to_int_normal(FloatParts64
*a
, FloatRoundMode rm
,
815 int scale
, int frac_size
);
816 static bool parts128_round_to_int_normal(FloatParts128
*a
, FloatRoundMode r
,
817 int scale
, int frac_size
);
819 #define parts_round_to_int_normal(A, R, C, F) \
820 PARTS_GENERIC_64_128(round_to_int_normal, A)(A, R, C, F)
822 static void parts64_round_to_int(FloatParts64
*a
, FloatRoundMode rm
,
823 int scale
, float_status
*s
,
824 const FloatFmt
*fmt
);
825 static void parts128_round_to_int(FloatParts128
*a
, FloatRoundMode r
,
826 int scale
, float_status
*s
,
827 const FloatFmt
*fmt
);
829 #define parts_round_to_int(A, R, C, S, F) \
830 PARTS_GENERIC_64_128(round_to_int, A)(A, R, C, S, F)
832 static int64_t parts64_float_to_sint(FloatParts64
*p
, FloatRoundMode rmode
,
833 int scale
, int64_t min
, int64_t max
,
835 static int64_t parts128_float_to_sint(FloatParts128
*p
, FloatRoundMode rmode
,
836 int scale
, int64_t min
, int64_t max
,
839 #define parts_float_to_sint(P, R, Z, MN, MX, S) \
840 PARTS_GENERIC_64_128(float_to_sint, P)(P, R, Z, MN, MX, S)
842 static uint64_t parts64_float_to_uint(FloatParts64
*p
, FloatRoundMode rmode
,
843 int scale
, uint64_t max
,
845 static uint64_t parts128_float_to_uint(FloatParts128
*p
, FloatRoundMode rmode
,
846 int scale
, uint64_t max
,
849 #define parts_float_to_uint(P, R, Z, M, S) \
850 PARTS_GENERIC_64_128(float_to_uint, P)(P, R, Z, M, S)
853 * Helper functions for softfloat-parts.c.inc, per-size operations.
856 #define FRAC_GENERIC_64_128(NAME, P) \
857 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
859 #define FRAC_GENERIC_64_128_256(NAME, P) \
860 QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
861 (FloatParts128 *, frac128_##NAME), frac64_##NAME)
863 static bool frac64_add(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
865 return uadd64_overflow(a
->frac
, b
->frac
, &r
->frac
);
868 static bool frac128_add(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
871 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
872 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
876 static bool frac256_add(FloatParts256
*r
, FloatParts256
*a
, FloatParts256
*b
)
879 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
880 r
->frac_lm
= uadd64_carry(a
->frac_lm
, b
->frac_lm
, &c
);
881 r
->frac_hm
= uadd64_carry(a
->frac_hm
, b
->frac_hm
, &c
);
882 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
886 #define frac_add(R, A, B) FRAC_GENERIC_64_128_256(add, R)(R, A, B)
888 static bool frac64_addi(FloatParts64
*r
, FloatParts64
*a
, uint64_t c
)
890 return uadd64_overflow(a
->frac
, c
, &r
->frac
);
893 static bool frac128_addi(FloatParts128
*r
, FloatParts128
*a
, uint64_t c
)
895 c
= uadd64_overflow(a
->frac_lo
, c
, &r
->frac_lo
);
896 return uadd64_overflow(a
->frac_hi
, c
, &r
->frac_hi
);
899 #define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C)
901 static void frac64_allones(FloatParts64
*a
)
906 static void frac128_allones(FloatParts128
*a
)
908 a
->frac_hi
= a
->frac_lo
= -1;
911 #define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A)
913 static int frac64_cmp(FloatParts64
*a
, FloatParts64
*b
)
915 return a
->frac
== b
->frac
? 0 : a
->frac
< b
->frac
? -1 : 1;
918 static int frac128_cmp(FloatParts128
*a
, FloatParts128
*b
)
920 uint64_t ta
= a
->frac_hi
, tb
= b
->frac_hi
;
922 ta
= a
->frac_lo
, tb
= b
->frac_lo
;
927 return ta
< tb
? -1 : 1;
930 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
932 static void frac64_clear(FloatParts64
*a
)
937 static void frac128_clear(FloatParts128
*a
)
939 a
->frac_hi
= a
->frac_lo
= 0;
942 #define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)
944 static bool frac64_div(FloatParts64
*a
, FloatParts64
*b
)
946 uint64_t n1
, n0
, r
, q
;
950 * We want a 2*N / N-bit division to produce exactly an N-bit
951 * result, so that we do not lose any precision and so that we
952 * do not have to renormalize afterward. If A.frac < B.frac,
953 * then division would produce an (N-1)-bit result; shift A left
954 * by one to produce the an N-bit result, and return true to
955 * decrement the exponent to match.
957 * The udiv_qrnnd algorithm that we're using requires normalization,
958 * i.e. the msb of the denominator must be set, which is already true.
960 ret
= a
->frac
< b
->frac
;
968 q
= udiv_qrnnd(&r
, n0
, n1
, b
->frac
);
970 /* Set lsb if there is a remainder, to set inexact. */
971 a
->frac
= q
| (r
!= 0);
976 static bool frac128_div(FloatParts128
*a
, FloatParts128
*b
)
978 uint64_t q0
, q1
, a0
, a1
, b0
, b1
;
979 uint64_t r0
, r1
, r2
, r3
, t0
, t1
, t2
, t3
;
982 a0
= a
->frac_hi
, a1
= a
->frac_lo
;
983 b0
= b
->frac_hi
, b1
= b
->frac_lo
;
985 ret
= lt128(a0
, a1
, b0
, b1
);
987 a1
= shr_double(a0
, a1
, 1);
991 /* Use 128/64 -> 64 division as estimate for 192/128 -> 128 division. */
992 q0
= estimateDiv128To64(a0
, a1
, b0
);
995 * Estimate is high because B1 was not included (unless B1 == 0).
996 * Reduce quotient and increase remainder until remainder is non-negative.
997 * This loop will execute 0 to 2 times.
999 mul128By64To192(b0
, b1
, q0
, &t0
, &t1
, &t2
);
1000 sub192(a0
, a1
, 0, t0
, t1
, t2
, &r0
, &r1
, &r2
);
1003 add192(r0
, r1
, r2
, 0, b0
, b1
, &r0
, &r1
, &r2
);
1006 /* Repeat using the remainder, producing a second word of quotient. */
1007 q1
= estimateDiv128To64(r1
, r2
, b0
);
1008 mul128By64To192(b0
, b1
, q1
, &t1
, &t2
, &t3
);
1009 sub192(r1
, r2
, 0, t1
, t2
, t3
, &r1
, &r2
, &r3
);
1012 add192(r1
, r2
, r3
, 0, b0
, b1
, &r1
, &r2
, &r3
);
1015 /* Any remainder indicates inexact; set sticky bit. */
1016 q1
|= (r2
| r3
) != 0;
1023 #define frac_div(A, B) FRAC_GENERIC_64_128(div, A)(A, B)
1025 static bool frac64_eqz(FloatParts64
*a
)
1027 return a
->frac
== 0;
1030 static bool frac128_eqz(FloatParts128
*a
)
1032 return (a
->frac_hi
| a
->frac_lo
) == 0;
1035 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
1037 static void frac64_mulw(FloatParts128
*r
, FloatParts64
*a
, FloatParts64
*b
)
1039 mulu64(&r
->frac_lo
, &r
->frac_hi
, a
->frac
, b
->frac
);
1042 static void frac128_mulw(FloatParts256
*r
, FloatParts128
*a
, FloatParts128
*b
)
1044 mul128To256(a
->frac_hi
, a
->frac_lo
, b
->frac_hi
, b
->frac_lo
,
1045 &r
->frac_hi
, &r
->frac_hm
, &r
->frac_lm
, &r
->frac_lo
);
1048 #define frac_mulw(R, A, B) FRAC_GENERIC_64_128(mulw, A)(R, A, B)
1050 static void frac64_neg(FloatParts64
*a
)
1055 static void frac128_neg(FloatParts128
*a
)
1058 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
1059 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
1062 static void frac256_neg(FloatParts256
*a
)
1065 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
1066 a
->frac_lm
= usub64_borrow(0, a
->frac_lm
, &c
);
1067 a
->frac_hm
= usub64_borrow(0, a
->frac_hm
, &c
);
1068 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
1071 #define frac_neg(A) FRAC_GENERIC_64_128_256(neg, A)(A)
1073 static int frac64_normalize(FloatParts64
*a
)
1076 int shift
= clz64(a
->frac
);
1083 static int frac128_normalize(FloatParts128
*a
)
1086 int shl
= clz64(a
->frac_hi
);
1087 a
->frac_hi
= shl_double(a
->frac_hi
, a
->frac_lo
, shl
);
1090 } else if (a
->frac_lo
) {
1091 int shl
= clz64(a
->frac_lo
);
1092 a
->frac_hi
= a
->frac_lo
<< shl
;
1099 static int frac256_normalize(FloatParts256
*a
)
1101 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_hm
;
1102 uint64_t a2
= a
->frac_lm
, a3
= a
->frac_lo
;
1114 a0
= a1
, a1
= a2
, a2
= a3
, a3
= 0;
1117 a0
= a2
, a1
= a3
, a2
= 0, a3
= 0;
1120 a0
= a3
, a1
= 0, a2
= 0, a3
= 0;
1123 a0
= 0, a1
= 0, a2
= 0, a3
= 0;
1133 a0
= shl_double(a0
, a1
, shl
);
1134 a1
= shl_double(a1
, a2
, shl
);
1135 a2
= shl_double(a2
, a3
, shl
);
1146 #define frac_normalize(A) FRAC_GENERIC_64_128_256(normalize, A)(A)
1148 static void frac64_shl(FloatParts64
*a
, int c
)
1153 static void frac128_shl(FloatParts128
*a
, int c
)
1155 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_lo
;
1163 a0
= shl_double(a0
, a1
, c
);
1171 #define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)
1173 static void frac64_shr(FloatParts64
*a
, int c
)
1178 static void frac128_shr(FloatParts128
*a
, int c
)
1180 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_lo
;
1188 a1
= shr_double(a0
, a1
, c
);
1196 #define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)
1198 static void frac64_shrjam(FloatParts64
*a
, int c
)
1200 uint64_t a0
= a
->frac
;
1202 if (likely(c
!= 0)) {
1203 if (likely(c
< 64)) {
1204 a0
= (a0
>> c
) | (shr_double(a0
, 0, c
) != 0);
1212 static void frac128_shrjam(FloatParts128
*a
, int c
)
1214 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_lo
;
1215 uint64_t sticky
= 0;
1217 if (unlikely(c
== 0)) {
1219 } else if (likely(c
< 64)) {
1221 } else if (likely(c
< 128)) {
1235 sticky
|= shr_double(a1
, 0, c
);
1236 a1
= shr_double(a0
, a1
, c
);
1240 a
->frac_lo
= a1
| (sticky
!= 0);
1244 static void frac256_shrjam(FloatParts256
*a
, int c
)
1246 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_hm
;
1247 uint64_t a2
= a
->frac_lm
, a3
= a
->frac_lo
;
1248 uint64_t sticky
= 0;
1250 if (unlikely(c
== 0)) {
1252 } else if (likely(c
< 64)) {
1254 } else if (likely(c
< 256)) {
1255 if (unlikely(c
& 128)) {
1257 a3
= a1
, a2
= a0
, a1
= 0, a0
= 0;
1259 if (unlikely(c
& 64)) {
1261 a3
= a2
, a2
= a1
, a1
= a0
, a0
= 0;
1268 sticky
= a0
| a1
| a2
| a3
;
1269 a0
= a1
= a2
= a3
= 0;
1273 sticky
|= shr_double(a3
, 0, c
);
1274 a3
= shr_double(a2
, a3
, c
);
1275 a2
= shr_double(a1
, a2
, c
);
1276 a1
= shr_double(a0
, a1
, c
);
1280 a
->frac_lo
= a3
| (sticky
!= 0);
1286 #define frac_shrjam(A, C) FRAC_GENERIC_64_128_256(shrjam, A)(A, C)
1288 static bool frac64_sub(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
1290 return usub64_overflow(a
->frac
, b
->frac
, &r
->frac
);
1293 static bool frac128_sub(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
1296 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
1297 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
1301 static bool frac256_sub(FloatParts256
*r
, FloatParts256
*a
, FloatParts256
*b
)
1304 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
1305 r
->frac_lm
= usub64_borrow(a
->frac_lm
, b
->frac_lm
, &c
);
1306 r
->frac_hm
= usub64_borrow(a
->frac_hm
, b
->frac_hm
, &c
);
1307 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
1311 #define frac_sub(R, A, B) FRAC_GENERIC_64_128_256(sub, R)(R, A, B)
1313 static void frac64_truncjam(FloatParts64
*r
, FloatParts128
*a
)
1315 r
->frac
= a
->frac_hi
| (a
->frac_lo
!= 0);
1318 static void frac128_truncjam(FloatParts128
*r
, FloatParts256
*a
)
1320 r
->frac_hi
= a
->frac_hi
;
1321 r
->frac_lo
= a
->frac_hm
| ((a
->frac_lm
| a
->frac_lo
) != 0);
1324 #define frac_truncjam(R, A) FRAC_GENERIC_64_128(truncjam, R)(R, A)
1326 static void frac64_widen(FloatParts128
*r
, FloatParts64
*a
)
1328 r
->frac_hi
= a
->frac
;
1332 static void frac128_widen(FloatParts256
*r
, FloatParts128
*a
)
1334 r
->frac_hi
= a
->frac_hi
;
1335 r
->frac_hm
= a
->frac_lo
;
1340 #define frac_widen(A, B) FRAC_GENERIC_64_128(widen, B)(A, B)
1342 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
1343 #define FloatPartsN glue(FloatParts,N)
1344 #define FloatPartsW glue(FloatParts,W)
1349 #include "softfloat-parts-addsub.c.inc"
1350 #include "softfloat-parts.c.inc"
1357 #include "softfloat-parts-addsub.c.inc"
1358 #include "softfloat-parts.c.inc"
1364 #include "softfloat-parts-addsub.c.inc"
1373 * Pack/unpack routines with a specific FloatFmt.
1376 static void float16a_unpack_canonical(FloatParts64
*p
, float16 f
,
1377 float_status
*s
, const FloatFmt
*params
)
1379 float16_unpack_raw(p
, f
);
1380 parts_canonicalize(p
, s
, params
);
1383 static void float16_unpack_canonical(FloatParts64
*p
, float16 f
,
1386 float16a_unpack_canonical(p
, f
, s
, &float16_params
);
1389 static void bfloat16_unpack_canonical(FloatParts64
*p
, bfloat16 f
,
1392 bfloat16_unpack_raw(p
, f
);
1393 parts_canonicalize(p
, s
, &bfloat16_params
);
1396 static float16
float16a_round_pack_canonical(FloatParts64
*p
,
1398 const FloatFmt
*params
)
1400 parts_uncanon(p
, s
, params
);
1401 return float16_pack_raw(p
);
1404 static float16
float16_round_pack_canonical(FloatParts64
*p
,
1407 return float16a_round_pack_canonical(p
, s
, &float16_params
);
1410 static bfloat16
bfloat16_round_pack_canonical(FloatParts64
*p
,
1413 parts_uncanon(p
, s
, &bfloat16_params
);
1414 return bfloat16_pack_raw(p
);
1417 static void float32_unpack_canonical(FloatParts64
*p
, float32 f
,
1420 float32_unpack_raw(p
, f
);
1421 parts_canonicalize(p
, s
, &float32_params
);
1424 static float32
float32_round_pack_canonical(FloatParts64
*p
,
1427 parts_uncanon(p
, s
, &float32_params
);
1428 return float32_pack_raw(p
);
1431 static void float64_unpack_canonical(FloatParts64
*p
, float64 f
,
1434 float64_unpack_raw(p
, f
);
1435 parts_canonicalize(p
, s
, &float64_params
);
1438 static float64
float64_round_pack_canonical(FloatParts64
*p
,
1441 parts_uncanon(p
, s
, &float64_params
);
1442 return float64_pack_raw(p
);
1445 static void float128_unpack_canonical(FloatParts128
*p
, float128 f
,
1448 float128_unpack_raw(p
, f
);
1449 parts_canonicalize(p
, s
, &float128_params
);
1452 static float128
float128_round_pack_canonical(FloatParts128
*p
,
1455 parts_uncanon(p
, s
, &float128_params
);
1456 return float128_pack_raw(p
);
1460 * Addition and subtraction
1463 static float16 QEMU_FLATTEN
1464 float16_addsub(float16 a
, float16 b
, float_status
*status
, bool subtract
)
1466 FloatParts64 pa
, pb
, *pr
;
1468 float16_unpack_canonical(&pa
, a
, status
);
1469 float16_unpack_canonical(&pb
, b
, status
);
1470 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1472 return float16_round_pack_canonical(pr
, status
);
1475 float16
float16_add(float16 a
, float16 b
, float_status
*status
)
1477 return float16_addsub(a
, b
, status
, false);
1480 float16
float16_sub(float16 a
, float16 b
, float_status
*status
)
1482 return float16_addsub(a
, b
, status
, true);
1485 static float32 QEMU_SOFTFLOAT_ATTR
1486 soft_f32_addsub(float32 a
, float32 b
, float_status
*status
, bool subtract
)
1488 FloatParts64 pa
, pb
, *pr
;
1490 float32_unpack_canonical(&pa
, a
, status
);
1491 float32_unpack_canonical(&pb
, b
, status
);
1492 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1494 return float32_round_pack_canonical(pr
, status
);
1497 static float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1499 return soft_f32_addsub(a
, b
, status
, false);
1502 static float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1504 return soft_f32_addsub(a
, b
, status
, true);
1507 static float64 QEMU_SOFTFLOAT_ATTR
1508 soft_f64_addsub(float64 a
, float64 b
, float_status
*status
, bool subtract
)
1510 FloatParts64 pa
, pb
, *pr
;
1512 float64_unpack_canonical(&pa
, a
, status
);
1513 float64_unpack_canonical(&pb
, b
, status
);
1514 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1516 return float64_round_pack_canonical(pr
, status
);
1519 static float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1521 return soft_f64_addsub(a
, b
, status
, false);
1524 static float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1526 return soft_f64_addsub(a
, b
, status
, true);
1529 static float hard_f32_add(float a
, float b
)
1534 static float hard_f32_sub(float a
, float b
)
1539 static double hard_f64_add(double a
, double b
)
1544 static double hard_f64_sub(double a
, double b
)
1549 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1551 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1552 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1554 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1557 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1559 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1560 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1562 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1566 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1567 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1569 return float32_gen2(a
, b
, s
, hard
, soft
,
1570 f32_is_zon2
, f32_addsubmul_post
);
1573 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1574 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1576 return float64_gen2(a
, b
, s
, hard
, soft
,
1577 f64_is_zon2
, f64_addsubmul_post
);
1580 float32 QEMU_FLATTEN
1581 float32_add(float32 a
, float32 b
, float_status
*s
)
1583 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1586 float32 QEMU_FLATTEN
1587 float32_sub(float32 a
, float32 b
, float_status
*s
)
1589 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1592 float64 QEMU_FLATTEN
1593 float64_add(float64 a
, float64 b
, float_status
*s
)
1595 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1598 float64 QEMU_FLATTEN
1599 float64_sub(float64 a
, float64 b
, float_status
*s
)
1601 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1604 static bfloat16 QEMU_FLATTEN
1605 bfloat16_addsub(bfloat16 a
, bfloat16 b
, float_status
*status
, bool subtract
)
1607 FloatParts64 pa
, pb
, *pr
;
1609 bfloat16_unpack_canonical(&pa
, a
, status
);
1610 bfloat16_unpack_canonical(&pb
, b
, status
);
1611 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1613 return bfloat16_round_pack_canonical(pr
, status
);
1616 bfloat16
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1618 return bfloat16_addsub(a
, b
, status
, false);
1621 bfloat16
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1623 return bfloat16_addsub(a
, b
, status
, true);
1626 static float128 QEMU_FLATTEN
1627 float128_addsub(float128 a
, float128 b
, float_status
*status
, bool subtract
)
1629 FloatParts128 pa
, pb
, *pr
;
1631 float128_unpack_canonical(&pa
, a
, status
);
1632 float128_unpack_canonical(&pb
, b
, status
);
1633 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1635 return float128_round_pack_canonical(pr
, status
);
1638 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
1640 return float128_addsub(a
, b
, status
, false);
1643 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
1645 return float128_addsub(a
, b
, status
, true);
1652 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1654 FloatParts64 pa
, pb
, *pr
;
1656 float16_unpack_canonical(&pa
, a
, status
);
1657 float16_unpack_canonical(&pb
, b
, status
);
1658 pr
= parts_mul(&pa
, &pb
, status
);
1660 return float16_round_pack_canonical(pr
, status
);
1663 static float32 QEMU_SOFTFLOAT_ATTR
1664 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1666 FloatParts64 pa
, pb
, *pr
;
1668 float32_unpack_canonical(&pa
, a
, status
);
1669 float32_unpack_canonical(&pb
, b
, status
);
1670 pr
= parts_mul(&pa
, &pb
, status
);
1672 return float32_round_pack_canonical(pr
, status
);
1675 static float64 QEMU_SOFTFLOAT_ATTR
1676 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1678 FloatParts64 pa
, pb
, *pr
;
1680 float64_unpack_canonical(&pa
, a
, status
);
1681 float64_unpack_canonical(&pb
, b
, status
);
1682 pr
= parts_mul(&pa
, &pb
, status
);
1684 return float64_round_pack_canonical(pr
, status
);
1687 static float hard_f32_mul(float a
, float b
)
1692 static double hard_f64_mul(double a
, double b
)
1697 float32 QEMU_FLATTEN
1698 float32_mul(float32 a
, float32 b
, float_status
*s
)
1700 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1701 f32_is_zon2
, f32_addsubmul_post
);
1704 float64 QEMU_FLATTEN
1705 float64_mul(float64 a
, float64 b
, float_status
*s
)
1707 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1708 f64_is_zon2
, f64_addsubmul_post
);
1711 bfloat16 QEMU_FLATTEN
1712 bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1714 FloatParts64 pa
, pb
, *pr
;
1716 bfloat16_unpack_canonical(&pa
, a
, status
);
1717 bfloat16_unpack_canonical(&pb
, b
, status
);
1718 pr
= parts_mul(&pa
, &pb
, status
);
1720 return bfloat16_round_pack_canonical(pr
, status
);
1723 float128 QEMU_FLATTEN
1724 float128_mul(float128 a
, float128 b
, float_status
*status
)
1726 FloatParts128 pa
, pb
, *pr
;
1728 float128_unpack_canonical(&pa
, a
, status
);
1729 float128_unpack_canonical(&pb
, b
, status
);
1730 pr
= parts_mul(&pa
, &pb
, status
);
1732 return float128_round_pack_canonical(pr
, status
);
1736 * Fused multiply-add
1739 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1740 int flags
, float_status
*status
)
1742 FloatParts64 pa
, pb
, pc
, *pr
;
1744 float16_unpack_canonical(&pa
, a
, status
);
1745 float16_unpack_canonical(&pb
, b
, status
);
1746 float16_unpack_canonical(&pc
, c
, status
);
1747 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1749 return float16_round_pack_canonical(pr
, status
);
1752 static float32 QEMU_SOFTFLOAT_ATTR
1753 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1754 float_status
*status
)
1756 FloatParts64 pa
, pb
, pc
, *pr
;
1758 float32_unpack_canonical(&pa
, a
, status
);
1759 float32_unpack_canonical(&pb
, b
, status
);
1760 float32_unpack_canonical(&pc
, c
, status
);
1761 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1763 return float32_round_pack_canonical(pr
, status
);
1766 static float64 QEMU_SOFTFLOAT_ATTR
1767 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1768 float_status
*status
)
1770 FloatParts64 pa
, pb
, pc
, *pr
;
1772 float64_unpack_canonical(&pa
, a
, status
);
1773 float64_unpack_canonical(&pb
, b
, status
);
1774 float64_unpack_canonical(&pc
, c
, status
);
1775 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1777 return float64_round_pack_canonical(pr
, status
);
1780 static bool force_soft_fma
;
1782 float32 QEMU_FLATTEN
1783 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1785 union_float32 ua
, ub
, uc
, ur
;
1791 if (unlikely(!can_use_fpu(s
))) {
1794 if (unlikely(flags
& float_muladd_halve_result
)) {
1798 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1799 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1803 if (unlikely(force_soft_fma
)) {
1808 * When (a || b) == 0, there's no need to check for under/over flow,
1809 * since we know the addend is (normal || 0) and the product is 0.
1811 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1815 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1816 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1817 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1819 if (flags
& float_muladd_negate_c
) {
1824 union_float32 ua_orig
= ua
;
1825 union_float32 uc_orig
= uc
;
1827 if (flags
& float_muladd_negate_product
) {
1830 if (flags
& float_muladd_negate_c
) {
1834 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1836 if (unlikely(f32_is_inf(ur
))) {
1837 float_raise(float_flag_overflow
, s
);
1838 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1844 if (flags
& float_muladd_negate_result
) {
1845 return float32_chs(ur
.s
);
1850 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1853 float64 QEMU_FLATTEN
1854 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1856 union_float64 ua
, ub
, uc
, ur
;
1862 if (unlikely(!can_use_fpu(s
))) {
1865 if (unlikely(flags
& float_muladd_halve_result
)) {
1869 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1870 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1874 if (unlikely(force_soft_fma
)) {
1879 * When (a || b) == 0, there's no need to check for under/over flow,
1880 * since we know the addend is (normal || 0) and the product is 0.
1882 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1886 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1887 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1888 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1890 if (flags
& float_muladd_negate_c
) {
1895 union_float64 ua_orig
= ua
;
1896 union_float64 uc_orig
= uc
;
1898 if (flags
& float_muladd_negate_product
) {
1901 if (flags
& float_muladd_negate_c
) {
1905 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1907 if (unlikely(f64_is_inf(ur
))) {
1908 float_raise(float_flag_overflow
, s
);
1909 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1915 if (flags
& float_muladd_negate_result
) {
1916 return float64_chs(ur
.s
);
1921 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1924 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1925 int flags
, float_status
*status
)
1927 FloatParts64 pa
, pb
, pc
, *pr
;
1929 bfloat16_unpack_canonical(&pa
, a
, status
);
1930 bfloat16_unpack_canonical(&pb
, b
, status
);
1931 bfloat16_unpack_canonical(&pc
, c
, status
);
1932 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1934 return bfloat16_round_pack_canonical(pr
, status
);
1937 float128 QEMU_FLATTEN
float128_muladd(float128 a
, float128 b
, float128 c
,
1938 int flags
, float_status
*status
)
1940 FloatParts128 pa
, pb
, pc
, *pr
;
1942 float128_unpack_canonical(&pa
, a
, status
);
1943 float128_unpack_canonical(&pb
, b
, status
);
1944 float128_unpack_canonical(&pc
, c
, status
);
1945 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1947 return float128_round_pack_canonical(pr
, status
);
1954 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1956 FloatParts64 pa
, pb
, *pr
;
1958 float16_unpack_canonical(&pa
, a
, status
);
1959 float16_unpack_canonical(&pb
, b
, status
);
1960 pr
= parts_div(&pa
, &pb
, status
);
1962 return float16_round_pack_canonical(pr
, status
);
1965 static float32 QEMU_SOFTFLOAT_ATTR
1966 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1968 FloatParts64 pa
, pb
, *pr
;
1970 float32_unpack_canonical(&pa
, a
, status
);
1971 float32_unpack_canonical(&pb
, b
, status
);
1972 pr
= parts_div(&pa
, &pb
, status
);
1974 return float32_round_pack_canonical(pr
, status
);
1977 static float64 QEMU_SOFTFLOAT_ATTR
1978 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1980 FloatParts64 pa
, pb
, *pr
;
1982 float64_unpack_canonical(&pa
, a
, status
);
1983 float64_unpack_canonical(&pb
, b
, status
);
1984 pr
= parts_div(&pa
, &pb
, status
);
1986 return float64_round_pack_canonical(pr
, status
);
1989 static float hard_f32_div(float a
, float b
)
1994 static double hard_f64_div(double a
, double b
)
1999 static bool f32_div_pre(union_float32 a
, union_float32 b
)
2001 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
2002 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
2003 fpclassify(b
.h
) == FP_NORMAL
;
2005 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
2008 static bool f64_div_pre(union_float64 a
, union_float64 b
)
2010 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
2011 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
2012 fpclassify(b
.h
) == FP_NORMAL
;
2014 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
2017 static bool f32_div_post(union_float32 a
, union_float32 b
)
2019 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
2020 return fpclassify(a
.h
) != FP_ZERO
;
2022 return !float32_is_zero(a
.s
);
2025 static bool f64_div_post(union_float64 a
, union_float64 b
)
2027 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
2028 return fpclassify(a
.h
) != FP_ZERO
;
2030 return !float64_is_zero(a
.s
);
2033 float32 QEMU_FLATTEN
2034 float32_div(float32 a
, float32 b
, float_status
*s
)
2036 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
2037 f32_div_pre
, f32_div_post
);
2040 float64 QEMU_FLATTEN
2041 float64_div(float64 a
, float64 b
, float_status
*s
)
2043 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
2044 f64_div_pre
, f64_div_post
);
2047 bfloat16 QEMU_FLATTEN
2048 bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
2050 FloatParts64 pa
, pb
, *pr
;
2052 bfloat16_unpack_canonical(&pa
, a
, status
);
2053 bfloat16_unpack_canonical(&pb
, b
, status
);
2054 pr
= parts_div(&pa
, &pb
, status
);
2056 return bfloat16_round_pack_canonical(pr
, status
);
2059 float128 QEMU_FLATTEN
2060 float128_div(float128 a
, float128 b
, float_status
*status
)
2062 FloatParts128 pa
, pb
, *pr
;
2064 float128_unpack_canonical(&pa
, a
, status
);
2065 float128_unpack_canonical(&pb
, b
, status
);
2066 pr
= parts_div(&pa
, &pb
, status
);
2068 return float128_round_pack_canonical(pr
, status
);
2072 * Float to Float conversions
2074 * Returns the result of converting one float format to another. The
2075 * conversion is performed according to the IEC/IEEE Standard for
2076 * Binary Floating-Point Arithmetic.
2078 * Usually this only needs to take care of raising invalid exceptions
2079 * and handling the conversion on NaNs.
2082 static void parts_float_to_ahp(FloatParts64
*a
, float_status
*s
)
2085 case float_class_qnan
:
2086 case float_class_snan
:
2088 * There is no NaN in the destination format. Raise Invalid
2089 * and return a zero with the sign of the input NaN.
2091 float_raise(float_flag_invalid
, s
);
2092 a
->cls
= float_class_zero
;
2095 case float_class_inf
:
2097 * There is no Inf in the destination format. Raise Invalid
2098 * and return the maximum normal with the correct sign.
2100 float_raise(float_flag_invalid
, s
);
2101 a
->cls
= float_class_normal
;
2102 a
->exp
= float16_params_ahp
.exp_max
;
2103 a
->frac
= MAKE_64BIT_MASK(float16_params_ahp
.frac_shift
,
2104 float16_params_ahp
.frac_size
+ 1);
2107 case float_class_normal
:
2108 case float_class_zero
:
2112 g_assert_not_reached();
2116 static void parts64_float_to_float(FloatParts64
*a
, float_status
*s
)
2118 if (is_nan(a
->cls
)) {
2119 parts_return_nan(a
, s
);
2123 static void parts128_float_to_float(FloatParts128
*a
, float_status
*s
)
2125 if (is_nan(a
->cls
)) {
2126 parts_return_nan(a
, s
);
2130 #define parts_float_to_float(P, S) \
2131 PARTS_GENERIC_64_128(float_to_float, P)(P, S)
2133 static void parts_float_to_float_narrow(FloatParts64
*a
, FloatParts128
*b
,
2140 if (a
->cls
== float_class_normal
) {
2141 frac_truncjam(a
, b
);
2142 } else if (is_nan(a
->cls
)) {
2143 /* Discard the low bits of the NaN. */
2144 a
->frac
= b
->frac_hi
;
2145 parts_return_nan(a
, s
);
2149 static void parts_float_to_float_widen(FloatParts128
*a
, FloatParts64
*b
,
2157 if (is_nan(a
->cls
)) {
2158 parts_return_nan(a
, s
);
2162 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
2164 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2167 float16a_unpack_canonical(&p
, a
, s
, fmt16
);
2168 parts_float_to_float(&p
, s
);
2169 return float32_round_pack_canonical(&p
, s
);
2172 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
2174 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2177 float16a_unpack_canonical(&p
, a
, s
, fmt16
);
2178 parts_float_to_float(&p
, s
);
2179 return float64_round_pack_canonical(&p
, s
);
2182 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
2185 const FloatFmt
*fmt
;
2187 float32_unpack_canonical(&p
, a
, s
);
2189 parts_float_to_float(&p
, s
);
2190 fmt
= &float16_params
;
2192 parts_float_to_ahp(&p
, s
);
2193 fmt
= &float16_params_ahp
;
2195 return float16a_round_pack_canonical(&p
, s
, fmt
);
2198 static float64 QEMU_SOFTFLOAT_ATTR
2199 soft_float32_to_float64(float32 a
, float_status
*s
)
2203 float32_unpack_canonical(&p
, a
, s
);
2204 parts_float_to_float(&p
, s
);
2205 return float64_round_pack_canonical(&p
, s
);
2208 float64
float32_to_float64(float32 a
, float_status
*s
)
2210 if (likely(float32_is_normal(a
))) {
2211 /* Widening conversion can never produce inexact results. */
2217 } else if (float32_is_zero(a
)) {
2218 return float64_set_sign(float64_zero
, float32_is_neg(a
));
2220 return soft_float32_to_float64(a
, s
);
2224 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
2227 const FloatFmt
*fmt
;
2229 float64_unpack_canonical(&p
, a
, s
);
2231 parts_float_to_float(&p
, s
);
2232 fmt
= &float16_params
;
2234 parts_float_to_ahp(&p
, s
);
2235 fmt
= &float16_params_ahp
;
2237 return float16a_round_pack_canonical(&p
, s
, fmt
);
2240 float32
float64_to_float32(float64 a
, float_status
*s
)
2244 float64_unpack_canonical(&p
, a
, s
);
2245 parts_float_to_float(&p
, s
);
2246 return float32_round_pack_canonical(&p
, s
);
2249 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2253 bfloat16_unpack_canonical(&p
, a
, s
);
2254 parts_float_to_float(&p
, s
);
2255 return float32_round_pack_canonical(&p
, s
);
2258 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2262 bfloat16_unpack_canonical(&p
, a
, s
);
2263 parts_float_to_float(&p
, s
);
2264 return float64_round_pack_canonical(&p
, s
);
2267 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2271 float32_unpack_canonical(&p
, a
, s
);
2272 parts_float_to_float(&p
, s
);
2273 return bfloat16_round_pack_canonical(&p
, s
);
2276 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2280 float64_unpack_canonical(&p
, a
, s
);
2281 parts_float_to_float(&p
, s
);
2282 return bfloat16_round_pack_canonical(&p
, s
);
2285 float32
float128_to_float32(float128 a
, float_status
*s
)
2290 float128_unpack_canonical(&p128
, a
, s
);
2291 parts_float_to_float_narrow(&p64
, &p128
, s
);
2292 return float32_round_pack_canonical(&p64
, s
);
2295 float64
float128_to_float64(float128 a
, float_status
*s
)
2300 float128_unpack_canonical(&p128
, a
, s
);
2301 parts_float_to_float_narrow(&p64
, &p128
, s
);
2302 return float64_round_pack_canonical(&p64
, s
);
2305 float128
float32_to_float128(float32 a
, float_status
*s
)
2310 float32_unpack_canonical(&p64
, a
, s
);
2311 parts_float_to_float_widen(&p128
, &p64
, s
);
2312 return float128_round_pack_canonical(&p128
, s
);
2315 float128
float64_to_float128(float64 a
, float_status
*s
)
2320 float64_unpack_canonical(&p64
, a
, s
);
2321 parts_float_to_float_widen(&p128
, &p64
, s
);
2322 return float128_round_pack_canonical(&p128
, s
);
2326 * Round to integral value
2329 float16
float16_round_to_int(float16 a
, float_status
*s
)
2333 float16_unpack_canonical(&p
, a
, s
);
2334 parts_round_to_int(&p
, s
->float_rounding_mode
, 0, s
, &float16_params
);
2335 return float16_round_pack_canonical(&p
, s
);
2338 float32
float32_round_to_int(float32 a
, float_status
*s
)
2342 float32_unpack_canonical(&p
, a
, s
);
2343 parts_round_to_int(&p
, s
->float_rounding_mode
, 0, s
, &float32_params
);
2344 return float32_round_pack_canonical(&p
, s
);
2347 float64
float64_round_to_int(float64 a
, float_status
*s
)
2351 float64_unpack_canonical(&p
, a
, s
);
2352 parts_round_to_int(&p
, s
->float_rounding_mode
, 0, s
, &float64_params
);
2353 return float64_round_pack_canonical(&p
, s
);
2356 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2360 bfloat16_unpack_canonical(&p
, a
, s
);
2361 parts_round_to_int(&p
, s
->float_rounding_mode
, 0, s
, &bfloat16_params
);
2362 return bfloat16_round_pack_canonical(&p
, s
);
2365 float128
float128_round_to_int(float128 a
, float_status
*s
)
2369 float128_unpack_canonical(&p
, a
, s
);
2370 parts_round_to_int(&p
, s
->float_rounding_mode
, 0, s
, &float128_params
);
2371 return float128_round_pack_canonical(&p
, s
);
2375 * Floating-point to signed integer conversions
2378 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2383 float16_unpack_canonical(&p
, a
, s
);
2384 return parts_float_to_sint(&p
, rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2387 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2392 float16_unpack_canonical(&p
, a
, s
);
2393 return parts_float_to_sint(&p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2396 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2401 float16_unpack_canonical(&p
, a
, s
);
2402 return parts_float_to_sint(&p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2405 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2410 float16_unpack_canonical(&p
, a
, s
);
2411 return parts_float_to_sint(&p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2414 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2419 float32_unpack_canonical(&p
, a
, s
);
2420 return parts_float_to_sint(&p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2423 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2428 float32_unpack_canonical(&p
, a
, s
);
2429 return parts_float_to_sint(&p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2432 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2437 float32_unpack_canonical(&p
, a
, s
);
2438 return parts_float_to_sint(&p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2441 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2446 float64_unpack_canonical(&p
, a
, s
);
2447 return parts_float_to_sint(&p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2450 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2455 float64_unpack_canonical(&p
, a
, s
);
2456 return parts_float_to_sint(&p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2459 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2464 float64_unpack_canonical(&p
, a
, s
);
2465 return parts_float_to_sint(&p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2468 int16_t bfloat16_to_int16_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2473 bfloat16_unpack_canonical(&p
, a
, s
);
2474 return parts_float_to_sint(&p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2477 int32_t bfloat16_to_int32_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2482 bfloat16_unpack_canonical(&p
, a
, s
);
2483 return parts_float_to_sint(&p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2486 int64_t bfloat16_to_int64_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2491 bfloat16_unpack_canonical(&p
, a
, s
);
2492 return parts_float_to_sint(&p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2495 static int32_t float128_to_int32_scalbn(float128 a
, FloatRoundMode rmode
,
2496 int scale
, float_status
*s
)
2500 float128_unpack_canonical(&p
, a
, s
);
2501 return parts_float_to_sint(&p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2504 static int64_t float128_to_int64_scalbn(float128 a
, FloatRoundMode rmode
,
2505 int scale
, float_status
*s
)
2509 float128_unpack_canonical(&p
, a
, s
);
2510 return parts_float_to_sint(&p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2513 int8_t float16_to_int8(float16 a
, float_status
*s
)
2515 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2518 int16_t float16_to_int16(float16 a
, float_status
*s
)
2520 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2523 int32_t float16_to_int32(float16 a
, float_status
*s
)
2525 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2528 int64_t float16_to_int64(float16 a
, float_status
*s
)
2530 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2533 int16_t float32_to_int16(float32 a
, float_status
*s
)
2535 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2538 int32_t float32_to_int32(float32 a
, float_status
*s
)
2540 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2543 int64_t float32_to_int64(float32 a
, float_status
*s
)
2545 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2548 int16_t float64_to_int16(float64 a
, float_status
*s
)
2550 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2553 int32_t float64_to_int32(float64 a
, float_status
*s
)
2555 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2558 int64_t float64_to_int64(float64 a
, float_status
*s
)
2560 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2563 int32_t float128_to_int32(float128 a
, float_status
*s
)
2565 return float128_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2568 int64_t float128_to_int64(float128 a
, float_status
*s
)
2570 return float128_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2573 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2575 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2578 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2580 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2583 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2585 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2588 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2590 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2593 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2595 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2598 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2600 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2603 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2605 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2608 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2610 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2613 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2615 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2618 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*s
)
2620 return float128_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2623 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*s
)
2625 return float128_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2628 int16_t bfloat16_to_int16(bfloat16 a
, float_status
*s
)
2630 return bfloat16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2633 int32_t bfloat16_to_int32(bfloat16 a
, float_status
*s
)
2635 return bfloat16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2638 int64_t bfloat16_to_int64(bfloat16 a
, float_status
*s
)
2640 return bfloat16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2643 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a
, float_status
*s
)
2645 return bfloat16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2648 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a
, float_status
*s
)
2650 return bfloat16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2653 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a
, float_status
*s
)
2655 return bfloat16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2659 * Floating-point to unsigned integer conversions
2662 uint8_t float16_to_uint8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2667 float16_unpack_canonical(&p
, a
, s
);
2668 return parts_float_to_uint(&p
, rmode
, scale
, UINT8_MAX
, s
);
2671 uint16_t float16_to_uint16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2676 float16_unpack_canonical(&p
, a
, s
);
2677 return parts_float_to_uint(&p
, rmode
, scale
, UINT16_MAX
, s
);
2680 uint32_t float16_to_uint32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2685 float16_unpack_canonical(&p
, a
, s
);
2686 return parts_float_to_uint(&p
, rmode
, scale
, UINT32_MAX
, s
);
2689 uint64_t float16_to_uint64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2694 float16_unpack_canonical(&p
, a
, s
);
2695 return parts_float_to_uint(&p
, rmode
, scale
, UINT64_MAX
, s
);
2698 uint16_t float32_to_uint16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2703 float32_unpack_canonical(&p
, a
, s
);
2704 return parts_float_to_uint(&p
, rmode
, scale
, UINT16_MAX
, s
);
2707 uint32_t float32_to_uint32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2712 float32_unpack_canonical(&p
, a
, s
);
2713 return parts_float_to_uint(&p
, rmode
, scale
, UINT32_MAX
, s
);
2716 uint64_t float32_to_uint64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2721 float32_unpack_canonical(&p
, a
, s
);
2722 return parts_float_to_uint(&p
, rmode
, scale
, UINT64_MAX
, s
);
2725 uint16_t float64_to_uint16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2730 float64_unpack_canonical(&p
, a
, s
);
2731 return parts_float_to_uint(&p
, rmode
, scale
, UINT16_MAX
, s
);
2734 uint32_t float64_to_uint32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2739 float64_unpack_canonical(&p
, a
, s
);
2740 return parts_float_to_uint(&p
, rmode
, scale
, UINT32_MAX
, s
);
2743 uint64_t float64_to_uint64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2748 float64_unpack_canonical(&p
, a
, s
);
2749 return parts_float_to_uint(&p
, rmode
, scale
, UINT64_MAX
, s
);
2752 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2753 int scale
, float_status
*s
)
2757 bfloat16_unpack_canonical(&p
, a
, s
);
2758 return parts_float_to_uint(&p
, rmode
, scale
, UINT16_MAX
, s
);
2761 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2762 int scale
, float_status
*s
)
2766 bfloat16_unpack_canonical(&p
, a
, s
);
2767 return parts_float_to_uint(&p
, rmode
, scale
, UINT32_MAX
, s
);
2770 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2771 int scale
, float_status
*s
)
2775 bfloat16_unpack_canonical(&p
, a
, s
);
2776 return parts_float_to_uint(&p
, rmode
, scale
, UINT64_MAX
, s
);
2779 static uint32_t float128_to_uint32_scalbn(float128 a
, FloatRoundMode rmode
,
2780 int scale
, float_status
*s
)
2784 float128_unpack_canonical(&p
, a
, s
);
2785 return parts_float_to_uint(&p
, rmode
, scale
, UINT32_MAX
, s
);
2788 static uint64_t float128_to_uint64_scalbn(float128 a
, FloatRoundMode rmode
,
2789 int scale
, float_status
*s
)
2793 float128_unpack_canonical(&p
, a
, s
);
2794 return parts_float_to_uint(&p
, rmode
, scale
, UINT64_MAX
, s
);
2797 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2799 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2802 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2804 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2807 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2809 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2812 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2814 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2817 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2819 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2822 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2824 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2827 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2829 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2832 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2834 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2837 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2839 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2842 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2844 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2847 uint32_t float128_to_uint32(float128 a
, float_status
*s
)
2849 return float128_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2852 uint64_t float128_to_uint64(float128 a
, float_status
*s
)
2854 return float128_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2857 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2859 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2862 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2864 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2867 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2869 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2872 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2874 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2877 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2879 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2882 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2884 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2887 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2889 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2892 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2894 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2897 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2899 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2902 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*s
)
2904 return float128_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2907 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*s
)
2909 return float128_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2912 uint16_t bfloat16_to_uint16(bfloat16 a
, float_status
*s
)
2914 return bfloat16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2917 uint32_t bfloat16_to_uint32(bfloat16 a
, float_status
*s
)
2919 return bfloat16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2922 uint64_t bfloat16_to_uint64(bfloat16 a
, float_status
*s
)
2924 return bfloat16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2927 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a
, float_status
*s
)
2929 return bfloat16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2932 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a
, float_status
*s
)
2934 return bfloat16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2937 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a
, float_status
*s
)
2939 return bfloat16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2943 * Integer to float conversions
2945 * Returns the result of converting the two's complement integer `a'
2946 * to the floating-point format. The conversion is performed according
2947 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2950 static FloatParts64
int_to_float(int64_t a
, int scale
, float_status
*status
)
2952 FloatParts64 r
= { .sign
= false };
2955 r
.cls
= float_class_zero
;
2960 r
.cls
= float_class_normal
;
2966 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2968 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2969 r
.frac
= f
<< shift
;
2975 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2977 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2978 return float16_round_pack_canonical(&pa
, status
);
2981 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2983 return int64_to_float16_scalbn(a
, scale
, status
);
2986 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2988 return int64_to_float16_scalbn(a
, scale
, status
);
2991 float16
int64_to_float16(int64_t a
, float_status
*status
)
2993 return int64_to_float16_scalbn(a
, 0, status
);
2996 float16
int32_to_float16(int32_t a
, float_status
*status
)
2998 return int64_to_float16_scalbn(a
, 0, status
);
3001 float16
int16_to_float16(int16_t a
, float_status
*status
)
3003 return int64_to_float16_scalbn(a
, 0, status
);
3006 float16
int8_to_float16(int8_t a
, float_status
*status
)
3008 return int64_to_float16_scalbn(a
, 0, status
);
3011 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
3013 FloatParts64 pa
= int_to_float(a
, scale
, status
);
3014 return float32_round_pack_canonical(&pa
, status
);
3017 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
3019 return int64_to_float32_scalbn(a
, scale
, status
);
3022 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
3024 return int64_to_float32_scalbn(a
, scale
, status
);
3027 float32
int64_to_float32(int64_t a
, float_status
*status
)
3029 return int64_to_float32_scalbn(a
, 0, status
);
3032 float32
int32_to_float32(int32_t a
, float_status
*status
)
3034 return int64_to_float32_scalbn(a
, 0, status
);
3037 float32
int16_to_float32(int16_t a
, float_status
*status
)
3039 return int64_to_float32_scalbn(a
, 0, status
);
3042 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
3044 FloatParts64 pa
= int_to_float(a
, scale
, status
);
3045 return float64_round_pack_canonical(&pa
, status
);
3048 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
3050 return int64_to_float64_scalbn(a
, scale
, status
);
3053 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
3055 return int64_to_float64_scalbn(a
, scale
, status
);
3058 float64
int64_to_float64(int64_t a
, float_status
*status
)
3060 return int64_to_float64_scalbn(a
, 0, status
);
3063 float64
int32_to_float64(int32_t a
, float_status
*status
)
3065 return int64_to_float64_scalbn(a
, 0, status
);
3068 float64
int16_to_float64(int16_t a
, float_status
*status
)
3070 return int64_to_float64_scalbn(a
, 0, status
);
3074 * Returns the result of converting the two's complement integer `a'
3075 * to the bfloat16 format.
3078 bfloat16
int64_to_bfloat16_scalbn(int64_t a
, int scale
, float_status
*status
)
3080 FloatParts64 pa
= int_to_float(a
, scale
, status
);
3081 return bfloat16_round_pack_canonical(&pa
, status
);
3084 bfloat16
int32_to_bfloat16_scalbn(int32_t a
, int scale
, float_status
*status
)
3086 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3089 bfloat16
int16_to_bfloat16_scalbn(int16_t a
, int scale
, float_status
*status
)
3091 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3094 bfloat16
int64_to_bfloat16(int64_t a
, float_status
*status
)
3096 return int64_to_bfloat16_scalbn(a
, 0, status
);
3099 bfloat16
int32_to_bfloat16(int32_t a
, float_status
*status
)
3101 return int64_to_bfloat16_scalbn(a
, 0, status
);
3104 bfloat16
int16_to_bfloat16(int16_t a
, float_status
*status
)
3106 return int64_to_bfloat16_scalbn(a
, 0, status
);
3110 * Unsigned Integer to float conversions
3112 * Returns the result of converting the unsigned integer `a' to the
3113 * floating-point format. The conversion is performed according to the
3114 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3117 static FloatParts64
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
3119 FloatParts64 r
= { .sign
= false };
3123 r
.cls
= float_class_zero
;
3125 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
3127 r
.cls
= float_class_normal
;
3128 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
3129 r
.frac
= a
<< shift
;
3135 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3137 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3138 return float16_round_pack_canonical(&pa
, status
);
3141 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3143 return uint64_to_float16_scalbn(a
, scale
, status
);
3146 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3148 return uint64_to_float16_scalbn(a
, scale
, status
);
3151 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
3153 return uint64_to_float16_scalbn(a
, 0, status
);
3156 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
3158 return uint64_to_float16_scalbn(a
, 0, status
);
3161 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
3163 return uint64_to_float16_scalbn(a
, 0, status
);
3166 float16
uint8_to_float16(uint8_t a
, float_status
*status
)
3168 return uint64_to_float16_scalbn(a
, 0, status
);
3171 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
3173 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3174 return float32_round_pack_canonical(&pa
, status
);
3177 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
3179 return uint64_to_float32_scalbn(a
, scale
, status
);
3182 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
3184 return uint64_to_float32_scalbn(a
, scale
, status
);
3187 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
3189 return uint64_to_float32_scalbn(a
, 0, status
);
3192 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
3194 return uint64_to_float32_scalbn(a
, 0, status
);
3197 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
3199 return uint64_to_float32_scalbn(a
, 0, status
);
3202 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
3204 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3205 return float64_round_pack_canonical(&pa
, status
);
3208 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
3210 return uint64_to_float64_scalbn(a
, scale
, status
);
3213 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
3215 return uint64_to_float64_scalbn(a
, scale
, status
);
3218 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
3220 return uint64_to_float64_scalbn(a
, 0, status
);
3223 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
3225 return uint64_to_float64_scalbn(a
, 0, status
);
3228 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
3230 return uint64_to_float64_scalbn(a
, 0, status
);
3234 * Returns the result of converting the unsigned integer `a' to the
3238 bfloat16
uint64_to_bfloat16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3240 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3241 return bfloat16_round_pack_canonical(&pa
, status
);
3244 bfloat16
uint32_to_bfloat16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3246 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3249 bfloat16
uint16_to_bfloat16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3251 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3254 bfloat16
uint64_to_bfloat16(uint64_t a
, float_status
*status
)
3256 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3259 bfloat16
uint32_to_bfloat16(uint32_t a
, float_status
*status
)
3261 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3264 bfloat16
uint16_to_bfloat16(uint16_t a
, float_status
*status
)
3266 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3270 /* min() and max() functions. These can't be implemented as
3271 * 'compare and pick one input' because that would mishandle
3272 * NaNs and +0 vs -0.
3274 * minnum() and maxnum() functions. These are similar to the min()
3275 * and max() functions but if one of the arguments is a QNaN and
3276 * the other is numerical then the numerical argument is returned.
3277 * SNaNs will get quietened before being returned.
3278 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3279 * and maxNum() operations. min() and max() are the typical min/max
3280 * semantics provided by many CPUs which predate that specification.
3282 * minnummag() and maxnummag() functions correspond to minNumMag()
3283 * and minNumMag() from the IEEE-754 2008.
3285 static FloatParts64
minmax_floats(FloatParts64 a
, FloatParts64 b
, bool ismin
,
3286 bool ieee
, bool ismag
, float_status
*s
)
3288 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
3290 /* Takes two floating-point values `a' and `b', one of
3291 * which is a NaN, and returns the appropriate NaN
3292 * result. If either `a' or `b' is a signaling NaN,
3293 * the invalid exception is raised.
3295 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
3296 return *parts_pick_nan(&a
, &b
, s
);
3297 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
3299 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
3303 return *parts_pick_nan(&a
, &b
, s
);
3308 case float_class_normal
:
3311 case float_class_inf
:
3314 case float_class_zero
:
3318 g_assert_not_reached();
3322 case float_class_normal
:
3325 case float_class_inf
:
3328 case float_class_zero
:
3332 g_assert_not_reached();
3336 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
3337 bool a_less
= a_exp
< b_exp
;
3338 if (a_exp
== b_exp
) {
3339 a_less
= a
.frac
< b
.frac
;
3341 return a_less
^ ismin
? b
: a
;
3344 if (a
.sign
== b
.sign
) {
3345 bool a_less
= a_exp
< b_exp
;
3346 if (a_exp
== b_exp
) {
3347 a_less
= a
.frac
< b
.frac
;
3349 return a
.sign
^ a_less
^ ismin
? b
: a
;
3351 return a
.sign
^ ismin
? b
: a
;
3356 #define MINMAX(sz, name, ismin, isiee, ismag) \
3357 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3360 FloatParts64 pa, pb, pr; \
3361 float ## sz ## _unpack_canonical(&pa, a, s); \
3362 float ## sz ## _unpack_canonical(&pb, b, s); \
3363 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3364 return float ## sz ## _round_pack_canonical(&pr, s); \
3367 MINMAX(16, min
, true, false, false)
3368 MINMAX(16, minnum
, true, true, false)
3369 MINMAX(16, minnummag
, true, true, true)
3370 MINMAX(16, max
, false, false, false)
3371 MINMAX(16, maxnum
, false, true, false)
3372 MINMAX(16, maxnummag
, false, true, true)
3374 MINMAX(32, min
, true, false, false)
3375 MINMAX(32, minnum
, true, true, false)
3376 MINMAX(32, minnummag
, true, true, true)
3377 MINMAX(32, max
, false, false, false)
3378 MINMAX(32, maxnum
, false, true, false)
3379 MINMAX(32, maxnummag
, false, true, true)
3381 MINMAX(64, min
, true, false, false)
3382 MINMAX(64, minnum
, true, true, false)
3383 MINMAX(64, minnummag
, true, true, true)
3384 MINMAX(64, max
, false, false, false)
3385 MINMAX(64, maxnum
, false, true, false)
3386 MINMAX(64, maxnummag
, false, true, true)
3390 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3391 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3393 FloatParts64 pa, pb, pr; \
3394 bfloat16_unpack_canonical(&pa, a, s); \
3395 bfloat16_unpack_canonical(&pb, b, s); \
3396 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3397 return bfloat16_round_pack_canonical(&pr, s); \
3400 BF16_MINMAX(min
, true, false, false)
3401 BF16_MINMAX(minnum
, true, true, false)
3402 BF16_MINMAX(minnummag
, true, true, true)
3403 BF16_MINMAX(max
, false, false, false)
3404 BF16_MINMAX(maxnum
, false, true, false)
3405 BF16_MINMAX(maxnummag
, false, true, true)
3409 /* Floating point compare */
3410 static FloatRelation
compare_floats(FloatParts64 a
, FloatParts64 b
, bool is_quiet
,
3413 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
3415 a
.cls
== float_class_snan
||
3416 b
.cls
== float_class_snan
) {
3417 float_raise(float_flag_invalid
, s
);
3419 return float_relation_unordered
;
3422 if (a
.cls
== float_class_zero
) {
3423 if (b
.cls
== float_class_zero
) {
3424 return float_relation_equal
;
3426 return b
.sign
? float_relation_greater
: float_relation_less
;
3427 } else if (b
.cls
== float_class_zero
) {
3428 return a
.sign
? float_relation_less
: float_relation_greater
;
3431 /* The only really important thing about infinity is its sign. If
3432 * both are infinities the sign marks the smallest of the two.
3434 if (a
.cls
== float_class_inf
) {
3435 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
3436 return float_relation_equal
;
3438 return a
.sign
? float_relation_less
: float_relation_greater
;
3439 } else if (b
.cls
== float_class_inf
) {
3440 return b
.sign
? float_relation_greater
: float_relation_less
;
3443 if (a
.sign
!= b
.sign
) {
3444 return a
.sign
? float_relation_less
: float_relation_greater
;
3447 if (a
.exp
== b
.exp
) {
3448 if (a
.frac
== b
.frac
) {
3449 return float_relation_equal
;
3452 return a
.frac
> b
.frac
?
3453 float_relation_less
: float_relation_greater
;
3455 return a
.frac
> b
.frac
?
3456 float_relation_greater
: float_relation_less
;
3460 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
3462 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
3467 #define COMPARE(name, attr, sz) \
3469 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
3471 FloatParts64 pa, pb; \
3472 float ## sz ## _unpack_canonical(&pa, a, s); \
3473 float ## sz ## _unpack_canonical(&pb, b, s); \
3474 return compare_floats(pa, pb, is_quiet, s); \
3477 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
3478 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
3479 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
3483 FloatRelation
float16_compare(float16 a
, float16 b
, float_status
*s
)
3485 return soft_f16_compare(a
, b
, false, s
);
3488 FloatRelation
float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
3490 return soft_f16_compare(a
, b
, true, s
);
3493 static FloatRelation QEMU_FLATTEN
3494 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
3496 union_float32 ua
, ub
;
3501 if (QEMU_NO_HARDFLOAT
) {
3505 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
3506 if (isgreaterequal(ua
.h
, ub
.h
)) {
3507 if (isgreater(ua
.h
, ub
.h
)) {
3508 return float_relation_greater
;
3510 return float_relation_equal
;
3512 if (likely(isless(ua
.h
, ub
.h
))) {
3513 return float_relation_less
;
3515 /* The only condition remaining is unordered.
3516 * Fall through to set flags.
3519 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3522 FloatRelation
float32_compare(float32 a
, float32 b
, float_status
*s
)
3524 return f32_compare(a
, b
, false, s
);
3527 FloatRelation
float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3529 return f32_compare(a
, b
, true, s
);
3532 static FloatRelation QEMU_FLATTEN
3533 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
3535 union_float64 ua
, ub
;
3540 if (QEMU_NO_HARDFLOAT
) {
3544 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
3545 if (isgreaterequal(ua
.h
, ub
.h
)) {
3546 if (isgreater(ua
.h
, ub
.h
)) {
3547 return float_relation_greater
;
3549 return float_relation_equal
;
3551 if (likely(isless(ua
.h
, ub
.h
))) {
3552 return float_relation_less
;
3554 /* The only condition remaining is unordered.
3555 * Fall through to set flags.
3558 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3561 FloatRelation
float64_compare(float64 a
, float64 b
, float_status
*s
)
3563 return f64_compare(a
, b
, false, s
);
3566 FloatRelation
float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3568 return f64_compare(a
, b
, true, s
);
3571 static FloatRelation QEMU_FLATTEN
3572 soft_bf16_compare(bfloat16 a
, bfloat16 b
, bool is_quiet
, float_status
*s
)
3574 FloatParts64 pa
, pb
;
3576 bfloat16_unpack_canonical(&pa
, a
, s
);
3577 bfloat16_unpack_canonical(&pb
, b
, s
);
3578 return compare_floats(pa
, pb
, is_quiet
, s
);
3581 FloatRelation
bfloat16_compare(bfloat16 a
, bfloat16 b
, float_status
*s
)
3583 return soft_bf16_compare(a
, b
, false, s
);
3586 FloatRelation
bfloat16_compare_quiet(bfloat16 a
, bfloat16 b
, float_status
*s
)
3588 return soft_bf16_compare(a
, b
, true, s
);
3591 /* Multiply A by 2 raised to the power N. */
3592 static FloatParts64
scalbn_decomposed(FloatParts64 a
, int n
, float_status
*s
)
3594 if (unlikely(is_nan(a
.cls
))) {
3595 parts_return_nan(&a
, s
);
3597 if (a
.cls
== float_class_normal
) {
3598 /* The largest float type (even though not supported by FloatParts64)
3599 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3600 * still allows rounding to infinity, without allowing overflow
3601 * within the int32_t that backs FloatParts64.exp.
3603 n
= MIN(MAX(n
, -0x10000), 0x10000);
3609 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3611 FloatParts64 pa
, pr
;
3613 float16_unpack_canonical(&pa
, a
, status
);
3614 pr
= scalbn_decomposed(pa
, n
, status
);
3615 return float16_round_pack_canonical(&pr
, status
);
3618 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3620 FloatParts64 pa
, pr
;
3622 float32_unpack_canonical(&pa
, a
, status
);
3623 pr
= scalbn_decomposed(pa
, n
, status
);
3624 return float32_round_pack_canonical(&pr
, status
);
3627 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3629 FloatParts64 pa
, pr
;
3631 float64_unpack_canonical(&pa
, a
, status
);
3632 pr
= scalbn_decomposed(pa
, n
, status
);
3633 return float64_round_pack_canonical(&pr
, status
);
3636 bfloat16
bfloat16_scalbn(bfloat16 a
, int n
, float_status
*status
)
3638 FloatParts64 pa
, pr
;
3640 bfloat16_unpack_canonical(&pa
, a
, status
);
3641 pr
= scalbn_decomposed(pa
, n
, status
);
3642 return bfloat16_round_pack_canonical(&pr
, status
);
3648 * The old softfloat code did an approximation step before zeroing in
3649 * on the final result. However for simpleness we just compute the
3650 * square root by iterating down from the implicit bit to enough extra
3651 * bits to ensure we get a correctly rounded result.
3653 * This does mean however the calculation is slower than before,
3654 * especially for 64 bit floats.
3657 static FloatParts64
sqrt_float(FloatParts64 a
, float_status
*s
, const FloatFmt
*p
)
3659 uint64_t a_frac
, r_frac
, s_frac
;
3662 if (is_nan(a
.cls
)) {
3663 parts_return_nan(&a
, s
);
3666 if (a
.cls
== float_class_zero
) {
3667 return a
; /* sqrt(+-0) = +-0 */
3670 float_raise(float_flag_invalid
, s
);
3671 parts_default_nan(&a
, s
);
3674 if (a
.cls
== float_class_inf
) {
3675 return a
; /* sqrt(+inf) = +inf */
3678 assert(a
.cls
== float_class_normal
);
3680 /* We need two overflow bits at the top. Adding room for that is a
3681 * right shift. If the exponent is odd, we can discard the low bit
3682 * by multiplying the fraction by 2; that's a left shift. Combine
3683 * those and we shift right by 1 if the exponent is odd, otherwise 2.
3685 a_frac
= a
.frac
>> (2 - (a
.exp
& 1));
3688 /* Bit-by-bit computation of sqrt. */
3692 /* Iterate from implicit bit down to the 3 extra bits to compute a
3693 * properly rounded result. Remember we've inserted two more bits
3694 * at the top, so these positions are two less.
3696 bit
= DECOMPOSED_BINARY_POINT
- 2;
3697 last_bit
= MAX(p
->frac_shift
- 4, 0);
3699 uint64_t q
= 1ULL << bit
;
3700 uint64_t t_frac
= s_frac
+ q
;
3701 if (t_frac
<= a_frac
) {
3702 s_frac
= t_frac
+ q
;
3707 } while (--bit
>= last_bit
);
3709 /* Undo the right shift done above. If there is any remaining
3710 * fraction, the result is inexact. Set the sticky bit.
3712 a
.frac
= (r_frac
<< 2) + (a_frac
!= 0);
3717 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3719 FloatParts64 pa
, pr
;
3721 float16_unpack_canonical(&pa
, a
, status
);
3722 pr
= sqrt_float(pa
, status
, &float16_params
);
3723 return float16_round_pack_canonical(&pr
, status
);
3726 static float32 QEMU_SOFTFLOAT_ATTR
3727 soft_f32_sqrt(float32 a
, float_status
*status
)
3729 FloatParts64 pa
, pr
;
3731 float32_unpack_canonical(&pa
, a
, status
);
3732 pr
= sqrt_float(pa
, status
, &float32_params
);
3733 return float32_round_pack_canonical(&pr
, status
);
3736 static float64 QEMU_SOFTFLOAT_ATTR
3737 soft_f64_sqrt(float64 a
, float_status
*status
)
3739 FloatParts64 pa
, pr
;
3741 float64_unpack_canonical(&pa
, a
, status
);
3742 pr
= sqrt_float(pa
, status
, &float64_params
);
3743 return float64_round_pack_canonical(&pr
, status
);
3746 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3748 union_float32 ua
, ur
;
3751 if (unlikely(!can_use_fpu(s
))) {
3755 float32_input_flush1(&ua
.s
, s
);
3756 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3757 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3758 fpclassify(ua
.h
) == FP_ZERO
) ||
3762 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3763 float32_is_neg(ua
.s
))) {
3770 return soft_f32_sqrt(ua
.s
, s
);
3773 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3775 union_float64 ua
, ur
;
3778 if (unlikely(!can_use_fpu(s
))) {
3782 float64_input_flush1(&ua
.s
, s
);
3783 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3784 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3785 fpclassify(ua
.h
) == FP_ZERO
) ||
3789 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3790 float64_is_neg(ua
.s
))) {
3797 return soft_f64_sqrt(ua
.s
, s
);
3800 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3802 FloatParts64 pa
, pr
;
3804 bfloat16_unpack_canonical(&pa
, a
, status
);
3805 pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3806 return bfloat16_round_pack_canonical(&pr
, status
);
3809 /*----------------------------------------------------------------------------
3810 | The pattern for a default generated NaN.
3811 *----------------------------------------------------------------------------*/
3813 float16
float16_default_nan(float_status
*status
)
3817 parts_default_nan(&p
, status
);
3818 p
.frac
>>= float16_params
.frac_shift
;
3819 return float16_pack_raw(&p
);
3822 float32
float32_default_nan(float_status
*status
)
3826 parts_default_nan(&p
, status
);
3827 p
.frac
>>= float32_params
.frac_shift
;
3828 return float32_pack_raw(&p
);
3831 float64
float64_default_nan(float_status
*status
)
3835 parts_default_nan(&p
, status
);
3836 p
.frac
>>= float64_params
.frac_shift
;
3837 return float64_pack_raw(&p
);
3840 float128
float128_default_nan(float_status
*status
)
3844 parts_default_nan(&p
, status
);
3845 frac_shr(&p
, float128_params
.frac_shift
);
3846 return float128_pack_raw(&p
);
3849 bfloat16
bfloat16_default_nan(float_status
*status
)
3853 parts_default_nan(&p
, status
);
3854 p
.frac
>>= bfloat16_params
.frac_shift
;
3855 return bfloat16_pack_raw(&p
);
3858 /*----------------------------------------------------------------------------
3859 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3860 *----------------------------------------------------------------------------*/
3862 float16
float16_silence_nan(float16 a
, float_status
*status
)
3866 float16_unpack_raw(&p
, a
);
3867 p
.frac
<<= float16_params
.frac_shift
;
3868 parts_silence_nan(&p
, status
);
3869 p
.frac
>>= float16_params
.frac_shift
;
3870 return float16_pack_raw(&p
);
3873 float32
float32_silence_nan(float32 a
, float_status
*status
)
3877 float32_unpack_raw(&p
, a
);
3878 p
.frac
<<= float32_params
.frac_shift
;
3879 parts_silence_nan(&p
, status
);
3880 p
.frac
>>= float32_params
.frac_shift
;
3881 return float32_pack_raw(&p
);
3884 float64
float64_silence_nan(float64 a
, float_status
*status
)
3888 float64_unpack_raw(&p
, a
);
3889 p
.frac
<<= float64_params
.frac_shift
;
3890 parts_silence_nan(&p
, status
);
3891 p
.frac
>>= float64_params
.frac_shift
;
3892 return float64_pack_raw(&p
);
3895 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3899 bfloat16_unpack_raw(&p
, a
);
3900 p
.frac
<<= bfloat16_params
.frac_shift
;
3901 parts_silence_nan(&p
, status
);
3902 p
.frac
>>= bfloat16_params
.frac_shift
;
3903 return bfloat16_pack_raw(&p
);
3906 float128
float128_silence_nan(float128 a
, float_status
*status
)
3910 float128_unpack_raw(&p
, a
);
3911 frac_shl(&p
, float128_params
.frac_shift
);
3912 parts_silence_nan(&p
, status
);
3913 frac_shr(&p
, float128_params
.frac_shift
);
3914 return float128_pack_raw(&p
);
3917 /*----------------------------------------------------------------------------
3918 | If `a' is denormal and we are in flush-to-zero mode then set the
3919 | input-denormal exception and return zero. Otherwise just return the value.
3920 *----------------------------------------------------------------------------*/
3922 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3924 if (p
.exp
== 0 && p
.frac
!= 0) {
3925 float_raise(float_flag_input_denormal
, status
);
3932 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3934 if (status
->flush_inputs_to_zero
) {
3937 float16_unpack_raw(&p
, a
);
3938 if (parts_squash_denormal(p
, status
)) {
3939 return float16_set_sign(float16_zero
, p
.sign
);
3945 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3947 if (status
->flush_inputs_to_zero
) {
3950 float32_unpack_raw(&p
, a
);
3951 if (parts_squash_denormal(p
, status
)) {
3952 return float32_set_sign(float32_zero
, p
.sign
);
3958 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3960 if (status
->flush_inputs_to_zero
) {
3963 float64_unpack_raw(&p
, a
);
3964 if (parts_squash_denormal(p
, status
)) {
3965 return float64_set_sign(float64_zero
, p
.sign
);
3971 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3973 if (status
->flush_inputs_to_zero
) {
3976 bfloat16_unpack_raw(&p
, a
);
3977 if (parts_squash_denormal(p
, status
)) {
3978 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3984 /*----------------------------------------------------------------------------
3985 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3986 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3987 | input. If `zSign' is 1, the input is negated before being converted to an
3988 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3989 | is simply rounded to an integer, with the inexact exception raised if the
3990 | input cannot be represented exactly as an integer. However, if the fixed-
3991 | point input is too large, the invalid exception is raised and the largest
3992 | positive or negative integer is returned.
3993 *----------------------------------------------------------------------------*/
3995 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3996 float_status
*status
)
3998 int8_t roundingMode
;
3999 bool roundNearestEven
;
4000 int8_t roundIncrement
, roundBits
;
4003 roundingMode
= status
->float_rounding_mode
;
4004 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4005 switch (roundingMode
) {
4006 case float_round_nearest_even
:
4007 case float_round_ties_away
:
4008 roundIncrement
= 0x40;
4010 case float_round_to_zero
:
4013 case float_round_up
:
4014 roundIncrement
= zSign
? 0 : 0x7f;
4016 case float_round_down
:
4017 roundIncrement
= zSign
? 0x7f : 0;
4019 case float_round_to_odd
:
4020 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
4025 roundBits
= absZ
& 0x7F;
4026 absZ
= ( absZ
+ roundIncrement
)>>7;
4027 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4031 if ( zSign
) z
= - z
;
4032 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
4033 float_raise(float_flag_invalid
, status
);
4034 return zSign
? INT32_MIN
: INT32_MAX
;
4037 float_raise(float_flag_inexact
, status
);
4043 /*----------------------------------------------------------------------------
4044 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4045 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4046 | and returns the properly rounded 64-bit integer corresponding to the input.
4047 | If `zSign' is 1, the input is negated before being converted to an integer.
4048 | Ordinarily, the fixed-point input is simply rounded to an integer, with
4049 | the inexact exception raised if the input cannot be represented exactly as
4050 | an integer. However, if the fixed-point input is too large, the invalid
4051 | exception is raised and the largest positive or negative integer is
4053 *----------------------------------------------------------------------------*/
4055 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
4056 float_status
*status
)
4058 int8_t roundingMode
;
4059 bool roundNearestEven
, increment
;
4062 roundingMode
= status
->float_rounding_mode
;
4063 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4064 switch (roundingMode
) {
4065 case float_round_nearest_even
:
4066 case float_round_ties_away
:
4067 increment
= ((int64_t) absZ1
< 0);
4069 case float_round_to_zero
:
4072 case float_round_up
:
4073 increment
= !zSign
&& absZ1
;
4075 case float_round_down
:
4076 increment
= zSign
&& absZ1
;
4078 case float_round_to_odd
:
4079 increment
= !(absZ0
& 1) && absZ1
;
4086 if ( absZ0
== 0 ) goto overflow
;
4087 if (!(absZ1
<< 1) && roundNearestEven
) {
4092 if ( zSign
) z
= - z
;
4093 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
4095 float_raise(float_flag_invalid
, status
);
4096 return zSign
? INT64_MIN
: INT64_MAX
;
4099 float_raise(float_flag_inexact
, status
);
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'.  The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* Shift the leading 1 into the implicit-bit position (bit 23 + 7). */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4123 /*----------------------------------------------------------------------------
4124 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4125 | and significand `zSig', and returns the proper single-precision floating-
4126 | point value corresponding to the abstract input. Ordinarily, the abstract
4127 | value is simply rounded and packed into the single-precision format, with
4128 | the inexact exception raised if the abstract input cannot be represented
4129 | exactly. However, if the abstract value is too large, the overflow and
4130 | inexact exceptions are raised and an infinity or maximal finite value is
4131 | returned. If the abstract value is too small, the input value is rounded to
4132 | a subnormal number, and the underflow and inexact exceptions are raised if
4133 | the abstract input cannot be represented exactly as a subnormal single-
4134 | precision floating-point number.
4135 | The input significand `zSig' has its binary point between bits 30
4136 | and 29, which is 7 bits to the left of the usual location. This shifted
4137 | significand must be normalized or smaller. If `zSig' is not normalized,
4138 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4139 | and it must not require rounding. In the usual case that `zSig' is
4140 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4141 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4142 | Binary Floating-Point Arithmetic.
4143 *----------------------------------------------------------------------------*/
4145 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4146 float_status
*status
)
4148 int8_t roundingMode
;
4149 bool roundNearestEven
;
4150 int8_t roundIncrement
, roundBits
;
4153 roundingMode
= status
->float_rounding_mode
;
4154 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4155 switch (roundingMode
) {
4156 case float_round_nearest_even
:
4157 case float_round_ties_away
:
4158 roundIncrement
= 0x40;
4160 case float_round_to_zero
:
4163 case float_round_up
:
4164 roundIncrement
= zSign
? 0 : 0x7f;
4166 case float_round_down
:
4167 roundIncrement
= zSign
? 0x7f : 0;
4169 case float_round_to_odd
:
4170 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4176 roundBits
= zSig
& 0x7F;
4177 if ( 0xFD <= (uint16_t) zExp
) {
4178 if ( ( 0xFD < zExp
)
4179 || ( ( zExp
== 0xFD )
4180 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
4182 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4183 roundIncrement
!= 0;
4184 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4185 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
4188 if (status
->flush_to_zero
) {
4189 float_raise(float_flag_output_denormal
, status
);
4190 return packFloat32(zSign
, 0, 0);
4192 isTiny
= status
->tininess_before_rounding
4194 || (zSig
+ roundIncrement
< 0x80000000);
4195 shift32RightJamming( zSig
, - zExp
, &zSig
);
4197 roundBits
= zSig
& 0x7F;
4198 if (isTiny
&& roundBits
) {
4199 float_raise(float_flag_underflow
, status
);
4201 if (roundingMode
== float_round_to_odd
) {
4203 * For round-to-odd case, the roundIncrement depends on
4204 * zSig which just changed.
4206 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4211 float_raise(float_flag_inexact
, status
);
4213 zSig
= ( zSig
+ roundIncrement
)>>7;
4214 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4217 if ( zSig
== 0 ) zExp
= 0;
4218 return packFloat32( zSign
, zExp
, zSig
);
4222 /*----------------------------------------------------------------------------
4223 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4224 | and significand `zSig', and returns the proper single-precision floating-
4225 | point value corresponding to the abstract input. This routine is just like
4226 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4227 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4228 | floating-point exponent.
4229 *----------------------------------------------------------------------------*/
4232 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4233 float_status
*status
)
4237 shiftCount
= clz32(zSig
) - 1;
4238 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'.  The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* Shift the leading 1 into the implicit-bit position (bit 52 + 10). */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4261 /*----------------------------------------------------------------------------
4262 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4263 | double-precision floating-point value, returning the result. After being
4264 | shifted into the proper positions, the three fields are simply added
4265 | together to form the result. This means that any integer portion of `zSig'
4266 | will be added into the exponent. Since a properly normalized significand
4267 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4268 | than the desired result exponent whenever `zSig' is a complete, normalized
4270 *----------------------------------------------------------------------------*/
4272 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4275 return make_float64(
4276 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4280 /*----------------------------------------------------------------------------
4281 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4282 | and significand `zSig', and returns the proper double-precision floating-
4283 | point value corresponding to the abstract input. Ordinarily, the abstract
4284 | value is simply rounded and packed into the double-precision format, with
4285 | the inexact exception raised if the abstract input cannot be represented
4286 | exactly. However, if the abstract value is too large, the overflow and
4287 | inexact exceptions are raised and an infinity or maximal finite value is
4288 | returned. If the abstract value is too small, the input value is rounded to
4289 | a subnormal number, and the underflow and inexact exceptions are raised if
4290 | the abstract input cannot be represented exactly as a subnormal double-
4291 | precision floating-point number.
4292 | The input significand `zSig' has its binary point between bits 62
4293 | and 61, which is 10 bits to the left of the usual location. This shifted
4294 | significand must be normalized or smaller. If `zSig' is not normalized,
4295 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4296 | and it must not require rounding. In the usual case that `zSig' is
4297 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4298 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4299 | Binary Floating-Point Arithmetic.
4300 *----------------------------------------------------------------------------*/
4302 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4303 float_status
*status
)
4305 int8_t roundingMode
;
4306 bool roundNearestEven
;
4307 int roundIncrement
, roundBits
;
4310 roundingMode
= status
->float_rounding_mode
;
4311 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4312 switch (roundingMode
) {
4313 case float_round_nearest_even
:
4314 case float_round_ties_away
:
4315 roundIncrement
= 0x200;
4317 case float_round_to_zero
:
4320 case float_round_up
:
4321 roundIncrement
= zSign
? 0 : 0x3ff;
4323 case float_round_down
:
4324 roundIncrement
= zSign
? 0x3ff : 0;
4326 case float_round_to_odd
:
4327 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4332 roundBits
= zSig
& 0x3FF;
4333 if ( 0x7FD <= (uint16_t) zExp
) {
4334 if ( ( 0x7FD < zExp
)
4335 || ( ( zExp
== 0x7FD )
4336 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
4338 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4339 roundIncrement
!= 0;
4340 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4341 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
4344 if (status
->flush_to_zero
) {
4345 float_raise(float_flag_output_denormal
, status
);
4346 return packFloat64(zSign
, 0, 0);
4348 isTiny
= status
->tininess_before_rounding
4350 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
4351 shift64RightJamming( zSig
, - zExp
, &zSig
);
4353 roundBits
= zSig
& 0x3FF;
4354 if (isTiny
&& roundBits
) {
4355 float_raise(float_flag_underflow
, status
);
4357 if (roundingMode
== float_round_to_odd
) {
4359 * For round-to-odd case, the roundIncrement depends on
4360 * zSig which just changed.
4362 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4367 float_raise(float_flag_inexact
, status
);
4369 zSig
= ( zSig
+ roundIncrement
)>>10;
4370 if (!(roundBits
^ 0x200) && roundNearestEven
) {
4373 if ( zSig
== 0 ) zExp
= 0;
4374 return packFloat64( zSign
, zExp
, zSig
);
4378 /*----------------------------------------------------------------------------
4379 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4380 | and significand `zSig', and returns the proper double-precision floating-
4381 | point value corresponding to the abstract input. This routine is just like
4382 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4383 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4384 | floating-point exponent.
4385 *----------------------------------------------------------------------------*/
4388 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4389 float_status
*status
)
4393 shiftCount
= clz64(zSig
) - 1;
4394 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* floatx80 has an explicit integer bit, so normalize to bit 63. */
    shiftCount = clz64(aSig);
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4416 /*----------------------------------------------------------------------------
4417 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4418 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4419 | and returns the proper extended double-precision floating-point value
4420 | corresponding to the abstract input. Ordinarily, the abstract value is
4421 | rounded and packed into the extended double-precision format, with the
4422 | inexact exception raised if the abstract input cannot be represented
4423 | exactly. However, if the abstract value is too large, the overflow and
4424 | inexact exceptions are raised and an infinity or maximal finite value is
4425 | returned. If the abstract value is too small, the input value is rounded to
4426 | a subnormal number, and the underflow and inexact exceptions are raised if
4427 | the abstract input cannot be represented exactly as a subnormal extended
4428 | double-precision floating-point number.
4429 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4430 | number of bits as single or double precision, respectively. Otherwise, the
4431 | result is rounded to the full precision of the extended double-precision
4433 | The input significand must be normalized or smaller. If the input
4434 | significand is not normalized, `zExp' must be 0; in that case, the result
4435 | returned is a subnormal number, and it must not require rounding. The
4436 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4437 | Floating-Point Arithmetic.
4438 *----------------------------------------------------------------------------*/
4440 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
4441 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
4442 float_status
*status
)
4444 int8_t roundingMode
;
4445 bool roundNearestEven
, increment
, isTiny
;
4446 int64_t roundIncrement
, roundMask
, roundBits
;
4448 roundingMode
= status
->float_rounding_mode
;
4449 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4450 if ( roundingPrecision
== 80 ) goto precision80
;
4451 if ( roundingPrecision
== 64 ) {
4452 roundIncrement
= UINT64_C(0x0000000000000400);
4453 roundMask
= UINT64_C(0x00000000000007FF);
4455 else if ( roundingPrecision
== 32 ) {
4456 roundIncrement
= UINT64_C(0x0000008000000000);
4457 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
4462 zSig0
|= ( zSig1
!= 0 );
4463 switch (roundingMode
) {
4464 case float_round_nearest_even
:
4465 case float_round_ties_away
:
4467 case float_round_to_zero
:
4470 case float_round_up
:
4471 roundIncrement
= zSign
? 0 : roundMask
;
4473 case float_round_down
:
4474 roundIncrement
= zSign
? roundMask
: 0;
4479 roundBits
= zSig0
& roundMask
;
4480 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4481 if ( ( 0x7FFE < zExp
)
4482 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
4487 if (status
->flush_to_zero
) {
4488 float_raise(float_flag_output_denormal
, status
);
4489 return packFloatx80(zSign
, 0, 0);
4491 isTiny
= status
->tininess_before_rounding
4493 || (zSig0
<= zSig0
+ roundIncrement
);
4494 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
4496 roundBits
= zSig0
& roundMask
;
4497 if (isTiny
&& roundBits
) {
4498 float_raise(float_flag_underflow
, status
);
4501 float_raise(float_flag_inexact
, status
);
4503 zSig0
+= roundIncrement
;
4504 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4505 roundIncrement
= roundMask
+ 1;
4506 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4507 roundMask
|= roundIncrement
;
4509 zSig0
&= ~ roundMask
;
4510 return packFloatx80( zSign
, zExp
, zSig0
);
4514 float_raise(float_flag_inexact
, status
);
4516 zSig0
+= roundIncrement
;
4517 if ( zSig0
< roundIncrement
) {
4519 zSig0
= UINT64_C(0x8000000000000000);
4521 roundIncrement
= roundMask
+ 1;
4522 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4523 roundMask
|= roundIncrement
;
4525 zSig0
&= ~ roundMask
;
4526 if ( zSig0
== 0 ) zExp
= 0;
4527 return packFloatx80( zSign
, zExp
, zSig0
);
4529 switch (roundingMode
) {
4530 case float_round_nearest_even
:
4531 case float_round_ties_away
:
4532 increment
= ((int64_t)zSig1
< 0);
4534 case float_round_to_zero
:
4537 case float_round_up
:
4538 increment
= !zSign
&& zSig1
;
4540 case float_round_down
:
4541 increment
= zSign
&& zSig1
;
4546 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4547 if ( ( 0x7FFE < zExp
)
4548 || ( ( zExp
== 0x7FFE )
4549 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
4555 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4556 if ( ( roundingMode
== float_round_to_zero
)
4557 || ( zSign
&& ( roundingMode
== float_round_up
) )
4558 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4560 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
4562 return packFloatx80(zSign
,
4563 floatx80_infinity_high
,
4564 floatx80_infinity_low
);
4567 isTiny
= status
->tininess_before_rounding
4570 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
4571 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4573 if (isTiny
&& zSig1
) {
4574 float_raise(float_flag_underflow
, status
);
4577 float_raise(float_flag_inexact
, status
);
4579 switch (roundingMode
) {
4580 case float_round_nearest_even
:
4581 case float_round_ties_away
:
4582 increment
= ((int64_t)zSig1
< 0);
4584 case float_round_to_zero
:
4587 case float_round_up
:
4588 increment
= !zSign
&& zSig1
;
4590 case float_round_down
:
4591 increment
= zSign
&& zSig1
;
4598 if (!(zSig1
<< 1) && roundNearestEven
) {
4601 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4603 return packFloatx80( zSign
, zExp
, zSig0
);
4607 float_raise(float_flag_inexact
, status
);
4613 zSig0
= UINT64_C(0x8000000000000000);
4616 if (!(zSig1
<< 1) && roundNearestEven
) {
4622 if ( zSig0
== 0 ) zExp
= 0;
4624 return packFloatx80( zSign
, zExp
, zSig0
);
4628 /*----------------------------------------------------------------------------
4629 | Takes an abstract floating-point value having sign `zSign', exponent
4630 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4631 | and returns the proper extended double-precision floating-point value
4632 | corresponding to the abstract input. This routine is just like
4633 | `roundAndPackFloatx80' except that the input significand does not have to be
4635 *----------------------------------------------------------------------------*/
4637 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4638 bool zSign
, int32_t zExp
,
4639 uint64_t zSig0
, uint64_t zSig1
,
4640 float_status
*status
)
4649 shiftCount
= clz64(zSig0
);
4650 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4652 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4653 zSig0
, zSig1
, status
);
4657 /*----------------------------------------------------------------------------
4658 | Returns the least-significant 64 fraction bits of the quadruple-precision
4659 | floating-point value `a'.
4660 *----------------------------------------------------------------------------*/
4662 static inline uint64_t extractFloat128Frac1( float128 a
)
4669 /*----------------------------------------------------------------------------
4670 | Returns the most-significant 48 fraction bits of the quadruple-precision
4671 | floating-point value `a'.
4672 *----------------------------------------------------------------------------*/
4674 static inline uint64_t extractFloat128Frac0( float128 a
)
4677 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4681 /*----------------------------------------------------------------------------
4682 | Returns the exponent bits of the quadruple-precision floating-point value
4684 *----------------------------------------------------------------------------*/
4686 static inline int32_t extractFloat128Exp( float128 a
)
4689 return ( a
.high
>>48 ) & 0x7FFF;
4693 /*----------------------------------------------------------------------------
4694 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4695 *----------------------------------------------------------------------------*/
4697 static inline bool extractFloat128Sign(float128 a
)
4699 return a
.high
>> 63;
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value
| represented by the denormalized significand formed by the concatenation of
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
| significand are stored at the location pointed to by `zSig0Ptr', and the
| least significant 64 bits of the normalized significand are stored at the
| location pointed to by `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if (aSig0 == 0) {
        /* The leading 1 is somewhere in the low word. */
        shiftCount = clz64(aSig1) - 15;
        if (shiftCount < 0) {
            *zSig0Ptr = aSig1 >> (-shiftCount);
            *zSig1Ptr = aSig1 << (shiftCount & 63);
        } else {
            *zSig0Ptr = aSig1 << shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = -shiftCount - 63;
    } else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shiftCount;
    }
}
4743 /*----------------------------------------------------------------------------
4744 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4745 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4746 | floating-point value, returning the result. After being shifted into the
4747 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4748 | added together to form the most significant 32 bits of the result. This
4749 | means that any integer portion of `zSig0' will be added into the exponent.
4750 | Since a properly normalized significand will have an integer portion equal
4751 | to 1, the `zExp' input should be 1 less than the desired result exponent
4752 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4754 *----------------------------------------------------------------------------*/
4756 static inline float128
4757 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4762 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4766 /*----------------------------------------------------------------------------
4767 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4768 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4769 | and `zSig2', and returns the proper quadruple-precision floating-point value
4770 | corresponding to the abstract input. Ordinarily, the abstract value is
4771 | simply rounded and packed into the quadruple-precision format, with the
4772 | inexact exception raised if the abstract input cannot be represented
4773 | exactly. However, if the abstract value is too large, the overflow and
4774 | inexact exceptions are raised and an infinity or maximal finite value is
4775 | returned. If the abstract value is too small, the input value is rounded to
4776 | a subnormal number, and the underflow and inexact exceptions are raised if
4777 | the abstract input cannot be represented exactly as a subnormal quadruple-
4778 | precision floating-point number.
4779 | The input significand must be normalized or smaller. If the input
4780 | significand is not normalized, `zExp' must be 0; in that case, the result
4781 | returned is a subnormal number, and it must not require rounding. In the
4782 | usual case that the input significand is normalized, `zExp' must be 1 less
4783 | than the ``true'' floating-point exponent. The handling of underflow and
4784 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4785 *----------------------------------------------------------------------------*/
/*
 * Rounds and packs an abstract quad-precision value (see contract in the
 * comment block above): zSig2 holds the round/sticky bits, roundingMode is
 * taken from *status, and overflow / underflow / inexact flags are raised
 * as required. NOTE(review): extract is line-mangled and missing lines
 * (break statements, braces, some branches); tokens preserved byte-for-byte.
 */
4787 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4788 uint64_t zSig0
, uint64_t zSig1
,
4789 uint64_t zSig2
, float_status
*status
)
4791 int8_t roundingMode
;
4792 bool roundNearestEven
, increment
, isTiny
;
4794 roundingMode
= status
->float_rounding_mode
;
4795 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
/* Decide whether the extra bits (zSig2) force an increment of zSig0:zSig1. */
4796 switch (roundingMode
) {
4797 case float_round_nearest_even
:
4798 case float_round_ties_away
:
4799 increment
= ((int64_t)zSig2
< 0);
4801 case float_round_to_zero
:
4804 case float_round_up
:
4805 increment
= !zSign
&& zSig2
;
4807 case float_round_down
:
4808 increment
= zSign
&& zSig2
;
4810 case float_round_to_odd
:
4811 increment
= !(zSig1
& 0x1) && zSig2
;
/* Overflow / underflow detection: 0x7FFD is the largest biased exponent
 * that can still round up without overflowing; negative zExp appears huge
 * after the uint32_t cast and is handled as (possible) underflow below. */
4816 if ( 0x7FFD <= (uint32_t) zExp
) {
4817 if ( ( 0x7FFD < zExp
)
4818 || ( ( zExp
== 0x7FFD )
4820 UINT64_C(0x0001FFFFFFFFFFFF),
4821 UINT64_C(0xFFFFFFFFFFFFFFFF),
4828 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
/* Rounding modes that truncate toward the max-magnitude finite value
 * return the largest finite number instead of infinity. */
4829 if ( ( roundingMode
== float_round_to_zero
)
4830 || ( zSign
&& ( roundingMode
== float_round_up
) )
4831 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4832 || (roundingMode
== float_round_to_odd
)
4838 UINT64_C(0x0000FFFFFFFFFFFF),
4839 UINT64_C(0xFFFFFFFFFFFFFFFF)
4842 return packFloat128( zSign
, 0x7FFF, 0, 0 );
/* Underflow path: optionally flush to zero, else denormalize. */
4845 if (status
->flush_to_zero
) {
4846 float_raise(float_flag_output_denormal
, status
);
4847 return packFloat128(zSign
, 0, 0, 0);
4849 isTiny
= status
->tininess_before_rounding
4852 || lt128(zSig0
, zSig1
,
4853 UINT64_C(0x0001FFFFFFFFFFFF),
4854 UINT64_C(0xFFFFFFFFFFFFFFFF));
4855 shift128ExtraRightJamming(
4856 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4858 if (isTiny
&& zSig2
) {
4859 float_raise(float_flag_underflow
, status
);
/* Recompute the increment decision after denormalizing shifted new bits
 * into zSig2. */
4861 switch (roundingMode
) {
4862 case float_round_nearest_even
:
4863 case float_round_ties_away
:
4864 increment
= ((int64_t)zSig2
< 0);
4866 case float_round_to_zero
:
4869 case float_round_up
:
4870 increment
= !zSign
&& zSig2
;
4872 case float_round_down
:
4873 increment
= zSign
&& zSig2
;
4875 case float_round_to_odd
:
4876 increment
= !(zSig1
& 0x1) && zSig2
;
4884 float_raise(float_flag_inexact
, status
);
/* Apply the increment; on an exact tie under round-to-nearest-even,
 * clear the low bit to make the result even. */
4887 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4888 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4893 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4895 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4899 /*----------------------------------------------------------------------------
4900 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4901 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4902 | returns the proper quadruple-precision floating-point value corresponding
4903 | to the abstract input. This routine is just like `roundAndPackFloat128'
4904 | except that the input significand has fewer bits and does not have to be
4905 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4907 *----------------------------------------------------------------------------*/
/*
 * Normalizes the (possibly unnormalized) significand zSig0:zSig1, then
 * defers to roundAndPackFloat128 (see comment block above). zExp is
 * adjusted by the normalizing shift before the call.
 * NOTE(review): extract is line-mangled; the zExp -= shiftCount adjustment,
 * zSig2 declaration and else/brace lines are missing from this view.
 */
4909 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4910 uint64_t zSig0
, uint64_t zSig1
,
4911 float_status
*status
)
4921 shiftCount
= clz64(zSig0
) - 15;
4922 if ( 0 <= shiftCount
) {
/* Significand fits after a left shift; no bits are lost (zSig2 = 0). */
4924 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
/* Significand too wide: shift right, jamming lost bits into zSig2. */
4927 shift128ExtraRightJamming(
4928 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4931 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4936 /*----------------------------------------------------------------------------
4937 | Returns the result of converting the 32-bit two's complement integer `a'
4938 | to the extended double-precision floating-point format. The conversion
4939 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4941 *----------------------------------------------------------------------------*/
/*
 * Converts a 32-bit signed integer to floatx80 (see comment block above).
 * The conversion is always exact. NOTE(review): extract is line-mangled;
 * the local declarations and the zSig = absA widening assignment are
 * missing from this view.
 */
4943 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4950 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
/* Take the magnitude; zSign (assigned on a missing line) holds a < 0. */
4952 absA
= zSign
? - a
: a
;
/* +32: absA is at most 32 bits wide inside the 64-bit significand. */
4953 shiftCount
= clz32(absA
) + 32;
4955 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4959 /*----------------------------------------------------------------------------
4960 | Returns the result of converting the 32-bit two's complement integer `a' to
4961 | the quadruple-precision floating-point format. The conversion is performed
4962 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4963 *----------------------------------------------------------------------------*/
/*
 * Converts a 32-bit signed integer to float128 (see comment block above).
 * Always exact: the value fits entirely in zSig0.
 * NOTE(review): extract is line-mangled; local declarations and the
 * zSig0 = absA assignment are missing from this view.
 */
4965 float128
int32_to_float128(int32_t a
, float_status
*status
)
4972 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4974 absA
= zSign
? - a
: a
;
/* +17: position the leading bit at bit 48 of the high significand word. */
4975 shiftCount
= clz32(absA
) + 17;
4977 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4981 /*----------------------------------------------------------------------------
4982 | Returns the result of converting the 64-bit two's complement integer `a'
4983 | to the extended double-precision floating-point format. The conversion
4984 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4986 *----------------------------------------------------------------------------*/
/*
 * Converts a 64-bit signed integer to floatx80 (see comment block above).
 * Always exact, since floatx80 has a full 64-bit significand.
 * NOTE(review): extract is line-mangled; local declarations are missing
 * from this view.
 */
4988 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4994 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4996 absA
= zSign
? - a
: a
;
/* Normalize so the explicit integer bit (bit 63) is set. */
4997 shiftCount
= clz64(absA
);
4998 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
5002 /*----------------------------------------------------------------------------
5003 | Returns the result of converting the 64-bit two's complement integer `a' to
5004 | the quadruple-precision floating-point format. The conversion is performed
5005 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5006 *----------------------------------------------------------------------------*/
/*
 * Converts a 64-bit signed integer to float128 (see comment block above).
 * Always exact. NOTE(review): extract is line-mangled; the bodies of the
 * shiftCount >= 64 branch and the else branch (which seed zSig0/zSig1
 * before the final shift) are missing from this view.
 */
5008 float128
int64_to_float128(int64_t a
, float_status
*status
)
5014 uint64_t zSig0
, zSig1
;
5016 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
5018 absA
= zSign
? - a
: a
;
/* +49: total left shift needed to align absA's leading bit at bit 48
 * of the high word of the 128-bit significand. */
5019 shiftCount
= clz64(absA
) + 49;
5020 zExp
= 0x406E - shiftCount
;
5021 if ( 64 <= shiftCount
) {
5030 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
5031 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
5035 /*----------------------------------------------------------------------------
5036 | Returns the result of converting the 64-bit unsigned integer `a'
5037 | to the quadruple-precision floating-point format. The conversion is performed
5038 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5039 *----------------------------------------------------------------------------*/
/*
 * Converts a 64-bit unsigned integer to float128 (see comment block above).
 * Delegates normalization to normalizeRoundAndPackFloat128 with the value
 * placed in the low significand word. NOTE(review): extract is line-mangled;
 * the `if (a == 0)` guard line before the float128_zero return is missing
 * from this view.
 */
5041 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
5044 return float128_zero
;
5046 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
5049 /*----------------------------------------------------------------------------
5050 | Returns the result of converting the single-precision floating-point value
5051 | `a' to the extended double-precision floating-point format. The conversion
5052 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5054 *----------------------------------------------------------------------------*/
/*
 * Converts a single-precision value to floatx80 (see comment block above).
 * NaNs are routed through the commonNaN machinery and re-silenced;
 * infinities, zeros and subnormals get their own paths; finite values are
 * widened exactly. NOTE(review): extract is line-mangled; brace/else lines
 * are missing from this view.
 */
5056 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
5062 a
= float32_squash_input_denormal(a
, status
);
5063 aSig
= extractFloat32Frac( a
);
5064 aExp
= extractFloat32Exp( a
);
5065 aSign
= extractFloat32Sign( a
);
/* Exponent 0xFF: NaN (nonzero fraction) or infinity. */
5066 if ( aExp
== 0xFF ) {
5068 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
5070 return floatx80_silence_nan(res
, status
);
5072 return packFloatx80(aSign
,
5073 floatx80_infinity_high
,
5074 floatx80_infinity_low
);
/* Exponent 0: zero or subnormal; normalize the latter. */
5077 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5078 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
/* Rebias (0x3F80 = 0x3FFF - 0x7F) and left-justify the 24-bit significand
 * (23 fraction bits + implicit integer bit) into 64 bits. */
5081 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
5085 /*----------------------------------------------------------------------------
5086 | Returns the remainder of the single-precision floating-point value `a'
5087 | with respect to the corresponding value `b'. The operation is performed
5088 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5089 *----------------------------------------------------------------------------*/
/*
 * IEEE remainder of single-precision a with respect to b (see comment
 * block above). Uses repeated estimated division to reduce the exponent
 * difference, then picks between the last two partial remainders so the
 * result is the one nearest zero (ties to even quotient).
 * NOTE(review): extract is line-mangled and missing many lines (braces,
 * some declarations such as q/sigMean/zSign, loop headers); tokens are
 * preserved byte-for-byte.
 */
5091 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
5094 int aExp
, bExp
, expDiff
;
5095 uint32_t aSig
, bSig
;
5097 uint64_t aSig64
, bSig64
, q64
;
5098 uint32_t alternateASig
;
5100 a
= float32_squash_input_denormal(a
, status
);
5101 b
= float32_squash_input_denormal(b
, status
);
5103 aSig
= extractFloat32Frac( a
);
5104 aExp
= extractFloat32Exp( a
);
5105 aSign
= extractFloat32Sign( a
);
5106 bSig
= extractFloat32Frac( b
);
5107 bExp
= extractFloat32Exp( b
);
/* Special operands: NaN propagation, rem(inf, x) and rem(x, 0) are
 * invalid and yield the default NaN. */
5108 if ( aExp
== 0xFF ) {
5109 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
5110 return propagateFloat32NaN(a
, b
, status
);
5112 float_raise(float_flag_invalid
, status
);
5113 return float32_default_nan(status
);
5115 if ( bExp
== 0xFF ) {
5117 return propagateFloat32NaN(a
, b
, status
);
5123 float_raise(float_flag_invalid
, status
);
5124 return float32_default_nan(status
);
5126 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
5129 if ( aSig
== 0 ) return a
;
5130 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5132 expDiff
= aExp
- bExp
;
/* Small exponent difference: one direct division step suffices. */
5135 if ( expDiff
< 32 ) {
5138 if ( expDiff
< 0 ) {
5139 if ( expDiff
< -1 ) return a
;
5142 q
= ( bSig
<= aSig
);
5143 if ( q
) aSig
-= bSig
;
5144 if ( 0 < expDiff
) {
5145 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
5148 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
/* Large exponent difference: iterate 62-bit reduction steps using the
 * 128/64 division estimate (q64 is deliberately under-estimated by 2
 * so the remainder stays non-negative). */
5156 if ( bSig
<= aSig
) aSig
-= bSig
;
5157 aSig64
= ( (uint64_t) aSig
)<<40;
5158 bSig64
= ( (uint64_t) bSig
)<<40;
5160 while ( 0 < expDiff
) {
5161 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5162 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5163 aSig64
= - ( ( bSig
* q64
)<<38 );
5167 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5168 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5169 q
= q64
>>( 64 - expDiff
);
5171 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
/* Final correction loop: step back until the remainder is non-negative,
 * remembering the previous (alternate) candidate. */
5174 alternateASig
= aSig
;
5177 } while ( 0 <= (int32_t) aSig
);
/* Choose the remainder nearest zero; on an exact tie pick the one with
 * even quotient (q & 1). */
5178 sigMean
= aSig
+ alternateASig
;
5179 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5180 aSig
= alternateASig
;
5182 zSign
= ( (int32_t) aSig
< 0 );
5183 if ( zSign
) aSig
= - aSig
;
5184 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5189 /*----------------------------------------------------------------------------
5190 | Returns the binary exponential of the single-precision floating-point value
5191 | `a'. The operation is performed according to the IEC/IEEE Standard for
5192 | Binary Floating-Point Arithmetic.
5194 | Uses the following identities:
5196 | 1. -------------------------------------------------------------------------
5200 | 2. -------------------------------------------------------------------------
5203 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5205 *----------------------------------------------------------------------------*/
/*
 * Taylor-series coefficients for float32_exp2 below: entry k (0-based)
 * is the double-precision value 1/(k+1)! used in the e^x expansion
 * described in the comment block above this table.
 */
5207 static const float64 float32_exp2_coefficients
[15] =
5209 const_float64( 0x3ff0000000000000ll
), /* 1 */
5210 const_float64( 0x3fe0000000000000ll
), /* 2 */
5211 const_float64( 0x3fc5555555555555ll
), /* 3 */
5212 const_float64( 0x3fa5555555555555ll
), /* 4 */
5213 const_float64( 0x3f81111111111111ll
), /* 5 */
5214 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5215 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5216 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5217 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5218 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5219 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5220 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5221 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5222 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5223 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
/*
 * Computes 2^a for single-precision a (see comment block above) by
 * evaluating e^(a*ln2) as a 15-term Taylor series in double precision,
 * then rounding back to float32. Always inexact for finite nonzero input.
 * NOTE(review): extract is line-mangled; the declarations of x/r/xn/f/i
 * and several brace/initialization lines are missing from this view.
 */
5226 float32
float32_exp2(float32 a
, float_status
*status
)
5233 a
= float32_squash_input_denormal(a
, status
);
5235 aSig
= extractFloat32Frac( a
);
5236 aExp
= extractFloat32Exp( a
);
5237 aSign
= extractFloat32Sign( a
);
/* NaN propagates; 2^-inf = 0, 2^+inf = +inf (returned as a). */
5239 if ( aExp
== 0xFF) {
5241 return propagateFloat32NaN(a
, float32_zero
, status
);
5243 return (aSign
) ? float32_zero
: a
;
/* 2^0 == 1 exactly. */
5246 if (aSig
== 0) return float32_one
;
5249 float_raise(float_flag_inexact
, status
);
5251 /* ******************************* */
5252 /* using float64 for approximation */
5253 /* ******************************* */
5254 x
= float32_to_float64(a
, status
);
5255 x
= float64_mul(x
, float64_ln2
, status
);
/* Accumulate r = sum xn * 1/(i+1)!, with xn tracking x^(i+1). */
5259 for (i
= 0 ; i
< 15 ; i
++) {
5262 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5263 r
= float64_add(r
, f
, status
);
5265 xn
= float64_mul(xn
, x
, status
);
5268 return float64_to_float32(r
, status
);
5271 /*----------------------------------------------------------------------------
5272 | Returns the binary log of the single-precision floating-point value `a'.
5273 | The operation is performed according to the IEC/IEEE Standard for Binary
5274 | Floating-Point Arithmetic.
5275 *----------------------------------------------------------------------------*/
/*
 * Computes log2 of single-precision a (see comment block above) by
 * repeated squaring of the significand, extracting one result bit per
 * iteration. NOTE(review): extract is line-mangled and missing lines
 * (the zero / negative-input guards, the zSig accumulation inside the
 * loop, and brace/else lines); tokens are preserved byte-for-byte.
 */
5276 float32
float32_log2(float32 a
, float_status
*status
)
5280 uint32_t aSig
, zSig
, i
;
5282 a
= float32_squash_input_denormal(a
, status
);
5283 aSig
= extractFloat32Frac( a
);
5284 aExp
= extractFloat32Exp( a
);
5285 aSign
= extractFloat32Sign( a
);
/* log2(+0/-0) = -inf (packFloat32 with sign 1, exponent 0xFF). */
5288 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5289 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
/* Negative input: invalid operation -> default NaN. */
5292 float_raise(float_flag_invalid
, status
);
5293 return float32_default_nan(status
);
5295 if ( aExp
== 0xFF ) {
5297 return propagateFloat32NaN(a
, float32_zero
, status
);
/* Bit-by-bit log: square the fixed-point significand; a carry into bit
 * 24 yields a 1 bit of the fractional log2. */
5307 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5308 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5309 if ( aSig
& 0x01000000 ) {
5318 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5321 /*----------------------------------------------------------------------------
5322 | Returns the result of converting the double-precision floating-point value
5323 | `a' to the extended double-precision floating-point format. The conversion
5324 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5326 *----------------------------------------------------------------------------*/
/*
 * Converts a double-precision value to floatx80 (see comment block above).
 * Mirrors float32_to_floatx80: NaN via commonNaN machinery, explicit
 * infinity/zero/subnormal paths, exact widening for finite values.
 * NOTE(review): extract is line-mangled; brace/else lines and the
 * packFloatx80( opening of the final return are missing from this view.
 */
5328 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5334 a
= float64_squash_input_denormal(a
, status
);
5335 aSig
= extractFloat64Frac( a
);
5336 aExp
= extractFloat64Exp( a
);
5337 aSign
= extractFloat64Sign( a
);
5338 if ( aExp
== 0x7FF ) {
5340 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5342 return floatx80_silence_nan(res
, status
);
5344 return packFloatx80(aSign
,
5345 floatx80_infinity_high
,
5346 floatx80_infinity_low
);
5349 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5350 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
/* Rebias (0x3C00 = 0x3FFF - 0x3FF); restore the implicit integer bit
 * and left-justify the 53-bit significand into 64 bits. */
5354 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5358 /*----------------------------------------------------------------------------
5359 | Returns the remainder of the double-precision floating-point value `a'
5360 | with respect to the corresponding value `b'. The operation is performed
5361 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5362 *----------------------------------------------------------------------------*/
/*
 * IEEE remainder of double-precision a with respect to b (see comment
 * block above). Same structure as float32_rem: estimated-division
 * reduction of the exponent gap, then nearest-to-zero / ties-to-even
 * selection between the last two partial remainders.
 * NOTE(review): extract is line-mangled and missing lines (braces,
 * declarations of sigMean/zSign, loop headers); tokens preserved
 * byte-for-byte.
 */
5364 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5367 int aExp
, bExp
, expDiff
;
5368 uint64_t aSig
, bSig
;
5369 uint64_t q
, alternateASig
;
5372 a
= float64_squash_input_denormal(a
, status
);
5373 b
= float64_squash_input_denormal(b
, status
);
5374 aSig
= extractFloat64Frac( a
);
5375 aExp
= extractFloat64Exp( a
);
5376 aSign
= extractFloat64Sign( a
);
5377 bSig
= extractFloat64Frac( b
);
5378 bExp
= extractFloat64Exp( b
);
/* Special operands: NaN propagation; rem(inf, x) and rem(x, 0) are
 * invalid operations producing the default NaN. */
5379 if ( aExp
== 0x7FF ) {
5380 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5381 return propagateFloat64NaN(a
, b
, status
);
5383 float_raise(float_flag_invalid
, status
);
5384 return float64_default_nan(status
);
5386 if ( bExp
== 0x7FF ) {
5388 return propagateFloat64NaN(a
, b
, status
);
5394 float_raise(float_flag_invalid
, status
);
5395 return float64_default_nan(status
);
5397 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5400 if ( aSig
== 0 ) return a
;
5401 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5403 expDiff
= aExp
- bExp
;
/* Restore implicit integer bits and left-justify both significands. */
5404 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5405 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5406 if ( expDiff
< 0 ) {
5407 if ( expDiff
< -1 ) return a
;
5410 q
= ( bSig
<= aSig
);
5411 if ( q
) aSig
-= bSig
;
/* Reduce the exponent gap 62 bits at a time; q is under-estimated by 2
 * so the partial remainder stays non-negative. */
5413 while ( 0 < expDiff
) {
5414 q
= estimateDiv128To64( aSig
, 0, bSig
);
5415 q
= ( 2 < q
) ? q
- 2 : 0;
5416 aSig
= - ( ( bSig
>>2 ) * q
);
5420 if ( 0 < expDiff
) {
5421 q
= estimateDiv128To64( aSig
, 0, bSig
);
5422 q
= ( 2 < q
) ? q
- 2 : 0;
5425 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
/* Correction loop: keep the previous candidate remainder, stop once the
 * current one goes negative. */
5432 alternateASig
= aSig
;
5435 } while ( 0 <= (int64_t) aSig
);
/* Nearest-to-zero selection; ties resolved by even quotient (q & 1). */
5436 sigMean
= aSig
+ alternateASig
;
5437 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5438 aSig
= alternateASig
;
5440 zSign
= ( (int64_t) aSig
< 0 );
5441 if ( zSign
) aSig
= - aSig
;
5442 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5446 /*----------------------------------------------------------------------------
5447 | Returns the binary log of the double-precision floating-point value `a'.
5448 | The operation is performed according to the IEC/IEEE Standard for Binary
5449 | Floating-Point Arithmetic.
5450 *----------------------------------------------------------------------------*/
/*
 * Computes log2 of double-precision a (see comment block above) using
 * the same repeated-squaring bit-extraction scheme as float32_log2, with
 * mul64To128 supplying the 128-bit squares.
 * NOTE(review): extract is line-mangled and missing lines (zero /
 * negative-input guards, zSig accumulation inside the loop, brace/else
 * lines); tokens preserved byte-for-byte.
 */
5451 float64
float64_log2(float64 a
, float_status
*status
)
5455 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5456 a
= float64_squash_input_denormal(a
, status
);
5458 aSig
= extractFloat64Frac( a
);
5459 aExp
= extractFloat64Exp( a
);
5460 aSign
= extractFloat64Sign( a
);
/* log2(+0/-0) = -inf. */
5463 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5464 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
/* Negative input: invalid operation -> default NaN. */
5467 float_raise(float_flag_invalid
, status
);
5468 return float64_default_nan(status
);
5470 if ( aExp
== 0x7FF ) {
5472 return propagateFloat64NaN(a
, float64_zero
, status
);
/* Restore the implicit integer bit before the fixed-point squaring. */
5478 aSig
|= UINT64_C(0x0010000000000000);
5480 zSig
= (uint64_t)aExp
<< 52;
/* One fractional log2 bit per iteration: square, renormalize, test the
 * carry into bit 53. */
5481 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5482 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5483 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5484 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5492 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5495 /*----------------------------------------------------------------------------
5496 | Returns the result of converting the extended double-precision floating-
5497 | point value `a' to the 32-bit two's complement integer format. The
5498 | conversion is performed according to the IEC/IEEE Standard for Binary
5499 | Floating-Point Arithmetic---which means in particular that the conversion
5500 | is rounded according to the current rounding mode. If `a' is a NaN, the
5501 | largest positive integer is returned. Otherwise, if the conversion
5502 | overflows, the largest integer with the same sign as `a' is returned.
5503 *----------------------------------------------------------------------------*/
/*
 * Converts floatx80 to int32 with rounding per the current mode (see
 * comment block above). Invalid encodings raise the invalid flag.
 * NOTE(review): extract is line-mangled; the return after the
 * invalid-encoding raise and some declarations are missing from this view.
 */
5505 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5508 int32_t aExp
, shiftCount
;
5511 if (floatx80_invalid_encoding(a
)) {
5512 float_raise(float_flag_invalid
, status
);
5515 aSig
= extractFloatx80Frac( a
);
5516 aExp
= extractFloatx80Exp( a
);
5517 aSign
= extractFloatx80Sign( a
);
/* NaN (max exponent, nonzero fraction after dropping the integer bit):
 * force positive so roundAndPackInt32 returns the largest positive int. */
5518 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5519 shiftCount
= 0x4037 - aExp
;
5520 if ( shiftCount
<= 0 ) shiftCount
= 1;
5521 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5522 return roundAndPackInt32(aSign
, aSig
, status
);
5526 /*----------------------------------------------------------------------------
5527 | Returns the result of converting the extended double-precision floating-
5528 | point value `a' to the 32-bit two's complement integer format. The
5529 | conversion is performed according to the IEC/IEEE Standard for Binary
5530 | Floating-Point Arithmetic, except that the conversion is always rounded
5531 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5532 | Otherwise, if the conversion overflows, the largest integer with the same
5533 | sign as `a' is returned.
5534 *----------------------------------------------------------------------------*/
/*
 * Converts floatx80 to int32 with truncation (round toward zero) rather
 * than the current rounding mode (see comment block above). Overflow
 * raises invalid and saturates; dropped bits raise inexact.
 * NOTE(review): extract is line-mangled and missing lines (the `goto
 * invalid` path, the savedASig capture, z assignment and final return);
 * tokens preserved byte-for-byte.
 */
5536 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5539 int32_t aExp
, shiftCount
;
5540 uint64_t aSig
, savedASig
;
5543 if (floatx80_invalid_encoding(a
)) {
5544 float_raise(float_flag_invalid
, status
);
5547 aSig
= extractFloatx80Frac( a
);
5548 aExp
= extractFloatx80Exp( a
);
5549 aSign
= extractFloatx80Sign( a
);
/* Exponent above 0x401E cannot fit in 32 bits; NaN is forced positive
 * so the saturation below picks the largest positive integer. */
5550 if ( 0x401E < aExp
) {
5551 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5554 else if ( aExp
< 0x3FFF ) {
/* Magnitude below 1 truncates to 0; nonzero values are inexact. */
5556 float_raise(float_flag_inexact
, status
);
5560 shiftCount
= 0x403E - aExp
;
5562 aSig
>>= shiftCount
;
5564 if ( aSign
) z
= - z
;
/* Sign mismatch after negation indicates overflow of int32. */
5565 if ( ( z
< 0 ) ^ aSign
) {
5567 float_raise(float_flag_invalid
, status
);
5568 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
/* Shifting back and comparing detects any truncated (inexact) bits. */
5570 if ( ( aSig
<<shiftCount
) != savedASig
) {
5571 float_raise(float_flag_inexact
, status
);
5577 /*----------------------------------------------------------------------------
5578 | Returns the result of converting the extended double-precision floating-
5579 | point value `a' to the 64-bit two's complement integer format. The
5580 | conversion is performed according to the IEC/IEEE Standard for Binary
5581 | Floating-Point Arithmetic---which means in particular that the conversion
5582 | is rounded according to the current rounding mode. If `a' is a NaN,
5583 | the largest positive integer is returned. Otherwise, if the conversion
5584 | overflows, the largest integer with the same sign as `a' is returned.
5585 *----------------------------------------------------------------------------*/
/*
 * Converts floatx80 to int64 with rounding per the current mode (see
 * comment block above). Values too large for the 64-bit significand
 * shift raise invalid and saturate by sign / NaN-ness.
 * NOTE(review): extract is line-mangled; the saturation returns inside
 * the shiftCount <= 0 branch and some declarations are missing from
 * this view.
 */
5587 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5590 int32_t aExp
, shiftCount
;
5591 uint64_t aSig
, aSigExtra
;
5593 if (floatx80_invalid_encoding(a
)) {
5594 float_raise(float_flag_invalid
, status
);
5597 aSig
= extractFloatx80Frac( a
);
5598 aExp
= extractFloatx80Exp( a
);
5599 aSign
= extractFloatx80Sign( a
);
5600 shiftCount
= 0x403E - aExp
;
/* shiftCount <= 0: magnitude >= 2^63 -> invalid; positive values and
 * NaNs saturate to the largest positive integer. */
5601 if ( shiftCount
<= 0 ) {
5603 float_raise(float_flag_invalid
, status
);
5604 if (!aSign
|| floatx80_is_any_nan(a
)) {
5612 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5614 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5618 /*----------------------------------------------------------------------------
5619 | Returns the result of converting the extended double-precision floating-
5620 | point value `a' to the 64-bit two's complement integer format. The
5621 | conversion is performed according to the IEC/IEEE Standard for Binary
5622 | Floating-Point Arithmetic, except that the conversion is always rounded
5623 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5624 | Otherwise, if the conversion overflows, the largest integer with the same
5625 | sign as `a' is returned.
5626 *----------------------------------------------------------------------------*/
/*
 * Converts floatx80 to int64 with truncation (see comment block above).
 * The single representable overflow-boundary value -2^63 (a.high 0xC03E
 * with only the integer bit set) passes through; everything else out of
 * range raises invalid and saturates.
 * NOTE(review): extract is line-mangled and missing lines (saturation
 * returns, the aSig/z declarations, final return z); tokens preserved
 * byte-for-byte.
 */
5628 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5631 int32_t aExp
, shiftCount
;
5635 if (floatx80_invalid_encoding(a
)) {
5636 float_raise(float_flag_invalid
, status
);
5639 aSig
= extractFloatx80Frac( a
);
5640 aExp
= extractFloatx80Exp( a
);
5641 aSign
= extractFloatx80Sign( a
);
5642 shiftCount
= aExp
- 0x403E;
5643 if ( 0 <= shiftCount
) {
/* Drop the explicit integer bit; only -2^63 exactly is representable. */
5644 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5645 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5646 float_raise(float_flag_invalid
, status
);
5647 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5653 else if ( aExp
< 0x3FFF ) {
/* Magnitude below 1 truncates to 0; nonzero values are inexact. */
5655 float_raise(float_flag_inexact
, status
);
5659 z
= aSig
>>( - shiftCount
);
5660 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5661 float_raise(float_flag_inexact
, status
);
5663 if ( aSign
) z
= - z
;
5668 /*----------------------------------------------------------------------------
5669 | Returns the result of converting the extended double-precision floating-
5670 | point value `a' to the single-precision floating-point format. The
5671 | conversion is performed according to the IEC/IEEE Standard for Binary
5672 | Floating-Point Arithmetic.
5673 *----------------------------------------------------------------------------*/
/*
 * Converts floatx80 to single precision (see comment block above).
 * NaNs pass through the commonNaN machinery and are re-silenced; the
 * significand is right-shifted with jamming and rounded.
 * NOTE(review): extract is line-mangled; declarations and brace lines
 * are missing from this view; tokens preserved byte-for-byte.
 */
5675 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5681 if (floatx80_invalid_encoding(a
)) {
5682 float_raise(float_flag_invalid
, status
);
5683 return float32_default_nan(status
);
5685 aSig
= extractFloatx80Frac( a
);
5686 aExp
= extractFloatx80Exp( a
);
5687 aSign
= extractFloatx80Sign( a
);
5688 if ( aExp
== 0x7FFF ) {
5689 if ( (uint64_t) ( aSig
<<1 ) ) {
5690 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5692 return float32_silence_nan(res
, status
);
5694 return packFloat32( aSign
, 0xFF, 0 );
/* Narrow 64-bit significand to 31 bits with sticky (jammed) rounding
 * info, then rebias (0x3F81 = 0x3FFF - 0x7F + 1) and round. */
5696 shift64RightJamming( aSig
, 33, &aSig
);
5697 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5698 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5702 /*----------------------------------------------------------------------------
5703 | Returns the result of converting the extended double-precision floating-
5704 | point value `a' to the double-precision floating-point format. The
5705 | conversion is performed according to the IEC/IEEE Standard for Binary
5706 | Floating-Point Arithmetic.
5707 *----------------------------------------------------------------------------*/
/*
 * Converts floatx80 to double precision (see comment block above).
 * Mirrors floatx80_to_float32 with a 1-bit jamming shift and the
 * double-precision rebias (0x3C01 = 0x3FFF - 0x3FF + 1).
 * NOTE(review): extract is line-mangled; brace lines and some
 * declarations are missing from this view; tokens preserved
 * byte-for-byte.
 */
5709 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5713 uint64_t aSig
, zSig
;
5715 if (floatx80_invalid_encoding(a
)) {
5716 float_raise(float_flag_invalid
, status
);
5717 return float64_default_nan(status
);
5719 aSig
= extractFloatx80Frac( a
);
5720 aExp
= extractFloatx80Exp( a
);
5721 aSign
= extractFloatx80Sign( a
);
5722 if ( aExp
== 0x7FFF ) {
5723 if ( (uint64_t) ( aSig
<<1 ) ) {
5724 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5726 return float64_silence_nan(res
, status
);
5728 return packFloat64( aSign
, 0x7FF, 0 );
5730 shift64RightJamming( aSig
, 1, &zSig
);
5731 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5732 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5736 /*----------------------------------------------------------------------------
5737 | Returns the result of converting the extended double-precision floating-
5738 | point value `a' to the quadruple-precision floating-point format. The
5739 | conversion is performed according to the IEC/IEEE Standard for Binary
5740 | Floating-Point Arithmetic.
5741 *----------------------------------------------------------------------------*/
/*
 * Converts floatx80 to quadruple precision (see comment block above).
 * Always exact for non-NaN inputs: the explicit integer bit is dropped
 * (aSig<<1) and the fraction is right-aligned into the 112-bit float128
 * significand. NOTE(review): extract is line-mangled; declarations and
 * brace lines are missing from this view.
 */
5743 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5747 uint64_t aSig
, zSig0
, zSig1
;
5749 if (floatx80_invalid_encoding(a
)) {
5750 float_raise(float_flag_invalid
, status
);
5751 return float128_default_nan(status
);
5753 aSig
= extractFloatx80Frac( a
);
5754 aExp
= extractFloatx80Exp( a
);
5755 aSign
= extractFloatx80Sign( a
);
5756 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5757 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5759 return float128_silence_nan(res
, status
);
5761 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5762 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5766 /*----------------------------------------------------------------------------
5767 | Rounds the extended double-precision floating-point value `a'
5768 | to the precision provided by floatx80_rounding_precision and returns the
5769 | result as an extended double-precision floating-point value.
5770 | The operation is performed according to the IEC/IEEE Standard for Binary
5771 | Floating-Point Arithmetic.
5772 *----------------------------------------------------------------------------*/
/*
 * Re-rounds a floatx80 value to the precision currently selected in
 * status->floatx80_rounding_precision (see comment block above) by
 * unpacking and immediately repacking through roundAndPackFloatx80
 * with no extra sticky bits.
 */
5774 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5776 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5777 extractFloatx80Sign(a
),
5778 extractFloatx80Exp(a
),
5779 extractFloatx80Frac(a
), 0, status
);
5782 /*----------------------------------------------------------------------------
5783 | Rounds the extended double-precision floating-point value `a' to an integer,
5784 | and returns the result as an extended quadruple-precision floating-point
5785 | value. The operation is performed according to the IEC/IEEE Standard for
5786 | Binary Floating-Point Arithmetic.
5787 *----------------------------------------------------------------------------*/
/*
 * Rounds floatx80 a to an integral value in the current rounding mode
 * (see comment block above). Values with exponent >= 0x403E are already
 * integral; values below 1 in magnitude are handled per-mode explicitly;
 * otherwise the fractional bits of z.low are masked off after adding the
 * mode-dependent rounding bias, and inexact is raised when any bits were
 * dropped. NOTE(review): extract is line-mangled and missing lines
 * (z initialization from a, lastBitMask initialization to 1, carry into
 * z.high, returns/breaks and braces); tokens preserved byte-for-byte.
 */
5789 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5793 uint64_t lastBitMask
, roundBitsMask
;
5796 if (floatx80_invalid_encoding(a
)) {
5797 float_raise(float_flag_invalid
, status
);
5798 return floatx80_default_nan(status
);
5800 aExp
= extractFloatx80Exp( a
);
/* Exponent >= 0x403E: no fractional bits remain; only NaN needs work. */
5801 if ( 0x403E <= aExp
) {
5802 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5803 return propagateFloatx80NaN(a
, a
, status
);
/* |a| < 1: result is 0 or +/-1 depending on the rounding mode. */
5807 if ( aExp
< 0x3FFF ) {
5809 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5812 float_raise(float_flag_inexact
, status
);
5813 aSign
= extractFloatx80Sign( a
);
5814 switch (status
->float_rounding_mode
) {
5815 case float_round_nearest_even
:
/* Exactly 0.5 rounds to 0 (even); anything above rounds to +/-1. */
5816 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5819 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5822 case float_round_ties_away
:
5823 if (aExp
== 0x3FFE) {
5824 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5827 case float_round_down
:
5830 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5831 : packFloatx80( 0, 0, 0 );
5832 case float_round_up
:
5834 aSign
? packFloatx80( 1, 0, 0 )
5835 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5837 case float_round_to_zero
:
5840 g_assert_not_reached();
5842 return packFloatx80( aSign
, 0, 0 );
/* General case: mask of the lowest integral bit and of all fraction
 * bits below it. */
5845 lastBitMask
<<= 0x403E - aExp
;
5846 roundBitsMask
= lastBitMask
- 1;
5848 switch (status
->float_rounding_mode
) {
5849 case float_round_nearest_even
:
5850 z
.low
+= lastBitMask
>>1;
/* Tie (round bits all cleared by the bias add): clear last bit -> even. */
5851 if ((z
.low
& roundBitsMask
) == 0) {
5852 z
.low
&= ~lastBitMask
;
5855 case float_round_ties_away
:
5856 z
.low
+= lastBitMask
>> 1;
5858 case float_round_to_zero
:
5860 case float_round_up
:
5861 if (!extractFloatx80Sign(z
)) {
5862 z
.low
+= roundBitsMask
;
5865 case float_round_down
:
5866 if (extractFloatx80Sign(z
)) {
5867 z
.low
+= roundBitsMask
;
/* Truncate the fraction; a carry out of z.low renormalizes to the
 * explicit integer bit. */
5873 z
.low
&= ~ roundBitsMask
;
5876 z
.low
= UINT64_C(0x8000000000000000);
5878 if (z
.low
!= a
.low
) {
5879 float_raise(float_flag_inexact
, status
);
5885 /*----------------------------------------------------------------------------
5886 | Returns the result of adding the absolute values of the extended double-
5887 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5888 | negated before being returned. `zSign' is ignored if the result is a NaN.
5889 | The addition is performed according to the IEC/IEEE Standard for Binary
5890 | Floating-Point Arithmetic.
5891 *----------------------------------------------------------------------------*/
5893 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5894 float_status
*status
)
5896 int32_t aExp
, bExp
, zExp
;
5897 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5900 aSig
= extractFloatx80Frac( a
);
5901 aExp
= extractFloatx80Exp( a
);
5902 bSig
= extractFloatx80Frac( b
);
5903 bExp
= extractFloatx80Exp( b
);
5904 expDiff
= aExp
- bExp
;
5905 if ( 0 < expDiff
) {
5906 if ( aExp
== 0x7FFF ) {
5907 if ((uint64_t)(aSig
<< 1)) {
5908 return propagateFloatx80NaN(a
, b
, status
);
5912 if ( bExp
== 0 ) --expDiff
;
5913 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5916 else if ( expDiff
< 0 ) {
5917 if ( bExp
== 0x7FFF ) {
5918 if ((uint64_t)(bSig
<< 1)) {
5919 return propagateFloatx80NaN(a
, b
, status
);
5921 return packFloatx80(zSign
,
5922 floatx80_infinity_high
,
5923 floatx80_infinity_low
);
5925 if ( aExp
== 0 ) ++expDiff
;
5926 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5930 if ( aExp
== 0x7FFF ) {
5931 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5932 return propagateFloatx80NaN(a
, b
, status
);
5937 zSig0
= aSig
+ bSig
;
5939 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5940 /* At least one of the values is a pseudo-denormal,
5941 * and there is a carry out of the result. */
5946 return packFloatx80(zSign
, 0, 0);
5948 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5954 zSig0
= aSig
+ bSig
;
5955 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5957 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5958 zSig0
|= UINT64_C(0x8000000000000000);
5961 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5962 zSign
, zExp
, zSig0
, zSig1
, status
);
5965 /*----------------------------------------------------------------------------
5966 | Returns the result of subtracting the absolute values of the extended
5967 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5968 | difference is negated before being returned. `zSign' is ignored if the
5969 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5970 | Standard for Binary Floating-Point Arithmetic.
5971 *----------------------------------------------------------------------------*/
5973 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5974 float_status
*status
)
5976 int32_t aExp
, bExp
, zExp
;
5977 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5980 aSig
= extractFloatx80Frac( a
);
5981 aExp
= extractFloatx80Exp( a
);
5982 bSig
= extractFloatx80Frac( b
);
5983 bExp
= extractFloatx80Exp( b
);
5984 expDiff
= aExp
- bExp
;
5985 if ( 0 < expDiff
) goto aExpBigger
;
5986 if ( expDiff
< 0 ) goto bExpBigger
;
5987 if ( aExp
== 0x7FFF ) {
5988 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5989 return propagateFloatx80NaN(a
, b
, status
);
5991 float_raise(float_flag_invalid
, status
);
5992 return floatx80_default_nan(status
);
5999 if ( bSig
< aSig
) goto aBigger
;
6000 if ( aSig
< bSig
) goto bBigger
;
6001 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
6003 if ( bExp
== 0x7FFF ) {
6004 if ((uint64_t)(bSig
<< 1)) {
6005 return propagateFloatx80NaN(a
, b
, status
);
6007 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
6008 floatx80_infinity_low
);
6010 if ( aExp
== 0 ) ++expDiff
;
6011 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
6013 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
6016 goto normalizeRoundAndPack
;
6018 if ( aExp
== 0x7FFF ) {
6019 if ((uint64_t)(aSig
<< 1)) {
6020 return propagateFloatx80NaN(a
, b
, status
);
6024 if ( bExp
== 0 ) --expDiff
;
6025 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
6027 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
6029 normalizeRoundAndPack
:
6030 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
6031 zSign
, zExp
, zSig0
, zSig1
, status
);
6034 /*----------------------------------------------------------------------------
6035 | Returns the result of adding the extended double-precision floating-point
6036 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6037 | Standard for Binary Floating-Point Arithmetic.
6038 *----------------------------------------------------------------------------*/
6040 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
6044 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6045 float_raise(float_flag_invalid
, status
);
6046 return floatx80_default_nan(status
);
6048 aSign
= extractFloatx80Sign( a
);
6049 bSign
= extractFloatx80Sign( b
);
6050 if ( aSign
== bSign
) {
6051 return addFloatx80Sigs(a
, b
, aSign
, status
);
6054 return subFloatx80Sigs(a
, b
, aSign
, status
);
6059 /*----------------------------------------------------------------------------
6060 | Returns the result of subtracting the extended double-precision floating-
6061 | point values `a' and `b'. The operation is performed according to the
6062 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6063 *----------------------------------------------------------------------------*/
6065 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
6069 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6070 float_raise(float_flag_invalid
, status
);
6071 return floatx80_default_nan(status
);
6073 aSign
= extractFloatx80Sign( a
);
6074 bSign
= extractFloatx80Sign( b
);
6075 if ( aSign
== bSign
) {
6076 return subFloatx80Sigs(a
, b
, aSign
, status
);
6079 return addFloatx80Sigs(a
, b
, aSign
, status
);
6084 /*----------------------------------------------------------------------------
6085 | Returns the result of multiplying the extended double-precision floating-
6086 | point values `a' and `b'. The operation is performed according to the
6087 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6088 *----------------------------------------------------------------------------*/
6090 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
6092 bool aSign
, bSign
, zSign
;
6093 int32_t aExp
, bExp
, zExp
;
6094 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6096 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6097 float_raise(float_flag_invalid
, status
);
6098 return floatx80_default_nan(status
);
6100 aSig
= extractFloatx80Frac( a
);
6101 aExp
= extractFloatx80Exp( a
);
6102 aSign
= extractFloatx80Sign( a
);
6103 bSig
= extractFloatx80Frac( b
);
6104 bExp
= extractFloatx80Exp( b
);
6105 bSign
= extractFloatx80Sign( b
);
6106 zSign
= aSign
^ bSign
;
6107 if ( aExp
== 0x7FFF ) {
6108 if ( (uint64_t) ( aSig
<<1 )
6109 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6110 return propagateFloatx80NaN(a
, b
, status
);
6112 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6113 return packFloatx80(zSign
, floatx80_infinity_high
,
6114 floatx80_infinity_low
);
6116 if ( bExp
== 0x7FFF ) {
6117 if ((uint64_t)(bSig
<< 1)) {
6118 return propagateFloatx80NaN(a
, b
, status
);
6120 if ( ( aExp
| aSig
) == 0 ) {
6122 float_raise(float_flag_invalid
, status
);
6123 return floatx80_default_nan(status
);
6125 return packFloatx80(zSign
, floatx80_infinity_high
,
6126 floatx80_infinity_low
);
6129 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6130 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6133 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6134 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6136 zExp
= aExp
+ bExp
- 0x3FFE;
6137 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6138 if ( 0 < (int64_t) zSig0
) {
6139 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6142 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6143 zSign
, zExp
, zSig0
, zSig1
, status
);
6146 /*----------------------------------------------------------------------------
6147 | Returns the result of dividing the extended double-precision floating-point
6148 | value `a' by the corresponding value `b'. The operation is performed
6149 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6150 *----------------------------------------------------------------------------*/
6152 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6154 bool aSign
, bSign
, zSign
;
6155 int32_t aExp
, bExp
, zExp
;
6156 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6157 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6159 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6160 float_raise(float_flag_invalid
, status
);
6161 return floatx80_default_nan(status
);
6163 aSig
= extractFloatx80Frac( a
);
6164 aExp
= extractFloatx80Exp( a
);
6165 aSign
= extractFloatx80Sign( a
);
6166 bSig
= extractFloatx80Frac( b
);
6167 bExp
= extractFloatx80Exp( b
);
6168 bSign
= extractFloatx80Sign( b
);
6169 zSign
= aSign
^ bSign
;
6170 if ( aExp
== 0x7FFF ) {
6171 if ((uint64_t)(aSig
<< 1)) {
6172 return propagateFloatx80NaN(a
, b
, status
);
6174 if ( bExp
== 0x7FFF ) {
6175 if ((uint64_t)(bSig
<< 1)) {
6176 return propagateFloatx80NaN(a
, b
, status
);
6180 return packFloatx80(zSign
, floatx80_infinity_high
,
6181 floatx80_infinity_low
);
6183 if ( bExp
== 0x7FFF ) {
6184 if ((uint64_t)(bSig
<< 1)) {
6185 return propagateFloatx80NaN(a
, b
, status
);
6187 return packFloatx80( zSign
, 0, 0 );
6191 if ( ( aExp
| aSig
) == 0 ) {
6193 float_raise(float_flag_invalid
, status
);
6194 return floatx80_default_nan(status
);
6196 float_raise(float_flag_divbyzero
, status
);
6197 return packFloatx80(zSign
, floatx80_infinity_high
,
6198 floatx80_infinity_low
);
6200 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6203 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6204 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6206 zExp
= aExp
- bExp
+ 0x3FFE;
6208 if ( bSig
<= aSig
) {
6209 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6212 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6213 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6214 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6215 while ( (int64_t) rem0
< 0 ) {
6217 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6219 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6220 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6221 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6222 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6223 while ( (int64_t) rem1
< 0 ) {
6225 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6227 zSig1
|= ( ( rem1
| rem2
) != 0 );
6229 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6230 zSign
, zExp
, zSig0
, zSig1
, status
);
6233 /*----------------------------------------------------------------------------
6234 | Returns the remainder of the extended double-precision floating-point value
6235 | `a' with respect to the corresponding value `b'. The operation is performed
6236 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6237 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6238 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6239 | the absolute value of the integer quotient.
6240 *----------------------------------------------------------------------------*/
6242 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6243 float_status
*status
)
6246 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6247 uint64_t aSig0
, aSig1
, bSig
;
6248 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6251 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6252 float_raise(float_flag_invalid
, status
);
6253 return floatx80_default_nan(status
);
6255 aSig0
= extractFloatx80Frac( a
);
6256 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6257 aSign
= extractFloatx80Sign( a
);
6258 bSig
= extractFloatx80Frac( b
);
6259 bExp
= extractFloatx80Exp( b
);
6260 if ( aExp
== 0x7FFF ) {
6261 if ( (uint64_t) ( aSig0
<<1 )
6262 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6263 return propagateFloatx80NaN(a
, b
, status
);
6267 if ( bExp
== 0x7FFF ) {
6268 if ((uint64_t)(bSig
<< 1)) {
6269 return propagateFloatx80NaN(a
, b
, status
);
6271 if (aExp
== 0 && aSig0
>> 63) {
6273 * Pseudo-denormal argument must be returned in normalized
6276 return packFloatx80(aSign
, 1, aSig0
);
6283 float_raise(float_flag_invalid
, status
);
6284 return floatx80_default_nan(status
);
6286 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6289 if ( aSig0
== 0 ) return a
;
6290 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6293 expDiff
= aExp
- bExp
;
6295 if ( expDiff
< 0 ) {
6296 if ( mod
|| expDiff
< -1 ) {
6297 if (aExp
== 1 && aExpOrig
== 0) {
6299 * Pseudo-denormal argument must be returned in
6302 return packFloatx80(aSign
, aExp
, aSig0
);
6306 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6309 *quotient
= q
= ( bSig
<= aSig0
);
6310 if ( q
) aSig0
-= bSig
;
6312 while ( 0 < expDiff
) {
6313 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6314 q
= ( 2 < q
) ? q
- 2 : 0;
6315 mul64To128( bSig
, q
, &term0
, &term1
);
6316 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6317 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6323 if ( 0 < expDiff
) {
6324 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6325 q
= ( 2 < q
) ? q
- 2 : 0;
6327 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6328 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6329 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6330 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6332 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6335 *quotient
<<= expDiff
;
6346 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6347 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6348 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6351 aSig0
= alternateASig0
;
6352 aSig1
= alternateASig1
;
6358 normalizeRoundAndPackFloatx80(
6359 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6363 /*----------------------------------------------------------------------------
6364 | Returns the remainder of the extended double-precision floating-point value
6365 | `a' with respect to the corresponding value `b'. The operation is performed
6366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6367 *----------------------------------------------------------------------------*/
6369 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6372 return floatx80_modrem(a
, b
, false, "ient
, status
);
6375 /*----------------------------------------------------------------------------
6376 | Returns the remainder of the extended double-precision floating-point value
6377 | `a' with respect to the corresponding value `b', with the quotient truncated
6379 *----------------------------------------------------------------------------*/
6381 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6384 return floatx80_modrem(a
, b
, true, "ient
, status
);
6387 /*----------------------------------------------------------------------------
6388 | Returns the square root of the extended double-precision floating-point
6389 | value `a'. The operation is performed according to the IEC/IEEE Standard
6390 | for Binary Floating-Point Arithmetic.
6391 *----------------------------------------------------------------------------*/
6393 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6397 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6398 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6400 if (floatx80_invalid_encoding(a
)) {
6401 float_raise(float_flag_invalid
, status
);
6402 return floatx80_default_nan(status
);
6404 aSig0
= extractFloatx80Frac( a
);
6405 aExp
= extractFloatx80Exp( a
);
6406 aSign
= extractFloatx80Sign( a
);
6407 if ( aExp
== 0x7FFF ) {
6408 if ((uint64_t)(aSig0
<< 1)) {
6409 return propagateFloatx80NaN(a
, a
, status
);
6411 if ( ! aSign
) return a
;
6415 if ( ( aExp
| aSig0
) == 0 ) return a
;
6417 float_raise(float_flag_invalid
, status
);
6418 return floatx80_default_nan(status
);
6421 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6422 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6424 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6425 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6426 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6427 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6428 doubleZSig0
= zSig0
<<1;
6429 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6430 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6431 while ( (int64_t) rem0
< 0 ) {
6434 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6436 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6437 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6438 if ( zSig1
== 0 ) zSig1
= 1;
6439 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6440 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6441 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6442 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6443 while ( (int64_t) rem1
< 0 ) {
6445 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6447 term2
|= doubleZSig0
;
6448 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6450 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6452 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6453 zSig0
|= doubleZSig0
;
6454 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6455 0, zExp
, zSig0
, zSig1
, status
);
6458 /*----------------------------------------------------------------------------
6459 | Returns the result of converting the quadruple-precision floating-point
6460 | value `a' to the extended double-precision floating-point format. The
6461 | conversion is performed according to the IEC/IEEE Standard for Binary
6462 | Floating-Point Arithmetic.
6463 *----------------------------------------------------------------------------*/
6465 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6469 uint64_t aSig0
, aSig1
;
6471 aSig1
= extractFloat128Frac1( a
);
6472 aSig0
= extractFloat128Frac0( a
);
6473 aExp
= extractFloat128Exp( a
);
6474 aSign
= extractFloat128Sign( a
);
6475 if ( aExp
== 0x7FFF ) {
6476 if ( aSig0
| aSig1
) {
6477 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6479 return floatx80_silence_nan(res
, status
);
6481 return packFloatx80(aSign
, floatx80_infinity_high
,
6482 floatx80_infinity_low
);
6485 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6486 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6489 aSig0
|= UINT64_C(0x0001000000000000);
6491 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6492 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6496 /*----------------------------------------------------------------------------
6497 | Returns the remainder of the quadruple-precision floating-point value `a'
6498 | with respect to the corresponding value `b'. The operation is performed
6499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6500 *----------------------------------------------------------------------------*/
6502 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6505 int32_t aExp
, bExp
, expDiff
;
6506 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6507 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6510 aSig1
= extractFloat128Frac1( a
);
6511 aSig0
= extractFloat128Frac0( a
);
6512 aExp
= extractFloat128Exp( a
);
6513 aSign
= extractFloat128Sign( a
);
6514 bSig1
= extractFloat128Frac1( b
);
6515 bSig0
= extractFloat128Frac0( b
);
6516 bExp
= extractFloat128Exp( b
);
6517 if ( aExp
== 0x7FFF ) {
6518 if ( ( aSig0
| aSig1
)
6519 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6520 return propagateFloat128NaN(a
, b
, status
);
6524 if ( bExp
== 0x7FFF ) {
6525 if (bSig0
| bSig1
) {
6526 return propagateFloat128NaN(a
, b
, status
);
6531 if ( ( bSig0
| bSig1
) == 0 ) {
6533 float_raise(float_flag_invalid
, status
);
6534 return float128_default_nan(status
);
6536 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6539 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6540 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6542 expDiff
= aExp
- bExp
;
6543 if ( expDiff
< -1 ) return a
;
6545 aSig0
| UINT64_C(0x0001000000000000),
6547 15 - ( expDiff
< 0 ),
6552 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
6553 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6554 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6556 while ( 0 < expDiff
) {
6557 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6558 q
= ( 4 < q
) ? q
- 4 : 0;
6559 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6560 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6561 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6562 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6565 if ( -64 < expDiff
) {
6566 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6567 q
= ( 4 < q
) ? q
- 4 : 0;
6569 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6571 if ( expDiff
< 0 ) {
6572 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6575 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6577 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6578 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6581 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6582 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6585 alternateASig0
= aSig0
;
6586 alternateASig1
= aSig1
;
6588 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6589 } while ( 0 <= (int64_t) aSig0
);
6591 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6592 if ( ( sigMean0
< 0 )
6593 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6594 aSig0
= alternateASig0
;
6595 aSig1
= alternateASig1
;
6597 zSign
= ( (int64_t) aSig0
< 0 );
6598 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6599 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6603 /*----------------------------------------------------------------------------
6604 | Returns the square root of the quadruple-precision floating-point value `a'.
6605 | The operation is performed according to the IEC/IEEE Standard for Binary
6606 | Floating-Point Arithmetic.
6607 *----------------------------------------------------------------------------*/
6609 float128
float128_sqrt(float128 a
, float_status
*status
)
6613 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6614 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6616 aSig1
= extractFloat128Frac1( a
);
6617 aSig0
= extractFloat128Frac0( a
);
6618 aExp
= extractFloat128Exp( a
);
6619 aSign
= extractFloat128Sign( a
);
6620 if ( aExp
== 0x7FFF ) {
6621 if (aSig0
| aSig1
) {
6622 return propagateFloat128NaN(a
, a
, status
);
6624 if ( ! aSign
) return a
;
6628 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6630 float_raise(float_flag_invalid
, status
);
6631 return float128_default_nan(status
);
6634 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6635 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6637 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6638 aSig0
|= UINT64_C(0x0001000000000000);
6639 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6640 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6641 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6642 doubleZSig0
= zSig0
<<1;
6643 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6644 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6645 while ( (int64_t) rem0
< 0 ) {
6648 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6650 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6651 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6652 if ( zSig1
== 0 ) zSig1
= 1;
6653 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6654 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6655 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6656 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6657 while ( (int64_t) rem1
< 0 ) {
6659 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6661 term2
|= doubleZSig0
;
6662 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6664 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6666 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6667 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6671 static inline FloatRelation
6672 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
6673 float_status
*status
)
6677 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6678 float_raise(float_flag_invalid
, status
);
6679 return float_relation_unordered
;
6681 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
6682 ( extractFloatx80Frac( a
)<<1 ) ) ||
6683 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
6684 ( extractFloatx80Frac( b
)<<1 ) )) {
6686 floatx80_is_signaling_nan(a
, status
) ||
6687 floatx80_is_signaling_nan(b
, status
)) {
6688 float_raise(float_flag_invalid
, status
);
6690 return float_relation_unordered
;
6692 aSign
= extractFloatx80Sign( a
);
6693 bSign
= extractFloatx80Sign( b
);
6694 if ( aSign
!= bSign
) {
6696 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
6697 ( ( a
.low
| b
.low
) == 0 ) ) {
6699 return float_relation_equal
;
6701 return 1 - (2 * aSign
);
6704 /* Normalize pseudo-denormals before comparison. */
6705 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
6708 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
6711 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6712 return float_relation_equal
;
6714 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6719 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
6721 return floatx80_compare_internal(a
, b
, 0, status
);
6724 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
6725 float_status
*status
)
6727 return floatx80_compare_internal(a
, b
, 1, status
);
6730 static inline FloatRelation
6731 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
6732 float_status
*status
)
6736 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6737 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6738 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6739 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6741 float128_is_signaling_nan(a
, status
) ||
6742 float128_is_signaling_nan(b
, status
)) {
6743 float_raise(float_flag_invalid
, status
);
6745 return float_relation_unordered
;
6747 aSign
= extractFloat128Sign( a
);
6748 bSign
= extractFloat128Sign( b
);
6749 if ( aSign
!= bSign
) {
6750 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6752 return float_relation_equal
;
6754 return 1 - (2 * aSign
);
6757 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6758 return float_relation_equal
;
6760 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6765 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
6767 return float128_compare_internal(a
, b
, 0, status
);
6770 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
6771 float_status
*status
)
6773 return float128_compare_internal(a
, b
, 1, status
);
6776 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
6782 if (floatx80_invalid_encoding(a
)) {
6783 float_raise(float_flag_invalid
, status
);
6784 return floatx80_default_nan(status
);
6786 aSig
= extractFloatx80Frac( a
);
6787 aExp
= extractFloatx80Exp( a
);
6788 aSign
= extractFloatx80Sign( a
);
6790 if ( aExp
== 0x7FFF ) {
6792 return propagateFloatx80NaN(a
, a
, status
);
6806 } else if (n
< -0x10000) {
6811 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
6812 aSign
, aExp
, aSig
, 0, status
);
6815 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
6819 uint64_t aSig0
, aSig1
;
6821 aSig1
= extractFloat128Frac1( a
);
6822 aSig0
= extractFloat128Frac0( a
);
6823 aExp
= extractFloat128Exp( a
);
6824 aSign
= extractFloat128Sign( a
);
6825 if ( aExp
== 0x7FFF ) {
6826 if ( aSig0
| aSig1
) {
6827 return propagateFloat128NaN(a
, a
, status
);
6832 aSig0
|= UINT64_C(0x0001000000000000);
6833 } else if (aSig0
== 0 && aSig1
== 0) {
6841 } else if (n
< -0x10000) {
6846 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
6851 static void __attribute__((constructor
)) softfloat_init(void)
6853 union_float64 ua
, ub
, uc
, ur
;
6855 if (QEMU_NO_HARDFLOAT
) {
6859 * Test that the host's FMA is not obviously broken. For example,
6860 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
6861 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
6863 ua
.s
= 0x0020000000000001ULL
;
6864 ub
.s
= 0x3ca0000000000000ULL
;
6865 uc
.s
= 0x0020000000000000ULL
;
6866 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
6867 if (ur
.s
!= 0x0020000000000001ULL
) {
6868 force_soft_fma
= true;